// analysis.js
  'use strict';

  // ---------------------------------------------------------------------------
  // Process-wide setup.
  //
  // Paths, third-party modules and the logger are published on `global` because
  // the rest of this script (and the modules required below) read them as bare
  // identifiers.
  // ---------------------------------------------------------------------------
  global.ROOTPATH = __dirname;
  global.LOGPATH = ROOTPATH + '/log';
  global.SAVEPATH = ROOTPATH + '/save';
  global.TEMPPATH = ROOTPATH + '/tmp';
  global.FILE = require('fs-extra');
  global.Request = require('request');
  global.cheerio = require('cheerio');
  global.exec = require('child_process').exec;

  // Nothing else in this script creates these directories, yet it writes into
  // them (log files, tmp/last, tmp/list_<code>, tmp/temp). Create them up front
  // so a first run on a fresh checkout does not fail on a missing directory.
  [LOGPATH, SAVEPATH, TEMPPATH].forEach(function (dir) {
    FILE.ensureDirSync(dir);
  });

  global.Tracer = require('tracer').dailyfile({root:LOGPATH,format : "{{timestamp}} <{{title}}> {{file}}:{{line}} {{message}}", dateformat : "HH:MM:ss.L"});
  // NOTE(review): lib/logger.js is not visible here; this script only relies on
  // it exposing `info` and `error`.
  global.LOGGER = require(ROOTPATH + '/lib/logger.js');
  global.Config = {
    debug:true,
    homePage:'https://club.m.autohome.com.cn',
    listUrl:'https://club.m.autohome.com.cn/jingxuan',
  };

  // HTTP headers sent with every list-page request.
  // NOTE(review): the Cookie below is a hard-coded browser session captured in
  // 2018; consider moving it to configuration instead of source control.
  const headers = {
    'Host': 'club.m.autohome.com.cn',
    'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36',
    'Cookie': 'fvlid=1515595862266rIC6BeVn; sessionid=CF2AA80D-F88B-4007-802C-1A60E154A378%7C%7C2018-01-10+22%3A51%3A02.928%7C%7C0; ahpau=1; area=440399; papopclub=FCFD964BC086EFCCA5F267501588178C; pepopclub=C4E754AE8FD3216F924BF284091604AF; sessionip=183.13.91.175; isFromBaiDuSearch=; isFromBD=; historyClub=4139%2C4252; sessionvid=0AB9C5B6-8426-4134-A483-7C1AB024411C; sessionuid=CF2AA80D-F88B-4007-802C-1A60E154A378%7C%7C2018-01-10+22%3A51%3A02.928%7C%7C0; pvidchain=2342170,2342169,2342169,2342169,104785,2342169,2342170,2342170,2342170,2342170; ref=x.autoimg.cn%7C0%7C0%7C0%7C2018-01-23+00%3A20%3A46.088%7C2018-01-22+23%3A34%3A32.244; ahpvno=75',
    'Upgrade-Insecure-Requests':'1'
  };

  // List codes to crawl, in order; each is appended to Config.listUrl.
  const listCodeArr = [104,172,292,295];

  // Mutable crawl state shared by the functions below.
  let tmpListArr = [];          // articles collected for the list being crawled
  let currentCodeIndex = 0;     // index into listCodeArr
  let currentArticleIndex = 0;  // index into tmpListArr
  let currentPageIndex = 1;     // page number appended to the list URL

  // File holding one image URL per line, handed to `wget -i`.
  const wgetMission = ROOTPATH + '/wgetlist';
  // Scratch file the article page is downloaded into before parsing.
  const tempFile = TEMPPATH + '/temp';
  // Image download command; the target folder is appended by the caller.
  const wgetCmd = 'wget -i "' + wgetMission + '" -q -t 10 -nc -nH -P ';

  // Maps list code -> id of the newest article handled on a previous run.
  let last = {};

  initLast();
  startMission();
  /**
   * Load the persisted "last processed article" markers from TEMPPATH/last
   * into the module-level `last` object.
   *
   * A missing file is the normal first-run case and is ignored silently. Any
   * other read error, invalid JSON, or a payload that is not a plain object is
   * logged and leaves `last` untouched, so later `last[code]` lookups cannot
   * throw on a corrupt file.
   */
  function initLast(){
    const lastFile = TEMPPATH + '/last';

    let raw;
    try{
      raw = FILE.readFileSync(lastFile,'utf-8');
    }catch(e){
      // ENOENT simply means there was no previous run.
      if(!e || e.code !== 'ENOENT'){
        LOGGER.error(`Last marker file read error: ${lastFile}`);
        LOGGER.error(e);
      }
      return;
    }

    let parsed;
    try{
      parsed = JSON.parse(raw);
    }catch(e){
      LOGGER.error(`Last marker file is not valid JSON: ${lastFile}`);
      LOGGER.error(e);
      return;
    }

    // Only a plain object is usable as a code -> id map.
    if(parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)){
      LOGGER.error(`Last marker file has unexpected content: ${lastFile}`);
      return;
    }
    last = parsed;
  }
  /**
   * Begin crawling the list at `listCodeArr[currentCodeIndex]`, or log
   * completion when every list code has been handled.
   *
   * All per-list state is reset here, including the collected article list.
   * Without clearing `tmpListArr`, articles gathered for the previous list
   * code would be processed again under the next code, and the "last" marker
   * saved for it would come from the wrong list.
   */
  function startMission(){
    // Bounds check rather than truthiness, so a list code of 0 is not
    // mistaken for "no more lists".
    if(currentCodeIndex >= listCodeArr.length){
      LOGGER.info(`ALL TASK DONE!!!`);
      return;
    }
    currentArticleIndex = 0;
    currentPageIndex = 1;
    // Cleared in place so every holder of the array sees the reset.
    tmpListArr.length = 0;
    analysisList();
  }
  /**
   * Persist the collected article list for `code` to TEMPPATH/list_<code> and
   * schedule article processing.
   * `outputFileSync` is used so a missing tmp directory is created rather
   * than crashing the crawl.
   */
  function finishListScan(code){
    LOGGER.info(`List[${code}] all page done,article count [${tmpListArr.length}]`);
    LOGGER.info(`Start parse articles...`);
    FILE.outputFileSync(TEMPPATH+'/list_' + code,JSON.stringify(tmpListArr),'utf-8');
    setTimeout(analysisArticle,5000);
  }

  /**
   * Fetch one page of the current list and append its articles to
   * `tmpListArr`.
   *
   * Pages are requested one at a time, 5 s apart, until either
   *   - an article whose id equals the saved marker `last[code]` is reached
   *     (that article and everything after it is skipped), or
   *   - a page contains no article links.
   * Then the list is written to disk and article processing is scheduled.
   * A response without a <body> is retried after 10 s.
   *
   * NOTE: on a request error the function logs and returns without
   * rescheduling, which stops the crawl; this matches the original behaviour.
   */
  function analysisList(){
    const code = listCodeArr[currentCodeIndex];
    const url = Config.listUrl+'/'+code+'/'+currentPageIndex;
    Request({url:url,encoding: 'utf-8',headers:headers},function(err,response,body){
      if(err){
        LOGGER.error(`List get error URL:${url}`);
        LOGGER.error(err);
        return;
      }
      const $ = cheerio.load(body);
      const anchors = $('ul.module-list-daily a');

      if(anchors.length>0){
        const marker = last[code];
        // Without a saved marker nothing may match. The original `==` made
        // `undefined == undefined` true for anchors lacking data-topicid on
        // a first run, ending the scan immediately.
        const hasMarker = marker !== undefined && marker !== null && marker !== '';
        let reachedMarker = false;

        anchors.each(function(i,a){
          const id = $(a).attr('data-topicid');
          if(hasMarker && id !== undefined && String(id) === String(marker)){
            reachedMarker = true;
            return false; // stop iterating: everything after is already done
          }
          tmpListArr.push({
            id: id,
            href: $(a).attr('href'),
            title: $(a).find('h4').text(),
          });
        });

        LOGGER.info(`List[${code}] page[${currentPageIndex}] done`);
        if(reachedMarker){
          finishListScan(code);
          return;
        }
        currentPageIndex++;
        setTimeout(analysisList,5000);
      }else if($('body').length>0){
        // A parsed page with no article links: past the last page.
        finishListScan(code);
      }else{
        LOGGER.error(`List[${code}] page[${currentPageIndex}] retry`);
        setTimeout(analysisList,10000);
      }
    });
  }
  102. function analysisArticle(){
  103. if(tmpListArr.length==0){
  104. nextArticle();
  105. return;
  106. }
  107. let article = tmpListArr[currentArticleIndex];
  108. let url = Config.homePage + article.href;
  109. console.log(url)
  110. exec(`wget --random-wait ${url} -O ${tempFile}`,function(err,stdout,stderr){
  111. // console.log(err,stdout,stderr);
  112. if(err) console.log(err,stdout,stderr);
  113. let data = FILE.readFileSync(tempFile,'utf-8');
  114. let $ = cheerio.load(data);
  115. let length = $('.content img').length;
  116. let list = [];
  117. if(length>0){
  118. $('.content img').each(function(i,img){
  119. let imgSrc = $(img).data('original');
  120. if(imgSrc){
  121. imgSrc = imgSrc.split('/');
  122. let fileName = imgSrc[imgSrc.length-1];
  123. fileName = fileName.replace(/\d{3,5}_/,'');
  124. fileName = fileName.replace(/\?\d+/,'');
  125. imgSrc[imgSrc.length-1] = fileName;
  126. imgSrc = 'http:' + imgSrc.join('/');
  127. list.push(imgSrc);
  128. }
  129. });
  130. FILE.writeFileSync(wgetMission,list.join('\n'),'utf-8');
  131. let saveFolder = SAVEPATH+'/'+listCodeArr[currentCodeIndex]+'/['+article.id+']'+article.title;
  132. let execCmd = wgetCmd + '"' + saveFolder + '"';
  133. exec(execCmd,function(err,stdout,stderr){
  134. if(err) console.log(err,stdout,stderr);
  135. LOGGER.info(`Article[${article.id}] img count[${list.length}] done`);
  136. nextArticle(article);
  137. });
  138. }else if($('.content').length==1){
  139. LOGGER.info(`Article[${article.id}] no img`);
  140. nextArticle();
  141. }else{
  142. LOGGER.error(`Article[${article.id}] retry`);
  143. setTimeout(analysisArticle,10000);
  144. }
  145. });
  146. }
  /**
   * Record `value` as the last processed article id for list `key` and
   * persist the whole marker map to TEMPPATH/last as JSON.
   *
   * fs-extra's outputFileSync creates the parent directory when it is
   * missing, so a crawl that has just finished does not crash on the final
   * write because the tmp directory does not exist.
   *
   * @param {string|number} key   list code
   * @param {string} value        article id to remember
   */
  function updateLast(key,value){
    last[key] = value;
    FILE.outputFileSync(TEMPPATH+'/last',JSON.stringify(last),'utf-8');
  }
  /**
   * Advance to the next collected article, or finish the current list.
   *
   * While articles remain, the next one is processed after a 5 s pause.
   * When the list is exhausted, the id of the first collected article is
   * saved as the marker for this list code, then the next list code is
   * started. The first entry is the one that appeared first on page 1.
   *
   * @param {object} [article] the article just handled; accepted for caller
   *   compatibility and not used.
   */
  function nextArticle(article){
    currentArticleIndex++;
    if(currentArticleIndex < tmpListArr.length){
      setTimeout(analysisArticle,5000);
      return;
    }

    const code = listCodeArr[currentCodeIndex];
    const newest = tmpListArr[0];
    // Only persist a real id: storing `undefined` would overwrite a good
    // marker in memory and drop the key from the saved JSON.
    if(newest && newest.id !== undefined && newest.id !== null){
      updateLast(code,newest.id);
    }
    LOGGER.info(`List[${code}] all article download done!!!`);
    currentCodeIndex++;
    startMission();
  }