'use strict';

/**
 * Image crawler for the topic lists under Config.listUrl.
 *
 * For every list code in `listCodeArr` the script:
 *   1. walks the list pages (Config.listUrl/<code>/<page>) and collects the
 *      articles that are newer than the id stored for that code in
 *      TEMPPATH/last;
 *   2. fetches each article page with wget and extracts the image URLs found
 *      in `.content img[data-original]`;
 *   3. downloads the images with wget into SAVEPATH/<code>/[<id>]<title>;
 *   4. stores the newest article id of the run back into TEMPPATH/last.
 *
 * Everything runs sequentially, chained through callbacks and setTimeout
 * delays.
 */

global.ROOTPATH = __dirname;
global.LOGPATH = ROOTPATH + '/log';
global.SAVEPATH = ROOTPATH + '/save';
global.TEMPPATH = ROOTPATH + '/tmp';
global.FILE = require('fs-extra');
global.Request = require('request');
global.cheerio = require('cheerio');
global.exec = require('child_process').exec;

// Create the working directories up front: the state/temp files below are
// written with writeFileSync and `wget -O`, neither of which creates missing
// parent directories. LOGPATH is created as well so the log root exists
// before the logger is configured.
FILE.ensureDirSync(LOGPATH);
FILE.ensureDirSync(SAVEPATH);
FILE.ensureDirSync(TEMPPATH);

global.Tracer = require('tracer').dailyfile({
    root: LOGPATH,
    format: "{{timestamp}} <{{title}}> {{file}}:{{line}} {{message}}",
    dateformat: "HH:MM:ss.L"
});
global.LOGGER = require(ROOTPATH + '/lib/logger.js');
global.Config = {
    debug: true,
    homePage: 'https://club.m.autohome.com.cn',
    listUrl: 'https://club.m.autohome.com.cn/jingxuan',
};

// execFile passes arguments to wget directly instead of through a shell, so
// remote-controlled values (article href / title) cannot inject commands.
const execFile = require('child_process').execFile;

// Request headers sent with every list-page request.
const headers = {
    'Host': 'club.m.autohome.com.cn',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36',
    'Cookie': 'fvlid=1515595862266rIC6BeVn; sessionid=CF2AA80D-F88B-4007-802C-1A60E154A378%7C%7C2018-01-10+22%3A51%3A02.928%7C%7C0; ahpau=1; area=440399; papopclub=FCFD964BC086EFCCA5F267501588178C; pepopclub=C4E754AE8FD3216F924BF284091604AF; sessionip=183.13.91.175; isFromBaiDuSearch=; isFromBD=; historyClub=4139%2C4252; sessionvid=0AB9C5B6-8426-4134-A483-7C1AB024411C; sessionuid=CF2AA80D-F88B-4007-802C-1A60E154A378%7C%7C2018-01-10+22%3A51%3A02.928%7C%7C0; pvidchain=2342170,2342169,2342169,2342169,104785,2342169,2342170,2342170,2342170,2342170; ref=x.autoimg.cn%7C0%7C0%7C0%7C2018-01-23+00%3A20%3A46.088%7C2018-01-22+23%3A34%3A32.244; ahpvno=75',
    'Upgrade-Insecure-Requests': '1'
};

// List codes to crawl, processed in order.
const listCodeArr = [104, 172, 292, 295];
// Articles collected for the list code currently being processed.
let tmpListArr = [];
let currentCodeIndex = 0;
let currentArticleIndex = 0;
let currentPageIndex = 1;
// File holding the image URLs of the current article (input of `wget -i`).
const wgetMission = ROOTPATH + '/wgetlist';
// File the current article page is downloaded to.
const tempFile = TEMPPATH + '/temp';
// Arguments of the image download; the target folder is appended after -P.
const wgetArgs = ['-i', wgetMission, '-q', '-t', '10', '-nc', '-nH', '-P'];
// Map of list code -> id of the newest article handled by a previous run.
let last = {};

initLast();
startMission();

/**
 * Loads the per-list "last processed article id" map from TEMPPATH/last.
 * A missing file is the normal first-run case and leaves `last` empty; any
 * other failure (unreadable file, invalid JSON) is logged and also leaves
 * `last` empty so the crawl starts from scratch instead of crashing.
 */
function initLast() {
    const lastFile = TEMPPATH + '/last';
    try {
        const parsed = JSON.parse(FILE.readFileSync(lastFile, 'utf-8'));
        // Only accept an object; e.g. a file containing `null` would make
        // every later `last[code]` lookup throw.
        if (parsed !== null && typeof parsed === 'object') {
            last = parsed;
        }
    } catch (e) {
        if (e.code !== 'ENOENT') {
            LOGGER.error(`Last state load error, start from scratch`);
            LOGGER.error(e);
        }
    }
}

/**
 * Starts processing the list code at `currentCodeIndex`, or logs completion
 * when every code has been handled.
 */
function startMission() {
    if (listCodeArr[currentCodeIndex]) {
        // Reset all per-list state. Without clearing tmpListArr the articles
        // of the previous list code would be processed again under this code
        // and the wrong id would be stored as its "last" marker.
        tmpListArr = [];
        currentArticleIndex = 0;
        currentPageIndex = 1;
        analysisList();
    } else {
        LOGGER.info(`ALL TASK DONE!!!`);
    }
}

/**
 * Persists the collected article list of `code` and schedules the article
 * phase.
 *
 * @param {number} code - list code whose page walk has finished
 */
function finishList(code) {
    LOGGER.info(`List[${code}] all page done,article count [${tmpListArr.length}]`);
    LOGGER.info(`Start parse articles...`);
    // NOTE: this file is only written, never read back; the resume-from-file
    // path that used to consume it is disabled.
    FILE.writeFileSync(TEMPPATH + '/list_' + code, JSON.stringify(tmpListArr), 'utf-8');
    setTimeout(analysisArticle, 5000);
}

/**
 * Fetches list page `currentPageIndex` of the current list code and appends
 * its articles to `tmpListArr`. Continues with the next page after 5s until
 * either the article stored in `last[code]` is reached or a page without
 * list entries is returned, then hands over to the article phase.
 */
function analysisList() {
    const code = listCodeArr[currentCodeIndex];
    const url = Config.listUrl + '/' + code + '/' + currentPageIndex;
    Request({ url: url, encoding: 'utf-8', headers: headers }, function (err, response, body) {
        if (err) {
            // No retry is scheduled here, so the crawl stops after a
            // transport error.
            LOGGER.error(`List get error URL:${url}`);
            LOGGER.error(err);
            return;
        }
        const $ = cheerio.load(body);
        const links = $('ul.module-list-daily a');
        if (links.length > 0) {
            const lastId = last[code];
            const hasLastId = lastId !== undefined && lastId !== null;
            let isBreak = false;
            links.each(function (i, a) {
                const article = {
                    id: $(a).attr('data-topicid'),
                    href: $(a).attr('href'),
                    title: $(a).find('h4').text()
                };
                // Only stop on a real match: on a first run (no stored id) an
                // anchor without data-topicid must not end the walk.
                if (hasLastId && article.id !== undefined && String(article.id) === String(lastId)) {
                    isBreak = true;
                }
                // Entries without href cannot be fetched and would be retried
                // forever in the article phase, so they are skipped.
                if (!isBreak && article.href) {
                    tmpListArr.push(article);
                }
            });
            LOGGER.info(`List[${code}] page[${currentPageIndex}] done`);
            if (isBreak) {
                finishList(code);
                return;
            }
            currentPageIndex++;
            setTimeout(analysisList, 5000);
        } else if ($('body').length > 0) {
            // A parsed page without list entries is treated as the end of
            // the list.
            finishList(code);
        } else {
            LOGGER.error(`List[${code}] page[${currentPageIndex}] retry`);
            setTimeout(analysisList, 10000);
        }
    });
}

/**
 * Turns the `data-original` value of an article image into a download URL.
 *
 * @param {string} rawSrc - value of the image's data-original attribute
 * @returns {string} URL passed to wget
 */
function toDownloadUrl(rawSrc) {
    const parts = String(rawSrc).split('/');
    let fileName = parts[parts.length - 1];
    // Drop the first "<3-5 digits>_" run of the file name (presumably a
    // thumbnail size prefix - TODO confirm) and a numeric "?123" suffix.
    fileName = fileName.replace(/\d{3,5}_/, '');
    fileName = fileName.replace(/\?\d+/, '');
    parts[parts.length - 1] = fileName;
    const joined = parts.join('/');
    // Sources without a scheme (e.g. "//host/path") get "http:"; an already
    // absolute URL must not be prefixed a second time.
    return /^https?:\/\//i.test(joined) ? joined : 'http:' + joined;
}

/**
 * Builds the per-article folder name "[<id>]<title>", replacing path
 * separators and control characters so a title cannot escape or split the
 * target directory.
 *
 * @param {{id: string, title: string}} article
 * @returns {string} folder name (single path segment)
 */
function toFolderName(article) {
    const rawName = '[' + article.id + ']' + article.title;
    return rawName.replace(/[\\/\u0000-\u001f]/g, '_');
}

/**
 * Downloads the article at `currentArticleIndex`, extracts its image URLs
 * and downloads them with wget. Moves on via nextArticle() when done, or
 * retries the same article after 10s when the page has no `.content`
 * element or the downloaded page cannot be read.
 */
function analysisArticle() {
    if (tmpListArr.length === 0) {
        nextArticle();
        return;
    }
    const article = tmpListArr[currentArticleIndex];
    const url = Config.homePage + article.href;
    console.log(url);
    execFile('wget', ['--random-wait', url, '-O', tempFile], function (err, stdout, stderr) {
        if (err) console.log(err, stdout, stderr);
        let data;
        try {
            data = FILE.readFileSync(tempFile, 'utf-8');
        } catch (readErr) {
            // wget may have failed before creating the file (e.g. wget not
            // installed); treat it like any other failed fetch.
            LOGGER.error(`Article[${article.id}] retry`);
            LOGGER.error(readErr);
            setTimeout(analysisArticle, 10000);
            return;
        }
        const $ = cheerio.load(data);
        const images = $('.content img');
        if (images.length > 0) {
            const list = [];
            images.each(function (i, img) {
                const imgSrc = $(img).data('original');
                if (imgSrc) {
                    list.push(toDownloadUrl(imgSrc));
                }
            });
            FILE.writeFileSync(wgetMission, list.join('\n'), 'utf-8');
            const saveFolder = SAVEPATH + '/' + listCodeArr[currentCodeIndex] + '/' + toFolderName(article);
            execFile('wget', wgetArgs.concat(saveFolder), function (err, stdout, stderr) {
                // A failed download is only reported; the crawl continues.
                if (err) console.log(err, stdout, stderr);
                LOGGER.info(`Article[${article.id}] img count[${list.length}] done`);
                nextArticle(article);
            });
        } else if ($('.content').length === 1) {
            LOGGER.info(`Article[${article.id}] no img`);
            nextArticle();
        } else {
            LOGGER.error(`Article[${article.id}] retry`);
            setTimeout(analysisArticle, 10000);
        }
    });
}

/**
 * Stores `value` as the last processed article id of list `key` and writes
 * the whole map to TEMPPATH/last.
 *
 * @param {number|string} key - list code
 * @param {string} value - article id
 */
function updateLast(key, value) {
    last[key] = value;
    FILE.writeFileSync(TEMPPATH + '/last', JSON.stringify(last), 'utf-8');
}

/**
 * Advances to the next collected article (after 5s), or - when the current
 * list is exhausted - records the newest article id of this run and starts
 * the next list code.
 *
 * @param {object} [article] - the article just processed (currently unused)
 */
function nextArticle(article) {
    currentArticleIndex++;
    if (tmpListArr[currentArticleIndex]) {
        setTimeout(analysisArticle, 5000);
    } else {
        // tmpListArr[0] is the first entry of the first list page, i.e. the
        // entry the next run should stop at.
        if (tmpListArr.length) updateLast(listCodeArr[currentCodeIndex], tmpListArr[0]['id']);
        LOGGER.info(`List[${listCodeArr[currentCodeIndex]}] all article download done!!!`);
        currentCodeIndex++;
        startMission();
    }
}