123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177 |
'use strict';

// ---------------------------------------------------------------------------
// Process-wide paths. Everything is resolved relative to this script's folder.
// ---------------------------------------------------------------------------
global.ROOTPATH = __dirname;
global.LOGPATH = `${ROOTPATH}/log`;
global.SAVEPATH = `${ROOTPATH}/save`;
global.TEMPPATH = `${ROOTPATH}/tmp`;

// ---------------------------------------------------------------------------
// Shared modules, published on `global` so the functions below (and,
// presumably, lib/logger.js — confirm) can use them without requiring again.
// ---------------------------------------------------------------------------
global.FILE = require('fs-extra');
global.Request = require('request');
global.cheerio = require('cheerio');
global.exec = require('child_process').exec;
global.Tracer = require('tracer').dailyfile({
  root: LOGPATH,
  format: '{{timestamp}} <{{title}}> {{file}}:{{line}} {{message}}',
  dateformat: 'HH:MM:ss.L',
});
global.LOGGER = require(`${ROOTPATH}/lib/logger.js`);

// Site endpoints: `homePage` is prefixed to article hrefs, `listUrl` is the
// base of the paginated listing URL (`<listUrl>/<code>/<page>`).
global.Config = {
  debug: true,
  homePage: 'https://club.m.autohome.com.cn',
  listUrl: 'https://club.m.autohome.com.cn/jingxuan',
};

// HTTP headers sent with every listing request (mobile browser profile).
// NOTE(review): the Cookie value is a hard-coded captured browser session;
// consider loading it from configuration instead of source control.
const headers = {
  'Host': 'club.m.autohome.com.cn',
  'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36',
  'Cookie': 'fvlid=1515595862266rIC6BeVn; sessionid=CF2AA80D-F88B-4007-802C-1A60E154A378%7C%7C2018-01-10+22%3A51%3A02.928%7C%7C0; ahpau=1; area=440399; papopclub=FCFD964BC086EFCCA5F267501588178C; pepopclub=C4E754AE8FD3216F924BF284091604AF; sessionip=183.13.91.175; isFromBaiDuSearch=; isFromBD=; historyClub=4139%2C4252; sessionvid=0AB9C5B6-8426-4134-A483-7C1AB024411C; sessionuid=CF2AA80D-F88B-4007-802C-1A60E154A378%7C%7C2018-01-10+22%3A51%3A02.928%7C%7C0; pvidchain=2342170,2342169,2342169,2342169,104785,2342169,2342170,2342170,2342170,2342170; ref=x.autoimg.cn%7C0%7C0%7C0%7C2018-01-23+00%3A20%3A46.088%7C2018-01-22+23%3A34%3A32.244; ahpvno=75',
  'Upgrade-Insecure-Requests': '1',
};

// ---------------------------------------------------------------------------
// Crawl state shared by the functions below.
// ---------------------------------------------------------------------------
const listCodeArr = [104, 172, 292, 295]; // listing codes, crawled one after another
const tmpListArr = [];                    // articles collected for the code being crawled
let currentCodeIndex = 0;                 // index into listCodeArr
let currentArticleIndex = 0;              // index into tmpListArr
let currentPageIndex = 1;                 // listing page number (starts at 1)

const wgetMission = `${ROOTPATH}/wgetlist`; // file holding the image URLs handed to `wget -i`
const tempFile = `${TEMPPATH}/temp`;        // where each article page is downloaded to
const wgetCmd = `wget -i "${wgetMission}" -q -t 10 -nc -nH -P `; // target folder is appended by the caller

let last = {}; // list code -> id of the newest article handled in a previous run

// Kick off: restore the previous run's markers, then start with the first code.
initLast();
startMission();
/**
 * Restores the module-level `last` map (list code -> last handled article id)
 * from the JSON file `<TEMPPATH>/last`.
 *
 * Loading is best-effort: whenever the file cannot be used, `last` keeps its
 * current value and the function returns normally.
 *   - A missing file (ENOENT) is the normal first-run case and is silent.
 *   - Any other read error, invalid JSON, or JSON that is not a plain object
 *     is reported through LOGGER.error so a corrupt state file is noticed
 *     instead of silently triggering a full re-crawl.
 *
 * @returns {void}
 */
function initLast() {
  const lastFile = TEMPPATH + '/last';

  let raw;
  try {
    raw = FILE.readFileSync(lastFile, 'utf-8');
  } catch (e) {
    if (!e || e.code !== 'ENOENT') {
      LOGGER.error(`Last file read error: ${lastFile}`);
      LOGGER.error(e);
    }
    return;
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (e) {
    LOGGER.error(`Last file parse error: ${lastFile}`);
    LOGGER.error(e);
    return;
  }

  // `last` is indexed as `last[code]` elsewhere, so only a plain object is
  // acceptable; `null` in particular would make that lookup throw.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    LOGGER.error(`Last file ignored, unexpected content: ${lastFile}`);
    return;
  }

  last = parsed;
}
/**
 * Starts crawling the list code at `currentCodeIndex`, or logs completion
 * when every entry of `listCodeArr` has been processed.
 *
 * All per-code state is reset before crawling: the article cursor, the page
 * cursor and the collected article list. Clearing `tmpListArr` matters
 * because it is shared across codes; without it the articles of the previous
 * code would be downloaded again under the new code and the wrong id would
 * be stored as the new code's "last" marker.
 *
 * @returns {void}
 */
function startMission() {
  if (currentCodeIndex >= listCodeArr.length) {
    LOGGER.info(`ALL TASK DONE!!!`);
    return;
  }

  currentArticleIndex = 0;
  currentPageIndex = 1;
  // Emptied in place so every holder of the array sees the reset.
  tmpListArr.length = 0;
  analysisList();
}
/**
 * Persists the collected article list for `code` to `<TEMPPATH>/list_<code>`
 * and schedules the article phase (`analysisArticle`) after 5 seconds.
 *
 * @param {number|string} code - list code whose listing has been fully read.
 * @returns {void}
 */
function finishListScan(code) {
  LOGGER.info(`List[${code}] all page done,article count [${tmpListArr.length}]`);
  LOGGER.info(`Start parse articles...`);
  FILE.writeFileSync(TEMPPATH + '/list_' + code, JSON.stringify(tmpListArr), 'utf-8');
  setTimeout(analysisArticle, 5000);
}

/**
 * Fetches one listing page (`<Config.listUrl>/<code>/<currentPageIndex>`) for
 * the current list code and appends its articles to `tmpListArr`.
 *
 * Flow after a page has been fetched:
 *   - anchors found: each anchor becomes `{id, href, title}`. Collection
 *     stops at the article whose id equals `last[code]` (the marker stored by
 *     a previous run); otherwise the next page is requested after 5 s.
 *   - no anchors but a <body>: the listing is exhausted; the article phase
 *     starts.
 *   - anything else: the same page is requested again after 10 s.
 * A transport error is logged and the same page is retried after 10 s, so a
 * transient network failure does not end the whole crawl.
 *
 * @returns {void}
 */
function analysisList() {
  const code = listCodeArr[currentCodeIndex];
  const url = Config.listUrl + '/' + code + '/' + currentPageIndex;

  Request({ url: url, encoding: 'utf-8', headers: headers }, function (err, response, body) {
    if (err) {
      LOGGER.error(`List get error URL:${url}`);
      LOGGER.error(err);
      setTimeout(analysisList, 10000);
      return;
    }

    const $ = cheerio.load(body);
    const anchors = $('ul.module-list-daily a');

    if (anchors.length > 0) {
      const lastId = last[code];
      // Without a stored marker nothing may match; comparing two `undefined`
      // values would otherwise end the crawl at the first anchor lacking an id.
      const hasLastId = lastId !== undefined && lastId !== null;
      let isBreak = false;

      anchors.each(function (i, a) {
        const anchor = $(a);
        const article = {
          id: anchor.attr('data-topicid'),
          href: anchor.attr('href'),
          title: anchor.find('h4').text(),
        };
        if (hasLastId && article.id !== undefined && String(article.id) === String(lastId)) {
          isBreak = true;
          return false; // everything from here on was handled by a previous run
        }
        // An entry without href cannot be downloaded and would make the
        // article phase retry forever, so it is not queued.
        if (!article.href) {
          return;
        }
        tmpListArr.push(article);
      });

      LOGGER.info(`List[${code}] page[${currentPageIndex}] done`);
      if (isBreak) {
        finishListScan(code);
        return;
      }
      currentPageIndex++;
      setTimeout(analysisList, 5000);
    } else if ($('body').length > 0) {
      finishListScan(code);
    } else {
      // NOTE(review): cheerio.load() may always synthesise a <body>, in which
      // case this branch never runs — confirm the intended "bad page" check.
      LOGGER.error(`List[${code}] page[${currentPageIndex}] retry`);
      setTimeout(analysisList, 10000);
    }
  });
}
- function analysisArticle(){
- if(tmpListArr.length==0){
- nextArticle();
- return;
- }
- let article = tmpListArr[currentArticleIndex];
- let url = Config.homePage + article.href;
- console.log(url)
- exec(`wget --random-wait ${url} -O ${tempFile}`,function(err,stdout,stderr){
- // console.log(err,stdout,stderr);
- if(err) console.log(err,stdout,stderr);
- let data = FILE.readFileSync(tempFile,'utf-8');
- let $ = cheerio.load(data);
- let length = $('.content img').length;
- let list = [];
- if(length>0){
- $('.content img').each(function(i,img){
- let imgSrc = $(img).data('original');
- if(imgSrc){
- imgSrc = imgSrc.split('/');
- let fileName = imgSrc[imgSrc.length-1];
- fileName = fileName.replace(/\d{3,5}_/,'');
- fileName = fileName.replace(/\?\d+/,'');
- imgSrc[imgSrc.length-1] = fileName;
- imgSrc = 'http:' + imgSrc.join('/');
- list.push(imgSrc);
- }
- });
- FILE.writeFileSync(wgetMission,list.join('\n'),'utf-8');
- let saveFolder = SAVEPATH+'/'+listCodeArr[currentCodeIndex]+'/['+article.id+']'+article.title;
- let execCmd = wgetCmd + '"' + saveFolder + '"';
- exec(execCmd,function(err,stdout,stderr){
- if(err) console.log(err,stdout,stderr);
- LOGGER.info(`Article[${article.id}] img count[${list.length}] done`);
- nextArticle(article);
- });
- }else if($('.content').length==1){
- LOGGER.info(`Article[${article.id}] no img`);
- nextArticle();
- }else{
- LOGGER.error(`Article[${article.id}] retry`);
- setTimeout(analysisArticle,10000);
- }
- });
- }
/**
 * Records `value` under `key` in the module-level `last` map and rewrites
 * the whole map as JSON to `<TEMPPATH>/last`.
 *
 * @param {string|number} key - entry to set (callers pass a list code).
 * @param {*} value - value to store (callers pass an article id).
 * @returns {void}
 */
function updateLast(key, value) {
  last[key] = value;

  const lastFile = `${TEMPPATH}/last`;
  const serialized = JSON.stringify(last);
  FILE.writeFileSync(lastFile, serialized, 'utf-8');
}
/**
 * Advances the article cursor. If another article is queued it is scheduled
 * after 5 seconds; otherwise the current list code is closed: the id of the
 * first collected article is stored through `updateLast` (only when at least
 * one article was collected), completion is logged, and `startMission` is
 * called for the next list code.
 *
 * @param {object} [article] - accepted for callers that pass the finished
 *   article; not used by this function.
 * @returns {void}
 */
function nextArticle(article) {
  currentArticleIndex += 1;

  const hasQueuedArticle = Boolean(tmpListArr[currentArticleIndex]);
  if (hasQueuedArticle) {
    setTimeout(analysisArticle, 5000);
    return;
  }

  const finishedCode = listCodeArr[currentCodeIndex];
  if (tmpListArr.length) {
    updateLast(finishedCode, tmpListArr[0]['id']);
  }
  LOGGER.info(`List[${finishedCode}] all article download done!!!`);

  currentCodeIndex += 1;
  startMission();
}
|