HonorLee 6 years ago
commit
ed5f06692c
10 changed files with 388 additions and 0 deletions
  1. 2 0
      .gitignore
  2. 0 0
      README.md
  3. 174 0
      analysis.js
  4. 24 0
      config.js
  5. 1 0
      finalRepinTime.txt
  6. 22 0
      lib/logger.js
  7. 95 0
      lib/mysql-pool.js
  8. 10 0
      lib/mysqldb.js
  9. 32 0
      package.json
  10. 28 0
      schedule.js

+ 2 - 0
.gitignore

@@ -0,0 +1,2 @@
+node_modules/
+.DS_Store

+ 0 - 0
README.md


+ 174 - 0
analysis.js

@@ -0,0 +1,174 @@
+'use strict'
+require('./config.js');
+global.Request = require('request');
+
+let lastPinTime = 0,firstRepinTime = 0,currentTask = null,articleCount = 0;
+let mysqlConfig = {
+    pool:{
+        name: 'TouTiaoFav',
+        maxconn: 5
+    },
+    db:Config.mysqlConfig
+};
+let headers = {
+    'Host': 'www.toutiao.com','User-Agent':'Paw/3.1 (Macintosh; OS X/10.12.6) GCDHTTPRequest',
+    'Cookie':'install_id=12936422119; ttreq=1$1176d4ec1d8c4b340741b2ba75742f13f3581fa0; alert_coverage=56; _ga=GA1.2.1364790727.1494317820; _gid=GA1.2.1734828197.1502001335; qh[360]=1; login_flag=0f67bb380c54dc63fb41864f7f44160a; sessionid=7499554ce9453e80a602a56388e65d7b; sid_guard="7499554ce9453e80a602a56388e65d7b|1501589523|2591997|Thu\\054 31-Aug-2017 12:12:00 GMT"; sid_tt=7499554ce9453e80a602a56388e65d7b; uid_tt=bf0e5ab2d584acefc18547a78fada6b0',
+    'X-SS-Cookie':'install_id=12936422119; ttreq=1$1176d4ec1d8c4b340741b2ba75742f13f3581fa0; alert_coverage=56; _ga=GA1.2.1364790727.1494317820; _gid=GA1.2.1734828197.1502001335; qh[360]=1; login_flag=0f67bb380c54dc63fb41864f7f44160a; sessionid=7499554ce9453e80a602a56388e65d7b; sid_guard="7499554ce9453e80a602a56388e65d7b|1501589523|2591997|Thu\\054 31-Aug-2017 12:12:00 GMT"; sid_tt=7499554ce9453e80a602a56388e65d7b; uid_tt=bf0e5ab2d584acefc18547a78fada6b0'
+};
+
+global.MysqlPool  = require(ROOTPATH + '/lib/mysql-pool.js').instance(mysqlConfig);
+testMysql();
+function testMysql(){
+    let con = MysqlPool.getConnection(mysqlConfig.pool.name);
+    con.query('SELECT VERSION() as version',function(err,result,fields){
+    MysqlPool.freeConnection(mysqlConfig.pool.name,con);
+    if(err){
+        LOGGER.error('Mysql Connect error,please recheck your config');
+        LOGGER.error(err);
+    }else{
+        LOGGER.info('Mysql Connect success');
+        LOGGER.info('Mysql Version: ' + result[0]['version'] + ' | User: ' + Config.mysqlConfig.user + ' | Database: ' + Config.mysqlConfig.database);
+        global.MysqlDB   = require(ROOTPATH + '/lib/mysqldb.js');
+        init();
+    }
+});
+}
+
+function init(){
+    LOGGER.info(`Last Pin Time: ${FinalRepinTime}|${new Date(FinalRepinTime*1000)}`);
+    currentTask = new Task();
+}
+
+function finish(){
+    FinalRepinTime = firstRepinTime;
+    FILE.writeFileSync(FinalRepinFile,FinalRepinTime,'UTF-8');
+    LOGGER.info('--------------------------------------------------------------');
+    LOGGER.info('All fav article download finished!!!');
+    LOGGER.info(`Article analysis count [ ${articleCount} ]`);
+    LOGGER.info('--------------------------------------------------------------');
+    MysqlPool.end();
+}
+
+function Task(){
+    let artticleList = [];
+    let current = -1;
+    let hasMore = false;
+
+    getList();
+    function getList(){
+        let url = Config.listUrl;
+        let args = [];
+        args.push('page_type=2');
+        args.push(`user_id=${Config.uid}`);
+        args.push('max_behot_time=0');
+        args.push('count=20');
+        if(lastPinTime>0) args.push(`max_repin_time=${lastPinTime}`);
+        url += args.join('&');
+        Request({url:url,encoding: 'utf-8',json:true,headers:headers},function(err,response,body){
+            if(err){
+                LOGGER.error(`List get error,max repin time:${lastPinTime}`);
+                LOGGER.error(`URL:${url}`);
+                return;
+            }
+            
+            if(lastPinTime==0){
+                firstRepinTime = body.data[0].repin_time;
+                if(FinalRepinTime==firstRepinTime){
+                    finish();
+                    return;
+                }
+            }
+            hasMore = body.has_more;
+            lastPinTime = body.max_repin_time;
+            artticleList = body.data;
+            getArticle();
+        });    
+    }
+
+    function getArticle(){
+        current++;
+        if(current==artticleList.length){
+            return taskFinish();
+        }
+        let url = Config.articleUrl;
+        let article = artticleList[current];
+        if(FinalRepinTime == article.repin_time){
+            finish();
+            return;
+        }
+        article.content = '';
+        url += article.item_id;
+        Request({url:url,encoding: 'utf-8',headers:headers},function(err,response,body){
+            if(err){
+                LOGGER.error(`Article get error,max repin time:${lastPinTime},article id:${article.item_id}`);
+                LOGGER.error(`URL:${url}`);
+                return;
+            }
+            let content;
+            if(article.has_gallery){
+                content = body.match(/gallery: (.*),/);
+                if(content && content.length>=2){
+                    content = JSON.parse(content[1]).sub_images;
+                    let arr = [];
+                    for(let i in content){
+                        arr.push(content[i].url);
+                    }
+                    article.content = arr.join(',');
+                }
+            }else if(article.has_video){
+                content = body.match(/shareUrl: '(.*)'/);
+                if(content && content.length>=2) article.content = content[1];
+            }else{
+                content = body.match(/content: '(.*)'\.replace/);
+                if(content && content.length>=2) article.content = (content[1]).replace(/\"/g,'\\"');
+            }
+            if(article.content==''){
+                getArticle();
+                return;
+            }
+            insertArticle(article);
+        });
+    }
+    function insertArticle(article){
+        // console.log(article.content);
+        let tempArr = [];
+        let type = article.has_video?2:article.has_gallery?1:0;
+        tempArr.push(`"${article.title.replace(/\"/g,'\\"')}"`);
+        tempArr.push(type);
+        tempArr.push(`"${article.chinese_tag}"`);
+        tempArr.push(`"${article.image_url}"`);
+        tempArr.push(`"${article.abstract.replace(/\"/g,'\\"')}"`);
+        tempArr.push(article.gallary_image_count);
+        tempArr.push(`"${article.display_url}"`);
+        tempArr.push(`"${article.item_id}"`);
+        let sql = `insert IGNORE into list (title,type,tag,cover,abstract,gallaryCount,source,item_id) values(${tempArr.join(',')})`;
+        MysqlDB.query(sql,function(err,result){
+            if(err){
+                LOGGER.error(`Article insert error,article id:${article.item_id}`);
+                LOGGER.error(`SQL:${sql}`);
+                return;
+            }
+            sql = `insert ignore into article (articleID,content) values("${article.item_id}","${article.content}")`
+            MysqlDB.query(sql,function(err,result){
+                if(err){
+                    LOGGER.error(`Article insert error,article id:${article.item_id}`);
+                    LOGGER.error(`SQL:${sql}`);
+                    return;
+                }
+                articleCount++;
+                getArticle();
+            });
+        });
+
+    }
+    function taskFinish(){
+        LOGGER.info('List Finished!');
+        if(hasMore){
+            LOGGER.info('More List read!!!');
+            currentTask = null;
+            currentTask = new Task();
+        }else{
+            finish();
+        }
+    }
+}

+ 24 - 0
config.js

@@ -0,0 +1,24 @@
+global.ROOTPATH = __dirname;
+global.LOGPATH  = ROOTPATH + '/log';
+global.FILE     = require('fs-extra');
+global.Tracer   = require('tracer').dailyfile({root:LOGPATH,format : "{{timestamp}} <{{title}}> {{file}}:{{line}} {{message}}", dateformat : "HH:MM:ss.L"});
+global.LOGGER   = require(ROOTPATH + '/lib/logger.js');
+global.Config   = {
+    uid:9129707997,
+    listUrl:'https://www.toutiao.com/c/user/favourite/?',
+    articleUrl:'https://m.toutiao.com/i',
+    mysqlConfig:{
+        host:'localhost',
+        port:3306,
+        user:'root',
+        password:'',
+        database:'TouTiao',
+        prefix:''
+    },
+    debug:true,
+    write_log_file:false
+}
+global.FinalRepinFile = ROOTPATH + '/finalRepinTime.txt';
+global.FinalRepinTime = Number(FILE.readFileSync(FinalRepinFile,'UTF-8'));
+
+// https://www.toutiao.com/c/user/favourite/?page_type=2&user_id=9129707997&max_behot_time=0&count=20&as=A1B519B8F75D744&cp=5987BDC764040E1&max_repin_time=0

+ 1 - 0
finalRepinTime.txt

@@ -0,0 +1 @@
+1504255053

+ 22 - 0
lib/logger.js

@@ -0,0 +1,22 @@
+/** Logger **/
+const colors = require('colors');
+
+var Logger = {
+    log:function(msg){
+        if(Config.debug) console.log(('[LOG] '+msg).yellow);
+        if(Config.write_log_file) this.out('log',msg);
+    },
+    info:function(msg){
+        if(Config.debug) console.log(('[INF] '+msg).green);
+        if(Config.write_log_file) this.out('info',msg);
+    },
+    error:function(msg){
+        console.log(('[ERR] '+msg).red);
+        if(Config.write_error_file) this.out('error',msg);
+    },
+    out:function(level,msg){
+        Tracer[level](msg);
+    }
+};
+
+module.exports = Logger;

+ 95 - 0
lib/mysql-pool.js

@@ -0,0 +1,95 @@
+/**
+ */
+var Mysql = require('mysql');  
+
+var pool = function(config){  
+    this.free = [];//空闲连接集合
+    this.used = 0; //已使用连接集合
+    for(var key in config.pool){this[key] = config.pool[key];}
+
+    this.newConnection = function(){
+        var con = Mysql.createConnection(config.db);
+        return con;
+    };
+    this.getConnection = function(){
+        var con = null;
+        if(this.used < this.maxconn){
+            if(this.free.length > 0){
+                con = this.free.shift();
+            }else{
+                con = this.newConnection();
+            }
+            this.used++;
+            // console.log('当前使用连接: ' + this.used + ' 空闲连接: ' + this.free.length);
+        }
+        return con;
+    };
+    this.freeConnection = function(con){
+        this.free.push(con);
+        this.used--;
+    };
+    this.freeAll = function(){
+        this.used = 0;
+        for(var i = 0; i < this.free.length; i++){
+            this.free[i].end();
+        }
+        this.free = [];
+    };
+    this.endConnection = function(){
+        this.freeAll();
+        // for(var i = 0; i < this.free.length; i++){
+        //     this.free[i].end();
+        // }
+    }
+};
+var client = {  
+    pools: [],
+    /**
+     * 得到一个数据库连接
+     * @param name 数据库pool名字name
+     * @return {*}
+     */
+    getConnection: function(name){
+        var pool = this.pools[name];
+        return pool ? pool.getConnection() : null;
+    },
+    /**
+     * 释放一个数据库连接
+     * @param name 数据库pool名字name
+     * @param con 连接
+     */
+    freeConnection: function(name,con){
+        var pool = this.pools[name];
+        if(pool){
+            pool.freeConnection(con);
+        }
+    },
+    /**
+     * 释放一个数据库所有连接
+     * @param name 数据库pool名字name
+     */
+    freePool: function(name){
+        var pool = this.pools[name];
+        if(pool)
+            pool.freeAll();
+    },
+    /**
+     * 释放所有数据库的所有连接
+     */
+    freeAll: function(){
+        for(var key in this.pools)
+            this.pools[key].freeAll();
+    },
+    end:function(){
+        for(var key in this.pools)
+            this.pools[key].freeAll();
+    }
+};
+exports.instance = function(config){
+    // if(client.pools.length < 1){
+        // for(var i = 0; i < config.length; i++){
+            client.pools[config.pool.name] = new pool(config);
+        // }
+    // }
+    return client;
+};

+ 10 - 0
lib/mysqldb.js

@@ -0,0 +1,10 @@
+'use strict'
+module.exports={
+    query:function(query,callback){
+        let mysql = MysqlPool.getConnection('TouTiaoFav');
+        mysql.query(query,function(err, results, fields) {  
+            MysqlPool.freeConnection('TouTiaoFav',mysql);
+            callback(err,results,fields);
+        });
+    }
+};

+ 32 - 0
package.json

@@ -0,0 +1,32 @@
+{
+  "name": "TouTiao-Fav-Crawler",
+  "version": "0.1.0",
+  "description": "TouTiao-Fav-Crawler",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "dependencies": {
+    "async": "^2.0.0-rc.5",
+    "cheerio": "^0.20.0",
+    "colors": "^1.1.2",
+    "cron": "^1.2.1",
+    "date-format": "0.0.2",
+    "fs-extra": "^0.30.0",
+    "iconv-lite": "^0.4.13",
+    "js-base64": "^2.1.9",
+    "mime-types": "^2.1.11",
+    "mongodb": "^2.1.21",
+    "mysql": "^2.14.1",
+    "querystring": "^0.2.0",
+    "request": "^2.74.0",
+    "tracer": "^0.8.3"
+  },
+  "engines": {
+    "node": ">=4.4.5"
+  },
+  "repository": {
+    "private": true
+  },
+  "author": "HonorLee",
+  "license": "ISC"
+}

+ 28 - 0
schedule.js

@@ -0,0 +1,28 @@
+'use strict'
+require('./config.js');
+let child_process = require('child_process');
+let CronJob = require('cron').CronJob;
+let job = new CronJob({
+    cronTime: '0 0,15 * * * *',
+    onTick: runAnalysis,
+    start: false,
+    timeZone: "Asia/Shanghai"
+});
+
+job.start();
+runAnalysis();
+function runAnalysis(){
+    let date = new Date();
+    let startTime = date.getTime();
+    LOGGER.info('--------------------------------------------------------------');
+    LOGGER.info('Analysis task start : ' + date);
+    LOGGER.info('--------------------------------------------------------------');
+    let task = child_process.fork('./analysis.js');
+    task.on('close',function(){
+        date = new Date();
+        let runThrough = Math.floor((date.getTime()-startTime)/1000);
+        LOGGER.info('--------------------------------------------------------------');
+        LOGGER.info(`Analysis task finished in [ ${runThrough}s ]`);
+        LOGGER.info('--------------------------------------------------------------');
+    })
+}