// analysis.js (6.9 KB)
  'use strict';

  // Loaded for its side effects only. Presumably this is what defines the
  // globals used throughout this file (Config, ROOTPATH, LOGGER, FILE,
  // FinalRepinTime, FinalRepinFile) -- TODO confirm against config.js.
  require('./config.js');
  global.Request = require('request');

  // Crawl progress shared by every function in this file.
  let lastPinTime = 0; // max_repin_time of the last list page fetched (0 = no page fetched yet)
  let firstRepinTime = 0; // repin_time of the first entry of the first list page
  let currentTask = null; // the Task instance currently running
  let articleCount = 0; // number of articles stored during this run

  // Options handed to the MySQL pool factory.
  const mysqlConfig = {
    pool: {
      name: 'TouTiaoFav',
      maxconn: 5
    },
    db: Config.mysqlConfig
  };

  // NOTE(review): this is a hard-coded session cookie (a credential). Consider
  // moving it into the configuration file instead of keeping it in source.
  const sessionCookie = 'install_id=12936422119; ttreq=1$1176d4ec1d8c4b340741b2ba75742f13f3581fa0; alert_coverage=56; _ga=GA1.2.1364790727.1494317820; _gid=GA1.2.1734828197.1502001335; qh[360]=1; login_flag=0f67bb380c54dc63fb41864f7f44160a; sessionid=7499554ce9453e80a602a56388e65d7b; sid_guard="7499554ce9453e80a602a56388e65d7b|1501589523|2591997|Thu\\054 31-Aug-2017 12:12:00 GMT"; sid_tt=7499554ce9453e80a602a56388e65d7b; uid_tt=bf0e5ab2d584acefc18547a78fada6b0';

  // HTTP headers sent with every list and article request.
  // 'Cookie' and 'X-SS-Cookie' carry the same value.
  const headers = {
    'Host': 'www.toutiao.com',
    'User-Agent': 'Paw/3.1 (Macintosh; OS X/10.12.6) GCDHTTPRequest',
    'Cookie': sessionCookie,
    'X-SS-Cookie': sessionCookie
  };

  global.MysqlPool = require(ROOTPATH + '/lib/mysql-pool.js').instance(mysqlConfig);

  // Entry point: verify the database connection, then start crawling.
  testMysql();
  /**
   * Checks the MySQL connection by running `SELECT VERSION()` on a pooled
   * connection.
   *
   * The connection is handed back to the pool as soon as the query returns,
   * whatever the outcome. On failure the error is logged and nothing else
   * happens; on success the server version is logged, the query helper is
   * installed as `global.MysqlDB`, and `init()` is called to start the crawl.
   */
  function testMysql() {
    const poolName = mysqlConfig.pool.name;
    const con = MysqlPool.getConnection(poolName);

    con.query('SELECT VERSION() as version', function (err, result, fields) {
      // Release first so the connection is returned on both paths.
      MysqlPool.freeConnection(poolName, con);

      if (err) {
        LOGGER.error('Mysql Connect error,please recheck your config');
        LOGGER.error(err);
        return;
      }

      LOGGER.info('Mysql Connect success');
      LOGGER.info(`Mysql Version: ${result[0]['version']} | User: ${Config.mysqlConfig.user} | Database: ${Config.mysqlConfig.database}`);
      global.MysqlDB = require(ROOTPATH + '/lib/mysqldb.js');
      init();
    });
  }
  /**
   * Starts the crawl: logs the stored `FinalRepinTime` (raw value and as a
   * Date, treating the value as seconds) and creates the first Task, which
   * begins fetching immediately from its constructor.
   */
  function init() {
    LOGGER.info(`Last Pin Time: ${FinalRepinTime}|${new Date(FinalRepinTime * 1000)}`);
    currentTask = new Task();
  }
  /**
   * Ends the run: records the newest repin time seen in this run as the new
   * `FinalRepinTime`, persists it to `FinalRepinFile`, logs a summary and
   * shuts the MySQL pool down.
   */
  function finish() {
    FinalRepinTime = firstRepinTime;
    // Assuming FILE is Node's fs module, writeFileSync rejects non-string,
    // non-buffer data on current Node versions, and the timestamp is normally
    // a number -- so stringify it explicitly before writing.
    FILE.writeFileSync(FinalRepinFile, String(FinalRepinTime), 'UTF-8');

    const rule = '--------------------------------------------------------------';
    LOGGER.info(rule);
    LOGGER.info('All fav article download finished!!!');
    LOGGER.info(`Article analysis count [ ${articleCount} ]`);
    LOGGER.info(rule);

    MysqlPool.end();
  }
  /**
   * One page of the favourites list.
   *
   * Constructing a Task immediately fetches one list page (continuing from the
   * module-level `lastPinTime`), then walks its articles one at a time:
   * download the article page, extract its content, store it in MySQL, move on
   * to the next. When the page is exhausted a new Task is created if the
   * server reported more pages, otherwise `finish()` is called. The walk also
   * stops through `finish()` as soon as an article whose `repin_time` equals
   * `FinalRepinTime` is reached.
   *
   * Request and insert failures are logged and stop the walk, as before.
   */
  function Task() {
    // All closure state is declared before getList() runs, so the callbacks
    // below never observe an uninitialised binding.
    let articleList = [];
    let current = -1;
    let hasMore = false;

    getList();

    /**
     * Renders a value as a double-quoted SQL string literal.
     * Backslashes and double quotes are escaped (this relies on MySQL's
     * default backslash-escape handling); null/undefined become an empty
     * string instead of the text "undefined".
     */
    function sqlString(value) {
      const text = value == null ? '' : String(value);
      return `"${text.replace(/\\/g, '\\\\').replace(/"/g, '\\"')}"`;
    }

    /** Fetches one page of the favourites list and starts walking it. */
    function getList() {
      const query = [
        'page_type=2',
        `user_id=${Config.uid}`,
        'max_behot_time=0',
        'count=20'
      ];
      if (lastPinTime > 0) query.push(`max_repin_time=${lastPinTime}`);
      const url = Config.listUrl + query.join('&');

      Request({ url: url, encoding: 'utf-8', json: true, headers: headers }, function (err, response, body) {
        if (err) {
          LOGGER.error(`List get error,max repin time:${lastPinTime}`);
          LOGGER.error(`URL:${url}`);
          LOGGER.error(err);
          return;
        }

        const isFirstPage = lastPinTime == 0;
        const hasData = Boolean(body) && Array.isArray(body.data);
        // A non-JSON answer, or an empty first page, has no data[0] to read.
        if (!hasData || (isFirstPage && body.data.length === 0)) {
          LOGGER.error(`List response has no usable data,max repin time:${lastPinTime}`);
          LOGGER.error(`URL:${url}`);
          return;
        }

        if (isFirstPage) {
          firstRepinTime = body.data[0].repin_time;
          // Loose comparison kept on purpose: FinalRepinTime is defined outside
          // this file and may be a string while repin_time is a number.
          if (FinalRepinTime == firstRepinTime) {
            finish();
            return;
          }
        }

        hasMore = body.has_more;
        lastPinTime = body.max_repin_time;
        articleList = body.data;
        getArticle();
      });
    }

    /**
     * Pulls the storable content out of a downloaded article page.
     * Returns '' when nothing usable is found, which makes the caller skip
     * the article.
     */
    function extractContent(article, body) {
      if (typeof body !== 'string') return '';

      if (article.has_gallery) {
        const found = body.match(/gallery: (.*),/);
        if (!found) return '';
        let images;
        try {
          images = JSON.parse(found[1]).sub_images;
        } catch (parseErr) {
          // The captured text is scraped from page source and is not
          // guaranteed to be valid JSON; skip the article instead of crashing.
          LOGGER.error(`Gallery parse error,article id:${article.item_id}`);
          LOGGER.error(parseErr);
          return '';
        }
        if (!Array.isArray(images)) return '';
        return images.map(function (image) { return image.url; }).join(',');
      }

      if (article.has_video) {
        const found = body.match(/shareUrl: '(.*)'/);
        return found ? found[1] : '';
      }

      const found = body.match(/content: '(.*)'\.replace/);
      return found ? found[1] : '';
    }

    /** Advances to the next article of the page, downloads and stores it. */
    function getArticle() {
      current++;
      if (current === articleList.length) {
        return taskFinish();
      }

      const article = articleList[current];
      // Reached the newest article of the previous run: everything older is
      // already stored. (Loose comparison kept, see getList.)
      if (FinalRepinTime == article.repin_time) {
        finish();
        return;
      }

      article.content = '';
      const url = Config.articleUrl + article.item_id;

      Request({ url: url, encoding: 'utf-8', headers: headers }, function (err, response, body) {
        if (err) {
          LOGGER.error(`Article get error,max repin time:${lastPinTime},article id:${article.item_id}`);
          LOGGER.error(`URL:${url}`);
          LOGGER.error(err);
          return;
        }

        article.content = extractContent(article, body);
        if (article.content === '') {
          // Nothing extractable: skip this article.
          getArticle();
          return;
        }
        insertArticle(article);
      });
    }

    /**
     * Stores one article: a row in `list` with its metadata, then a row in
     * `article` with its content. Both inserts ignore duplicates.
     */
    function insertArticle(article) {
      // type: 2 = video, 1 = gallery, 0 = plain article
      const type = article.has_video ? 2 : article.has_gallery ? 1 : 0;
      const values = [
        sqlString(article.title),
        type,
        sqlString(article.chinese_tag),
        sqlString(article.image_url),
        sqlString(article.abstract),
        // Force a number so a missing count cannot produce invalid SQL.
        Number(article.gallary_image_count) || 0,
        sqlString(article.display_url),
        sqlString(article.item_id)
      ];
      const listSql = `insert IGNORE into list (title,type,tag,cover,abstract,gallaryCount,source,item_id) values(${values.join(',')})`;

      MysqlDB.query(listSql, function (listErr, result) {
        if (listErr) {
          LOGGER.error(`Article insert error,article id:${article.item_id}`);
          LOGGER.error(`SQL:${listSql}`);
          LOGGER.error(listErr);
          return;
        }

        const articleSql = `insert ignore into article (articleID,content) values(${sqlString(article.item_id)},${sqlString(article.content)})`;
        MysqlDB.query(articleSql, function (articleErr, result) {
          if (articleErr) {
            LOGGER.error(`Article insert error,article id:${article.item_id}`);
            LOGGER.error(`SQL:${articleSql}`);
            LOGGER.error(articleErr);
            return;
          }
          articleCount++;
          getArticle();
        });
      });
    }

    /** Called once every article of this page has been handled. */
    function taskFinish() {
      LOGGER.info('List Finished!');
      if (hasMore) {
        LOGGER.info('More List read!!!');
        // The next Task continues from the updated module-level lastPinTime.
        currentTask = null;
        currentTask = new Task();
      } else {
        finish();
      }
    }
  }