// analysis.js
  'use strict';

  // config.js is loaded purely for its side effects.
  // NOTE(review): the globals used throughout this script (Config, ROOTPATH, LOGGER,
  // FILE, FinalRepinTime, FinalRepinFile) are not defined in this file and are
  // presumably installed by config.js — confirm.
  require('./config.js');
  global.Request = require('request');

  // Crawl state shared by init(), finish() and every Task instance.
  let lastPinTime = 0;      // max_repin_time of the last list page read; 0 means "first page"
  let firstRepinTime = 0;   // repin_time of the first entry on the first page; becomes the new checkpoint
  let currentTask = null;   // the Task currently running
  let articleCount = 0;     // number of articles stored during this run

  // Not referenced anywhere else in this file: the pool below is built from
  // Config.mysqlConfig directly. Kept in case other code depends on it.
  const mysqlConfig = {
    pool: {
      name: 'TouTiaoFav',
      maxconn: 5
    },
    db: Config.mysqlConfig
  };

  // NOTE(review): this is a hard-coded login session (sessionid / sid_tt / uid_tt ...).
  // It should come from configuration or the environment rather than source control.
  const sessionCookie = 'install_id=20457824635; ttreq=1$76925223765b5c30e70d4d35057e7c8d0f6abce6; alert_coverage=20; qh[360]=1; _ga=GA1.2.1897556923.1505932678; sessionid=6a2793f3363dc9444631d6ea4a909996; sid_guard=6a2793f3363dc9444631d6ea4a909996%7C1514306238%7C2592000%7CThu%2C+25-Jan-2018+16%3A37%3A18+GMT; sid_tt=6a2793f3363dc9444631d6ea4a909996; uid_tt=72dc0018060086d5c191476eaa449db1; login_flag=dfcae148f63adb82f58df7993787ba70; odin_tt=9715d5e76f01d8bfac4f8518405e218913dde1adb7dd677f9ec02c4603d5034af08b0568d8c103c5d264d57e1d62cd08; UM_distinctid=15eebdefca9236-0e5d58bc43fb0e-2834516c-4a640-15eebdefcaa9a2';

  // Headers sent with every list and article request; both cookie headers carry
  // the same value.
  const headers = {
    'Host': 'www.toutiao.com',
    'User-Agent': 'Paw/3.1 (Macintosh; OS X/10.12.6) GCDHTTPRequest',
    'Cookie': sessionCookie,
    'X-SS-Cookie': sessionCookie
  };

  global.MysqlPool = require(ROOTPATH + '/lib/mysql-pool.js').instance(Config.mysqlConfig);

  // Entry point: check the database first; the crawl starts from its callback.
  testMysql();
  /**
   * Checks that the database is reachable and, if it is, starts the crawl.
   *
   * Takes a connection from the global MysqlPool and runs `SELECT VERSION()`.
   * On success it logs the server version, installs `global.MysqlDB` and calls
   * init(). On any failure (no connection, or the query fails) it logs the error
   * and returns without starting anything.
   *
   * NOTE(review): the connection obtained here is never handed back to the pool.
   * The pool is a project module whose release API is not visible from this
   * file, so that is left unchanged — confirm whether a release call is needed.
   */
  function testMysql(){
    MysqlPool.getConnection(function(connErr, con){
      // A failed getConnection used to fall through and dereference `con`.
      if(connErr || !con){
        LOGGER.error('Mysql Connect error,please recheck your config');
        LOGGER.error(connErr);
        return;
      }
      con.query('SELECT VERSION() as version', function(queryErr, result){
        if(queryErr){
          LOGGER.error('Mysql Connect error,please recheck your config');
          LOGGER.error(queryErr);
          return;
        }
        LOGGER.info('Mysql Connect success');
        LOGGER.info('Mysql Version: ' + result[0]['version'] + ' | User: ' + Config.mysqlConfig.user + ' | Database: ' + Config.mysqlConfig.database);
        // MysqlDB is only loaded once the connection has been proven to work.
        global.MysqlDB = require(ROOTPATH + '/lib/mysqldb.js');
        init();
      });
    });
  }
  /**
   * Starts the crawl: logs the stored checkpoint and creates the first Task.
   *
   * FinalRepinTime is multiplied by 1000 before being handed to Date, i.e. it is
   * treated as a timestamp in seconds. The new Task is kept in `currentTask`.
   */
  function init(){
    LOGGER.info(`Last Pin Time: ${FinalRepinTime}|${new Date(FinalRepinTime*1000)}`);
    currentTask = new Task();
  }
  /**
   * Ends the run: stores the new checkpoint, logs a summary and closes the pool.
   *
   * The global FinalRepinTime is replaced by `firstRepinTime` (the repin_time of
   * the first entry seen on the first list page) and written to FinalRepinFile.
   */
  function finish(){
    FinalRepinTime = firstRepinTime;
    // The checkpoint is a number; it is stringified explicitly because, if FILE
    // is Node's fs module (presumably — confirm), writeFileSync only accepts a
    // string, Buffer, TypedArray or DataView as data on current Node versions.
    FILE.writeFileSync(FinalRepinFile, String(FinalRepinTime), 'UTF-8');
    LOGGER.info('--------------------------------------------------------------');
    LOGGER.info('All fav article download finished!!!');
    LOGGER.info(`Article analysis count [ ${articleCount} ]`);
    LOGGER.info('--------------------------------------------------------------');
    MysqlPool.end();
  }
  /**
   * Processes one page of the favourites list.
   *
   * Constructing a Task immediately requests a list page (Config.listUrl), then
   * walks the returned entries one at a time: each article page is downloaded
   * (Config.articleUrl + item_id), its content is extracted, and one row is
   * inserted into `list` and one into `article`. When the page is exhausted a
   * new Task is created if the response said there is more, otherwise finish()
   * is called. finish() is also called as soon as an entry whose repin_time
   * equals the stored FinalRepinTime checkpoint is reached.
   *
   * Any request or insert failure is logged and stops the crawl at that point
   * without calling finish(), so the checkpoint is not advanced.
   * NOTE(review): in that case the MySQL pool is also left open — confirm that
   * this is intended.
   */
  function Task(){
    // Characters that must be backslash-escaped inside a MySQL string literal.
    const SQL_ESCAPES = {
      '\0': '\\0',
      '\x08': '\\b',
      '\t': '\\t',
      '\n': '\\n',
      '\r': '\\r',
      '\x1a': '\\Z',
      '"': '\\"',
      '\'': '\\\'',
      '\\': '\\\\'
    };
    const SQL_UNSAFE = /[\0\x08\t\n\r\x1a"'\\]/g;

    let articleList = [];
    let current = -1;
    let hasMore = false;
    getList();

    /**
     * Renders a value as a double-quoted MySQL string literal. Backslashes are
     * escaped as well as quotes; escaping only `"` let a backslash in a title or
     * article body corrupt the stored text or break the statement. A missing
     * value becomes an empty string instead of the text "undefined".
     * NOTE(review): parameterized queries would be preferable, but the MysqlDB
     * wrapper's signature is not visible from this file.
     */
    function sqlString(value){
      const text = value == null ? '' : String(value);
      return `"${text.replace(SQL_UNSAFE, function(ch){ return SQL_ESCAPES[ch]; })}"`;
    }

    /** Renders a value as a numeric SQL literal; anything non-numeric becomes 0. */
    function sqlNumber(value){
      const parsed = Number(value);
      return Number.isFinite(parsed) ? parsed : 0;
    }

    /** Requests one page of the list and starts processing its entries. */
    function getList(){
      const args = [
        'page_type=2',
        `user_id=${Config.uid}`,
        'max_behot_time=0',
        'count=20'
      ];
      // lastPinTime is still 0 on the first page; later pages continue from it.
      if(lastPinTime>0) args.push(`max_repin_time=${lastPinTime}`);
      const url = Config.listUrl + args.join('&');
      Request({url:url,encoding: 'utf-8',json:true,headers:headers},function(err,response,body){
        if(err){
          LOGGER.error(`List get error,max repin time:${lastPinTime}`);
          LOGGER.error(`URL:${url}`);
          LOGGER.error(err);
          return;
        }
        const isFirstPage = lastPinTime==0;
        // Without this check an unparsed body or an empty first page throws on
        // body.data[0] inside the request callback.
        if(!body || !Array.isArray(body.data) || (isFirstPage && body.data.length===0)){
          LOGGER.error(`List response malformed or empty,max repin time:${lastPinTime}`);
          LOGGER.error(`URL:${url}`);
          return;
        }
        if(isFirstPage){
          firstRepinTime = body.data[0].repin_time;
          // Loose comparison on purpose: the stored checkpoint may be a string
          // while repin_time is a number.
          if(FinalRepinTime==firstRepinTime){
            finish();
            return;
          }
        }
        hasMore = body.has_more;
        lastPinTime = body.max_repin_time;
        articleList = body.data;
        getArticle();
      });
    }

    /**
     * Pulls the storable content out of an article page.
     * Gallery entries yield a comma-separated list of image URLs, video entries
     * the share URL, everything else the raw article body. Returns '' when the
     * expected marker is not present in the page.
     */
    function extractContent(article, body){
      let found;
      if(article.has_gallery){
        found = body.match(/gallery: (.*),/);
        if(!found || found.length<2) return '';
        let gallery = found[1];
        if(gallery.match(/JSON\.parse\("(.*)"\)/)){
          // NOTE(review): SECURITY — this evaluates text taken from a remote page
          // inside this process. It is kept because the page embeds a JS
          // expression here, but it should be replaced by real parsing.
          // NOTE(review): unlike the branch below, this iterates the evaluated
          // value itself rather than its `sub_images` — confirm that is intended.
          gallery = eval(gallery);
        }else{
          gallery = JSON.parse(gallery).sub_images;
        }
        const images = gallery || {};
        return Object.keys(images).map(function(key){ return images[key].url; }).join(',');
      }
      if(article.has_video){
        found = body.match(/shareUrl: '(.*)'/);
        return (found && found.length>=2) ? found[1] : '';
      }
      found = body.match(/content: '(.*)'\.replace/);
      // Returned unescaped; quoting happens once, when the SQL is built.
      return (found && found.length>=2) ? found[1] : '';
    }

    /** Advances to the next list entry, downloads it and hands it to insertArticle. */
    function getArticle(){
      current++;
      if(current===articleList.length){
        return taskFinish();
      }
      const article = articleList[current];
      // Reached the entry recorded by the previous run: everything older is
      // already stored. Loose comparison for the same reason as in getList.
      if(FinalRepinTime == article.repin_time){
        finish();
        return;
      }
      article.content = '';
      const url = Config.articleUrl + article.item_id;
      Request({url:url,encoding: 'utf-8',headers:headers},function(err,response,body){
        if(err){
          LOGGER.error(`Article get error,max repin time:${lastPinTime},article id:${article.item_id}`);
          LOGGER.error(`URL:${url}`);
          LOGGER.error(err);
          return;
        }
        article.content = extractContent(article, body);
        if(article.content===''){
          // Nothing extractable: skip this entry and move on.
          getArticle();
          return;
        }
        insertArticle(article);
      });
    }

    /** Stores the list row, then the article row, then continues with the next entry. */
    function insertArticle(article){
      // 2 = video, 1 = gallery, 0 = plain article.
      const type = article.has_video?2:article.has_gallery?1:0;
      const values = [
        sqlString(article.title),
        type,
        sqlString(article.chinese_tag),
        sqlString(article.image_url),
        sqlString(article.abstract),
        sqlNumber(article.gallary_image_count),
        sqlString(article.display_url),
        sqlString(article.item_id)
      ];
      const listSql = `insert IGNORE into list (title,type,tag,cover,abstract,gallaryCount,source,item_id) values(${values.join(',')})`;
      MysqlDB.query(listSql,function(listErr){
        if(listErr){
          LOGGER.error(`Article insert error,article id:${article.item_id}`);
          LOGGER.error(`SQL:${listSql}`);
          LOGGER.error(listErr);
          return;
        }
        const articleSql = `insert ignore into article (articleID,content) values(${sqlString(article.item_id)},${sqlString(article.content)})`;
        MysqlDB.query(articleSql,function(articleErr){
          if(articleErr){
            LOGGER.error(`Article insert error,article id:${article.item_id}`);
            LOGGER.error(`SQL:${articleSql}`);
            LOGGER.error(articleErr);
            return;
          }
          articleCount++;
          getArticle();
        });
      });
    }

    /** Called when every entry of this page is done: next page, or finish. */
    function taskFinish(){
      LOGGER.info('List Finished!');
      if(!hasMore){
        finish();
        return;
      }
      LOGGER.info('More List read!!!');
      currentTask = new Task();
    }
  }