disinfoRG / ZeroScraper

Web scraper made by 0archive.
https://0archive.tw
MIT License
10 stars 2 forks source link

use api for 今日頭條 #109

Open andreawwenyi opened 4 years ago

andreawwenyi commented 4 years ago

今日頭條's article list does not load when using Selenium. We could use their API (https://www.toutiao.com/api/pc/realtime_news/) to collect article URLs instead.

andreawwenyi commented 4 years ago

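ToutiaoDiscoverSpider worked for a while; however, recently (after May), there is no useful information in the snapshots of Toutiao articles (an example is shown below): the page body is empty, and the only content is a script that sets an `__ac_signature` cookie and reloads the page. Using Selenium does not help. We are deactivating Toutiao for now.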

<html><head><meta charset="UTF-8" />
  <script>!function(a){var e="https://s.go-mpulse.net/boomerang/",t="addEventListener";if("False"=="True")a.BOOMR_config=a.BOOMR_config||{},a.BOOMR_config.PageParams=a.BOOMR_config.PageParams||{},a.BOOMR_config.PageParams.pci=!0,e="https://s2.go-mpulse.net/boomerang/";if(window.BOOMR_API_key="3SNHB-N5CT4-EEMK6-MJGN4-6D8V7",function(){function n(e){a.BOOMR_onload=e&&e.timeStamp||(new Date).getTime()}if(!a.BOOMR||!a.BOOMR.version&&!a.BOOMR.snippetExecuted){a.BOOMR=a.BOOMR||{},a.BOOMR.snippetExecuted=!0;var i,o,_,r=document.createElement("iframe");if(a[t])a[t]("load",n,!1);else if(a.attachEvent)a.attachEvent("onload",n);r.src="javascript:void(0)",r.title="",r.role="presentation",(r.frameElement||r).style.cssText="width:0;height:0;border:0;display:none;",_=document.getElementsByTagName("script")[0],_.parentNode.insertBefore(r,_);try{o=r.contentWindow.document}catch(O){i=document.domain,r.src="javascript:var d=document.open();d.domain='"+i+"';void(0);",o=r.contentWindow.document}o.open()._l=function(){var a=this.createElement("script");if(i)this.domain=i;a.id="boomr-if-as",a.src=e+"3SNHB-N5CT4-EEMK6-MJGN4-6D8V7",BOOMR_lstart=(new Date).getTime(),this.body.appendChild(a)},o.write("<bo"+'dy onload="document._l();">'),o.close()}}(),"".length>0)if(a&&"performance"in a&&a.performance&&"function"==typeof a.performance.setResourceTimingBufferSize)a.performance.setResourceTimingBufferSize();!function(){if(BOOMR=a.BOOMR||{},BOOMR.plugins=BOOMR.plugins||{},!BOOMR.plugins.AK){var 
e=""=="true"?1:0,t="",n="rorhx5yxfqdtaxvtxmfa-f-7e8c8060d-clientnsv4-s.akamaihd.net",i={"ak.v":"27","ak.cp":"617858","ak.ai":parseInt("502054",10),"ak.ol":"0","ak.cr":6976,"ak.ipv":4,"ak.proto":"","ak.rid":"405c08b","ak.r":37234,"ak.a2":e,"ak.m":"r","ak.n":"ff","ak.bpcip":"139.162.123.0","ak.cport":48572,"ak.gh":"23.44.7.44","ak.quicv":"","ak.tlsv":"tls1.3","ak.0rtt":"","ak.csrc":"-","ak.acc":"reno","ak.t":"1588837130","ak.ak":"hOBiQwZUYzCg5VSAfCLimQ==9NuGMXnfwsJdKPC8vprWvfwGYbr09+AINIyQKBL60Dgtw2EPvtrOVv/kkhBGIY6D7Tk8PYhpC9hQ2bhQxE8fdkSw0sq+vz5+67xTklgGSlCKz27jrQOJFlUJEcZf75B7tTSXEsCXelIJ/CiIEcftwrcKU2VwiYpMeDzYBv1/OQoHW6uhUeJK8M84PQd9C+vwcndr9WrZY8V2J6T0BEBCc9hhmyFjxTcgi8nFrciBD58PZRqkMRMkpDQdIKmUyEHooTh7u4mROSjbSInqI9MnIYyvgoVYhpT9nF9g9buw7zfvshAqL1yW8X28Fb4QC0iFecA7Sgahg72cbqd7ntMloCjDM/WB3onaVFHbSDk2g6QDgm6Za3EgEx6FW1JKnvHvkx3fizd5Znst+jl2SnuE8ttPrgyx5neNxuNIl11xyHM=","ak.pv":"7"};if(""!==t)i["ak.ruds"]=t;var o={i:!1,av:function(e){var t="http.initiator";if(e&&(!e[t]||"spa_hard"===e[t]))i["ak.feo"]=void 0!==a.aFeoApplied?1:0,BOOMR.addVar(i)},rv:function(){var a=["ak.bpcip","ak.cport","ak.cr","ak.csrc","ak.gh","ak.ipv","ak.m","ak.n","ak.ol","ak.proto","ak.quicv","ak.tlsv","ak.0rtt","ak.r","ak.acc","ak.t"];BOOMR.removeVar(a)}};BOOMR.plugins.AK={akVars:i,akDNSPreFetchDomain:n,init:function(){if(!o.i){var a=BOOMR.subscribe;a("before_beacon",o.av,null,null),a("onbeacon",o.rv,null,null),o.i=!0}return this},is_complete:function(){return!0}}}}()}(window);</script></head><body></body><script src='https://sf1-ttcdn-tos.pstatp.com/obj/rc-web-sdk/acrawler.js'></script><script>window.byted_acrawler.init({aid:99999999,dfp:!0});var b;a:{for(var c=document.cookie.split(/[;&]/),d,e=0;e<c.length;e++){for(d=c[e];" "===d.charAt(0);)d=d.substring(1,d.length);if(0===d.indexOf("__ac_nonce=")){b=d.substring(11,d.length);break a}}b=""}var f=b;var g=window.byted_acrawler.sign("",f);document.cookie="__ac_signature=; expires=Mon, 20 Sep 1970 00:00:00 UTC; path=/;";
document.cookie="__ac_signature="+g+"; expires="+(new Date((new Date).getTime()+18E5)).toGMTString()+"; path=/;";window.location.reload();</script></html>