extractus / article-extractor

Extract the main article from a given URL with Node.js
https://extractor-demos.pages.dev/article-extractor
MIT License
1.6k stars 140 forks source link

Error: First argument to Readability constructor should be a document object. #310

Closed dimaslanjaka closed 2 years ago

dimaslanjaka commented 2 years ago

article parser version

{
  "article-parser": "^7.2.4"
}

snippet I used

import { extract } from 'article-parser';
import axios from 'axios';

axios
  .get('https://www.webmanajemen.com/chimeraland/sitemap.txt')
  .then((response) => {
    const list = response.data.split(/\r?\n/gm) as string[];
    list.map((url) => {
      extract(url).then((data) => {
        console.log(data);
      });
    });
  });

error logs

dimas@DESKTOP-9JFNTEA /cygdrive/d/Repositories/hexo-backend
$ ts-node "d:\Repositories\hexo-backend\src\standalone\get-links.ts"
d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:54
`)});let n=e.nonTextTags||["script","style","textarea","option"],p,u;e.allowedAttributes&&(p={},u={},rs(e.allowedAttributes,function(v,y){p[y]=[];let T=[];v.forEach(function(P){typeof P=="string"&&P.indexOf("*")>=0?T.push(lm(P).replace(/\\\*/g,".*")):p[y].push(P)}),T.length&&(u[y]=new RegExp("^("+T.join("|")+")$"))}));let o={},d={},c={};rs(e.allowedClasses,function(v,y){p&&(qr(p,y)||(p[y]=[]),p[y].push("class")),o[y]=[],c[y]=[];let T=[];v.forEach(function(P){typeof P=="string"&&P.indexOf("*")>=0?T.push(lm(P).replace(/\\\*/g,".*")):P instanceof RegExp?c[y].push(P):o[y].push(P)}),T.length&&(d[y]=new RegExp("^("+T.join("|")+")$"))});let l={},h;rs(e.transformTags,function(v,y){let T;typeof v=="function"?T=v:typeof v=="string"&&(T=as.simpleTransform(v)),y==="*"?h=T:l[y]=T});let g,_,S,R,O,M,E=!1;A();let V=new Lb.Parser({onopentag:function(v,y){if(e.enforceHtmlBoundary&&v==="html"&&A(),O){M++;return}let T=new s(v,y);_.push(T);let P=!1,De=!!T.text,Le;if(qr(l,v)&&(Le=l[v](v,y),T.attribs=y=Le.attribs,Le.text!==void 0&&(T.innerText=Le.text),v!==Le.tagName&&(T.name=v=Le.tagName,R[g]=Le.tagName)),h&&(Le=h(v,y),T.attribs=y=Le.attribs,v!==Le.tagName&&(T.name=v=Le.tagName,R[g]=Le.tagName)),(e.allowedTags&&e.allowedTags.indexOf(v)===-1||e.disallowedTagsMode==="recursiveEscape"&&!Pb(S)||e.nestingLimit!=null&&g>=e.nestingLimit)&&(P=!0,S[g]=!0,e.disallowedTagsMode==="discard"&&n.indexOf(v)!==-1&&(O=!0,M=1),S[g]=!0),g++,P){if(e.disallowedTagsMode==="discard")return;i=a,a=""}a+="<"+v,v==="script"&&(e.allowedScriptHostnames||e.allowedScriptDomains)&&(T.innerText=""),(!p||qr(p,v)||p["*"])&&rs(y,function(k,Q){if(!kb.test(Q)){delete T.attribs[Q];return}let P1=!1;if(!p||qr(p,v)&&p[v].indexOf(Q)!==-1||p["*"]&&p["*"].indexOf(Q)!==-1||qr(u,v)&&u[v].test(Q)||u["*"]&&u["*"].test(Q))P1=!0;else if(p&&p[v]){for(let $ of p[v])if(Ob($)&&$.name&&$.name===Q){P1=!0;let W="";if($.multiple===!0){let Vt=k.split(" ");for(let Je of Vt)$.values.indexOf(Je)!==-1&&(W===""?W=Je:W+=" "+Je)}else 
$.values.indexOf(k)>=0&&(W=k);k=W}}if(P1){if(e.allowedSchemesAppliedToAttributes.indexOf(Q)!==-1&&X(v,k)){delete T.attribs[Q];return}if(v==="script"&&Q==="src"){let $=!0;try{let W=I(k);if(e.allowedScriptHostnames||e.allowedScriptDomains){let Vt=(e.allowedScriptHostnames||[]).find(function(Ge){return Ge===W.url.hostname}),Je=(e.allowedScriptDomains||[]).find(function(Ge){return W.url.hostname===Ge||W.url.hostname.endsWith(`.${Ge}`)});$=Vt||Je}}catch{$=!1}if(!$){delete T.attribs[Q];return}}if(v==="iframe"&&Q==="src"){let $=!0;try{let W=I(k);if(W.isRelativeUrl)$=qr(e,"allowIframeRelativeUrls")?e.allowIframeRelativeUrls:!e.allowedIframeHostnames&&!e.allowedIframeDomains;else if(e.allowedIframeHostnames||e.allowedIframeDomains){let Vt=(e.allowedIframeHostnames||[]).find(function(Ge){return Ge===W.url.hostname}),Je=(e.allowedIframeDomains||[]).find(function(Ge){return W.url.hostname===Ge||W.url.hostname.endsWith(`.${Ge}`)});$=Vt||Je}}catch{$=!1}if(!$){delete T.attribs[Q];return}}if(Q==="srcset")try{let $=Rb(k);if($.forEach(function(W){X("srcset",W.url)&&(W.evil=!0)}),$=mm($,function(W){return!W.evil}),$.length)k=qb(mm($,function(W){return!W.evil})),T.attribs[Q]=k;else{delete T.attribs[Q];return}}catch{delete T.attribs[Q];return}if(Q==="class"){let $=o[v],W=o["*"],Vt=d[v],Je=c[v],Ge=d["*"],kr=[Vt,Ge].concat(Je).filter(function(l0){return l0});if($&&W?k=de(k,cm($,W),kr):k=de(k,$||W,kr),!k.length){delete 
T.attribs[Q];return}}if(Q==="style")try{let $=Mb(v+" {"+k+"}"),W=J($,e.allowedStyles);if(k=xe(W),k.length===0){delete T.attribs[Q];return}}catch{delete T.attribs[Q];return}a+=" "+Q,k&&k.length&&(a+='="'+K(k,!0)+'"')}else delete T.attribs[Q]}),e.selfClosing.indexOf(v)!==-1?a+=" />":(a+=">",T.innerText&&!De&&!e.textFilter&&(a+=K(T.innerText),E=!0)),P&&(a=i+K(a),i="")},ontext:function(v){if(O)return;let y=_[_.length-1],T;if(y&&(T=y.tag,v=y.innerText!==void 0?y.innerText:v),e.disallowedTagsMode==="discard"&&(T==="script"||T==="style"))a+=v;else{let P=K(v,!1);e.textFilter&&!E?a+=e.textFilter(P,T):E||(a+=P)}if(_.length){let P=_[_.length-1];P.text+=v}},onclosetag:function(v){if(O)if(M--,!M)O=!1;else return;let y=_.pop();if(!y)return;if(y.tag!==v){_.push(y);return}O=e.enforceHtmlBoundary?v==="html":!1,g--;let T=S[g];if(T){if(delete S[g],e.disallowedTagsMode==="discard"){y.updateParentNodeText();return}i=a,a=""}if(R[g]&&(v=R[g],delete R[g]),e.exclusiveFilter&&e.exclusiveFilter(y)){a=a.substr(0,y.tagPosition);return}if(y.updateParentNodeMediaChildren(),y.updateParentNodeText(),e.selfClosing.indexOf(v)!==-1){T&&(a=i,i="");return}a+="</"+v+">",T&&(a=i+K(a),i=""),E=!1}},e.parser);return V.write(t),V.end(),a;function A(){a="",g=0,_=[],S={},R={},O=!1,M=0}function K(v,y){return typeof v!="string"&&(v=v+""),e.parser.decodeEntities&&(v=v.replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;"),y&&(v=v.replace(/"/g,"&quot;"))),v=v.replace(/&(?![a-zA-Z0-9#]{1,20};)/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;"),y&&(v=v.replace(/"/g,"&quot;")),v}function X(v,y){for(y=y.replace(/[\x00-\x20]+/g,"");;){let De=y.indexOf("<!--");if(De===-1)break;let Le=y.indexOf("-->",De+4);if(Le===-1)break;y=y.substring(0,De)+y.substring(Le+3)}let T=y.match(/^([a-zA-Z][a-zA-Z0-9.\-+]*):/);if(!T)return y.match(/^[/\\]{2}/)?!e.allowProtocolRelative:!1;let P=T[1].toLowerCase();return 
qr(e.allowedSchemesByTag,v)?e.allowedSchemesByTag[v].indexOf(P)===-1:!e.allowedSchemes||e.allowedSchemes.indexOf(P)===-1}function I(v){if(v=v.replace(/^(\w+:)?\s*[\\/]\s*[\\/]/,"$1//"),v.startsWith("relative:"))throw new Error("relative: exploit attempt");let y="relative://relative-site";for(let De=0;De<100;De++)y+=`/${De}`;let T=new URL(v,y);return{isRelativeUrl:T&&T.hostname==="relative-site"&&T.protocol==="relative:",url:T}}function J(v,y){if(!y)return v;let T=v.nodes[0],P;return y[T.selector]&&y["*"]?P=cm(y[T.selector],y["*"]):P=y[T.selector]||y["*"],P&&(v.nodes[0].nodes=T.nodes.reduce(ne(P),[])),v}function xe(v){return v.nodes[0].nodes.reduce(function(y,T){return y.push(`${T.prop}:${T.value}${T.important?" !important":""}`),y},[]).join(";")}function ne(v){return function(y,T){return qr(v,T.prop)&&v[T.prop].some(function(De){return De.test(T.value)})&&y.push(T),y}}function de(v,y,T){return y?(v=v.split(/\s+/),v.filter(function(P){return y.indexOf(P)!==-1||T.some(function(De){return De.test(P)})}).join(" ")):v}}var Bb={decodeEntities:!0};as.defaults={allowedTags:["address","article","aside","footer","header","h1","h2","h3","h4","h5","h6","hgroup","main","nav","section","blockquote","dd","div","dl","dt","figcaption","figure","hr","li","main","ol","p","pre","ul","a","abbr","b","bdi","bdo","br","cite","code","data","dfn","em","i","kbd","mark","q","rb","rp","rt","rtc","ruby","s","samp","small","span","strong","sub","sup","time","u","var","wbr","caption","col","colgroup","table","tbody","td","tfoot","th","thead","tr"],disallowedTagsMode:"discard",allowedAttributes:{a:["href","name","target"],img:["src","srcset","alt","title","width","height","loading"]},selfClosing:["img","br","hr","area","base","basefont","input","link","meta"],allowedSchemes:["http","https","ftp","mailto","tel"],allowedSchemesByTag:{},allowedSchemesAppliedToAttributes:["href","src","cite"],allowProtocolRelative:!0,enforceHtmlBoundary:!1};as.simpleTransform=function(t,e,r){return r=r===void 
0?!0:r,e=e||{},function(a,i){let s;if(r)for(s in e)i[s]=e[s];else i=e;return{tagName:t,attribs:i}}}});var _m=b((PO,xm)=>{xm.exports={compareTwoStrings:ym,findBestMatch:Hb};function ym(t,e){if(t=t.replace(/\s+/g,""),e=e.replace(/\s+/g,""),t===e)return 1;if(t.length<2||e.length<2)return 0;let r=new Map;for(let i=0;i<t.length-1;i++){let s=t.substring(i,i+2),n=r.has(s)?r.get(s)+1:1;r.set(s,n)}let a=0;for(let i=0;i<e.length-1;i++){let s=e.substring(i,i+2),n=r.has(s)?r.get(s):0;n>0&&(r.set(s,n-1),a++)}return 2*a/(t.length+e.length-2)}function Hb(t,e){if(!jb(t,e))throw new Error("Bad arguments: First argument should be a string, second should be an array 
of strings");let r=[],a=0;for(let s=0;s<e.length;s++){let n=e[s],p=ym(t,n);r.push({target:n,rating:p}),p>r[a].rating&&(a=s)}let i=r[a];return{ratings:r,bestMatch:i,bestMatchIndex:a}}function jb(t,e){return!(typeof t!="string"||!Array.isArray(e)||!e.length||e.find(function(r){return typeof r!="string"}))}});var Om=b((FO,fd)=>{function Lm(t,e){if(e&&e.documentElement)t=e,e=arguments[2];else if(!t||!t.documentElement)throw new Error("First argument to Readability constructor should be a document object.");if(e=e||{},this._doc=t,this._docJSDOMParser=this._doc.firstChild.__JSDOMParser__,this._articleTitle=null,this._articleByline=null,this._articleDir=null,this._articleSiteName=null,this._attempts=[],this._debug=!!e.debug,this._maxElemsToParse=e.maxElemsToParse||this.DEFAULT_MAX_ELEMS_TO_PARSE,this._nbTopCandidates=e.nbTopCandidates||this.DEFAULT_N_TOP_CANDIDATES,this._charThreshold=e.charThreshold||this.DEFAULT_CHAR_THRESHOLD,this._classesToPreserve=this.CLASSES_TO_PRESERVE.concat(e.classesToPreserve||[]),this._keepClasses=!!e.keepClasses,this._serializer=e.serializer||function(r){return r.innerHTML},this._disableJSONLD=!!e.disableJSONLD,this._flags=this.FLAG_STRIP_UNLIKELYS|this.FLAG_WEIGHT_CLASSES|this.FLAG_CLEAN_CONDITIONALLY,this._debug){let r=function(a){if(a.nodeType==a.TEXT_NODE)return`${a.nodeName} ("${a.textContent}")`;let i=Array.from(a.attributes||[],function(s){return`${s.name}="${s.value}"`}).join(" ");return`<${a.localName} ${i}>`};this.log=function(){if(typeof dump<"u"){var a=Array.prototype.map.call(arguments,function(i){return i&&i.nodeName?r(i):i}).join(" ");dump("Reader: (Readability) "+a+`
----LONG WHITESPACE HERE, I DELETED IT----
            ^
Error: First argument to Readability constructor should be a document object.
    at new Lm (d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:54:8356)
    at qm (d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:64:3456)
    at bd (d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:64:4693)
    at Yb (d:\Repositories\hexo-backend\node_modules\article-parser\dist\cjs\article-parser.js:64:5168)
    at d:\Repositories\hexo-backend\src\standalone\get-links.ts:9:14
    at Array.map (<anonymous>)
    at d:\Repositories\hexo-backend\src\standalone\get-links.ts:8:10
    at processTicksAndRejections (node:internal/process/task_queues:96:5)
ndaidong commented 2 years ago

@dimaslanjaka the error message looks strange! In the function extractWithReadability, we accept an HTML string and always convert it to a Document object before initializing the Readability instance.

Could you check each URL from your sitemap to see exactly which one causes the error?

dimaslanjaka commented 2 years ago

> @dimaslanjaka the error message looks strange! In the function extractWithReadability, we accept an HTML string and always convert it to a Document object before initializing the Readability instance.
>
> Could you check each URL from your sitemap to see exactly which one causes the error?

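Fixed by removing empty strings from the list of URLs with `.filter(str => str.trim().length > 0)` — the sitemap text produced blank lines when split, and each blank entry was being passed to `extract()`.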