webrecorder / warcio.js

JS Streaming WARC IO optimized for Browser and Node
MIT License
30 stars 6 forks source link

CDXIndexer crashes in 2.0.0 #47

Closed matteocargnelutti closed 1 year ago

matteocargnelutti commented 1 year ago

Hello 👋 !

No problem upgrading to 1.6.2, but I'm having issues with 2.0.0.

More specifically, CDXIndexer now crashes when I call its run() method.

`}write(s){this.out.write(this.serialize(s))}async run(s){for await(let e of this.iterIndex(s))this.write(e)}async*iterIndex(s){let e={strictHeaders:!0,parseHttp:this.parseHttp};for(let{filename:t,reader:r}of s){let n=new w(r,e);yield*this.iterRecords(n,t)}}async*iterRecords(s,e){for await(let t of s){await t.skipFully();let r=this.indexRecord(t,s,e);r&&(yield r)}}indexRecord(s,e,t){if(this.filterRecord&&!this.filterRecord(s))return null;let r={},n=e.offset,a=e.recordLength,o={offset:n,length:a,filename:t};for(let l of this.fields)l in o?r[l]=o[l]:this.setField(l,s,r);return r}setField(s,e,t){let r=this.getField(s,e);r!==null&&(t[s]=r)}getField(s,e){return s==="http:status"?e.httpHeaders&&(e.warcType==="response"||e.warcType==="revisit")?e.httpHeaders.statusCode:null:s.startsWith("http:")?e.httpHeaders?e.httpHeaders.headers.get(s.slice(5)):null:e.warcHeaders.headers.get(s)||null}},I=class extends ${constructor(s,e){super(s,e);for(let t of this.fields)if(t.startsWith("http:")){this.parseHttp=!0;break}}},ye="urlkey,timestamp,url,mime,status,digest,length,offset,filename".split(","),ge="urlkey,timestamp,url,mime,status,digest,redirect,meta,length,offset,filename".split(","),O=class extends I{constructor(e,t){super(e,t);switch(this.includeAll=Boolean(t?.all),this.fields=ye,this.parseHttp=!0,this.noSurt=Boolean(t?.noSurt),this._lastRecord=null,t?.format){case"cdxj":this.serialize=this.serializeCDXJ;break;case"cdx":this.serialize=this.serializeCDX11;break;case"json":default:break}}async*iterRecords(e,t){this._lastRecord=null;for await(let n of e){await n.readFully();let a=this.indexRecord(n,e,t);a&&(yield a)}let r=this.indexRecord(null,e,t);r&&(yield r)}filterRecord(e){if(this.includeAll)return!0;let t=e.warcType;return!(t==="request"||t==="warcinfo")}indexRecord(e,t,r){if(this.includeAll)return e?super.indexRecord(e,t,r):null;let n=this._lastRecord;return 
e&&(e._offset=t.offset,e._length=t.recordLength),n?!e||n.warcTargetURI!=e.warcTargetURI?(this._lastRecord=e,this.indexRecordPair(n,null,t,r)):e.warcType==="request"&&n.warcType==="response"?(this._lastRecord=null,this.indexRecordPair(n,e,t,r)):e.warcType==="response"&&n.warcType==="request"?(this._lastRecord=null,this.indexRecordPair(e,n,t,r)):(this._lastRecord=e,this.indexRecordPair(n,null,t,r)):(this._lastRecord=e,null)}indexRecordPair(e,t,r,n){let a,o,l=e.warcTargetURI||"";if(t&&t.httpHeaders&&t.httpHeaders.method!=="GET"){let f={url:l,method:t.httpHeaders.method,headers:t.httpHeaders.headers,postData:t.payload};a=f.method,L(f)&&(o=f.requestBody,e.method=a,e.requestBody=o,l=f.url)}e._urlkey=l;let d=super.indexRecord(e,r,n);return d&&(e&&e._offset!==void 0&&(d.offset=e._offset,d.length=e._length),a&&(d.method=a),o&&(d.requestBody=o)),d}serializeCDXJ(e){let{urlkey:t,timestamp:r}=e;return delete e.urlkey,delete e.timestamp,`${t} ${r} ${JSON.stringify(e)}
                    ^

TypeError: this.out.write is not a function
    at O.write (file://[redacted]/node_modules/warcio/dist/index.js:8:21)
    at O.run (file://[redacted]/node_modules/warcio/dist/index.js:8:101)
    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)

As mentioned earlier: works in 1.6.1 and 1.6.2.

Thanks in advance,

ikreymer commented 1 year ago

Maybe this needs to be documented better, but the constructor params were switched in 2.0.0 so that process.stdout could be removed, avoiding the polyfill in browsers. The params are now:

 constructor(
    out: WritableStreamBuffer | NodeJS.WriteStream,
    opts?: Partial<CdxIndexCommandArgs>,
  )

Perhaps this needs a bit more refactoring: maybe we should remove `out` from the constructor altogether and just make it an iterator instead, and move `out` somewhere separate.

matteocargnelutti commented 1 year ago

Ah I missed that, thank you @ikreymer ! It would be ideal if breaking changes like this were mentioned in the changelog, as they're hard to catch at a glance.

matteocargnelutti commented 1 year ago

PS: Congratulations on this major release! 🥳