example output (original_text replaced the text field):
{
"id": "http://03175fb.netsolhost.com/WordPress/2015/07/dusky-dark/",
"metadata": {
"Content-Length": "77351",
"Content-Type": "application/http; msgtype=response",
"WARC-Block-Digest": "sha1:FDUTPAJ7SY6FQMVNT66RTJ2RMPUIS5T6",
"WARC-Concurrent-To": "<urn:uuid:4579af60-8adc-4d5d-91d9-553573949d22>",
"WARC-Date": "2018-07-20T12:39:00Z",
"WARC-IP-Address": "206.188.192.17",
"WARC-Identified-Payload-Type": "application/xhtml+xml",
"WARC-Payload-Digest": "sha1:WYZQRB5LLK6R5AHCXYNECRWJ4RSD4Q2I",
"WARC-Record-ID": "<urn:uuid:f9988f8d-27c8-405e-9a6d-14e0d4cc35ff>",
"WARC-Target-URI": "http://03175fb.netsolhost.com/WordPress/2015/07/dusky-dark/",
"WARC-Type": "response",
"WARC-Warcinfo-ID": "<urn:uuid:efbf0c0e-d410-49e7-8b79-058e56bd64d5>",
"bff_contained_ngram_count_before_dedupe": 0,
"fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_prob": 0.21563804149627688,
"language_id_whole_page_fasttext": {
"en": 0.8151143789291382
},
"original_text": "Venus is brightest, as it usually is. Jupiter’s just up to the left.\n\n“Evening stars” just days before the last nearly perfect conjunction.\n\nThe moon has slid up and sidewards, as it will do, from any conjunction,\nshe’ll be back in a moonth.\n\nBut the weird light is almost a solid thing.\n\nA flower you could stand on.",
"previous_word_count": 60,
"provenance": "0000_dclm_shard_00000958.jsonl.zstd:1",
"url": "http://03175fb.netsolhost.com/WordPress/2015/07/dusky-dark/",
"warcinfo": "robots: classic\r\nhostname: ip-10-13-227-44.ec2.internal\r\nsoftware: Nutch 1.6 (CC)\r\nisPartOf: CC-MAIN-2018-30\r\noperator: Common Crawl Admin\r\ndescription: Wide crawl of the web for July 2018\r\npublisher: Common Crawl\r\nformat: WARC File Format 1.0\r\nconformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"
},
"source": "dclm-hero-run-fasttext_for_HF",
"text": "Venus is brightest, as it usually is. Jupiter’s just up to the left.\n\n“Evening stars” just days before the last nearly perfect conjunction.\n\nThe moon has slid up and sidewards, as it will do, from any conjunction,\nshe’ll be back in a moonth.\n\nBut the weird light is almost a solid thing.\n\nA flower you could stand on.",
"version": "1.0"
}
example output (original_text replaced the text field):