chartbeat-labs / textacy

NLP, before and after spaCy
https://textacy.readthedocs.io

UnicodeDecodeError: 'ascii' codec can't decode byte... #210

Closed mehdipiraee closed 5 years ago

mehdipiraee commented 6 years ago

Initializing a corpus fails with the following error:

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 1499: ordinal not in range(128)

Steps to Reproduce (for bugs)

import textacy
import textacy.datasets

# Download the CapitolWords dataset and stream its first 50 records
cw = textacy.datasets.CapitolWords()
cw.download()
records = cw.records(limit=50)

# Split each record into its text and its accompanying metadata
text_stream, metadata_stream = textacy.io.split_records(records, 'text')
corpus = textacy.Corpus('en', texts=text_stream, metadatas=metadata_stream)

Your Environment

Traceback:

---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-301-f9749e048b96> in <module>()
      1 records = cw.records(limit=50)
      2 text_stream, metadata_stream = textacy.io.split_records(records, 'text')
----> 3 corpus = textacy.Corpus('en', texts=text_stream, metadatas=metadata_stream)
      4 # for text in cw.texts():
      5 #      print(text)

~/anaconda3/lib/python3.6/site-packages/textacy/corpus.py in __init__(self, lang, texts, docs, metadatas)
    156             raise ValueError(msg)
    157         if texts:
--> 158             self.add_texts(texts, metadatas=metadatas)
    159         elif docs:
    160             if metadatas:

~/anaconda3/lib/python3.6/site-packages/textacy/corpus.py in add_texts(self, texts, metadatas, n_threads, batch_size)
    289             texts, n_threads=n_threads, batch_size=batch_size)
    290         if metadatas:
--> 291             for i, (spacy_doc, metadata) in enumerate(compat.zip_(spacy_docs, metadatas)):
    292                 self._add_textacy_doc(
    293                     Doc(spacy_doc, lang=self.spacy_lang, metadata=metadata))

~/anaconda3/lib/python3.6/site-packages/spacy/language.py in pipe(self, texts, as_tuples, n_threads, batch_size, disable, cleanup)
    576         original_strings_data = None
    577         nr_seen = 0
--> 578         for doc in docs:
    579             yield doc
    580             if cleanup:

nn_parser.pyx in pipe()

~/anaconda3/lib/python3.6/site-packages/cytoolz/itertoolz.pyx in cytoolz.itertoolz.partition_all.__next__ (cytoolz/itertoolz.c:14538)()

nn_parser.pyx in pipe()

~/anaconda3/lib/python3.6/site-packages/cytoolz/itertoolz.pyx in cytoolz.itertoolz.partition_all.__next__ (cytoolz/itertoolz.c:14538)()

pipeline.pyx in pipe()

~/anaconda3/lib/python3.6/site-packages/cytoolz/itertoolz.pyx in cytoolz.itertoolz.partition_all.__next__ (cytoolz/itertoolz.c:14538)()

~/anaconda3/lib/python3.6/site-packages/spacy/language.py in <genexpr>(.0)
    555                 yield (doc, context)
    556             return
--> 557         docs = (self.make_doc(text) for text in texts)
    558         for name, proc in self.pipeline:
    559             if name in disable:

~/anaconda3/lib/python3.6/site-packages/cytoolz/itertoolz.pyx in cytoolz.itertoolz._pluck_index.__next__ (cytoolz/itertoolz.c:15112)()

~/anaconda3/lib/python3.6/site-packages/textacy/io/utils.py in <genexpr>(.0)
    247         return ((item.pop(content_field), item) for item in items)
    248     else:
--> 249         return unzip(((item.pop(content_field), item) for item in items))
    250 
    251 

~/anaconda3/lib/python3.6/site-packages/textacy/datasets/capitol_words.py in records(self, speaker_name, speaker_party, chamber, congress, date_range, min_len, limit)
    236             False, speaker_name, speaker_party, chamber, congress, date_range,
    237             min_len, limit)
--> 238         for record in records:
    239             yield record
    240 

~/anaconda3/lib/python3.6/site-packages/textacy/datasets/capitol_words.py in _iterate(self, text_only, speaker_name, speaker_party, chamber, congress, date_range, min_len, limit)
    281         n = 0
    282         mode = 'rb' if compat.is_python2 else 'rt'  # TODO: check this
--> 283         for line in io.read_json(self.filename, mode=mode, lines=True):
    284 
    285             if speaker_name and line['speaker_name'] not in speaker_name:

~/anaconda3/lib/python3.6/site-packages/textacy/io/json.py in read_json(fname, mode, encoding, lines)
     50             yield json.load(f)
     51         elif lines is True:
---> 52             for line in f:
     53                 yield json.loads(line)
     54         elif isinstance(lines, compat.string_types):

~/anaconda3/lib/python3.6/encodings/ascii.py in decode(self, input, final)
     24 class IncrementalDecoder(codecs.IncrementalDecoder):
     25     def decode(self, input, final=False):
---> 26         return codecs.ascii_decode(input, self.errors)[0]
     27 
     28 class StreamWriter(Codec,codecs.StreamWriter):

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 1499: ordinal not in range(128)
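The failing byte `0xe2` is the first byte of a multi-byte UTF-8 sequence; the decode error can be reproduced in isolation, independent of textacy, with a minimal sketch like this:

```python
# 0xe2 0x80 0x94 is the UTF-8 encoding of U+2014 (an em dash), a character
# that appears in the CapitolWords transcripts but that the ascii codec
# cannot decode.
data = b"\xe2\x80\x94"

try:
    data.decode("ascii")
except UnicodeDecodeError as e:
    # Mirrors the error in the traceback above: 0xe2 is outside range(128).
    print(e.reason)  # -> "ordinal not in range(128)"

# Decoding the same bytes as UTF-8 succeeds:
assert data.decode("utf-8") == "\u2014"
```

This shows the problem is not the file's contents but the codec chosen when the file was opened.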
bdewilde commented 5 years ago

Hi @mehdipiraee, thanks for your patience and apologies for the delay; I didn't have bandwidth to develop textacy for a few months.

Unfortunately, I can't reproduce your error in my dev environment, even though I'm also on macOS in Py3.6:

[screenshot, 2018-12-02: the reproduction steps above completing without error]

These sorts of errors are a big pain and can be tricky to debug. (In fact, the spaCy folks just released a package that may help with this, srsly, which I may start using.) When opening the file, textacy should be using mode="rt", encoding=None, which I would expect to resolve to "utf-8" on most systems, but you can see in the stacktrace that it's not. Is your Python's default encoding "ascii"?
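The question above can be answered directly from the standard library. A quick sketch that reports both relevant defaults, and shows how to make a read locale-independent by forcing UTF-8 (`read_utf8_lines` and `path` are illustrative names, not part of textacy's API):

```python
import locale
import sys

# sys.getdefaultencoding() governs implicit str/bytes conversions and is
# "utf-8" on Python 3. locale.getpreferredencoding() is what open() falls
# back to when encoding=None; under a misconfigured locale (e.g. LC_ALL=C)
# it can be "ascii" or "US-ASCII", which would produce exactly this error.
print("default:  ", sys.getdefaultencoding())
print("preferred:", locale.getpreferredencoding())

# Passing encoding explicitly makes the read independent of the locale;
# `path` stands in for the dataset file that read_json() was opening.
def read_utf8_lines(path):
    with open(path, mode="rt", encoding="utf-8") as f:
        for line in f:
            yield line
```

At the environment level, the usual fix is to export a UTF-8 locale (e.g. `LC_ALL=en_US.UTF-8`) before launching Python, so that `locale.getpreferredencoding()` no longer resolves to ascii.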