datasets / covid-19

Novel Coronavirus 2019 time series data on cases
https://datahub.io/core/covid-19
1.16k stars 605 forks source link

Executing process.py on 3/11/2020 gets ValidationError #6

Closed mpEsri closed 4 years ago

mpEsri commented 4 years ago

Here's the traceback:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/dataflows/base/schema_validator.py", line 49, in schema_validator
    row[f.name] = f.cast_value(row.get(f.name))
  File "/usr/local/lib/python3.7/site-packages/tableschema/field.py", line 149, in cast_value
    ).format(field=self, value=value))
datapackage.exceptions.CastError: Field "Deaths" can't cast value "None" for type "number" with format "default"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "process.py", line 60, in <module>
    dump_to_path()
  File "/usr/local/lib/python3.7/site-packages/dataflows/base/flow.py", line 12, in results
    return self._chain().results(on_error=on_error)
  File "/usr/local/lib/python3.7/site-packages/dataflows/base/datastream_processor.py", line 96, in results
    for res in ds.res_iter
  File "/usr/local/lib/python3.7/site-packages/dataflows/base/datastream_processor.py", line 96, in <listcomp>
    for res in ds.res_iter
  File "/usr/local/lib/python3.7/site-packages/dataflows/base/schema_validator.py", line 46, in schema_validator
    for i, row in enumerate(iterator):
  File "/usr/local/lib/python3.7/site-packages/dataflows/processors/dumpers/dumper_base.py", line 69, in row_counter
    for row in iterator:
  File "/usr/local/lib/python3.7/site-packages/dataflows/processors/dumpers/file_dumper.py", line 76, in rows_processor
    for row in resource:
  File "/usr/local/lib/python3.7/site-packages/dataflows/base/schema_validator.py", line 51, in schema_validator
    if not on_error(resource['name'], row, i, e):
  File "/usr/local/lib/python3.7/site-packages/dataflows/base/schema_validator.py", line 22, in raise_exception
    raise ValidationError(res_name, row, i, e)
dataflows.base.schema_validator.ValidationError: ROW: {'Date': datetime.date(2020, 3, 11), 'Province/State': 'Anhui', 'Country/Region': 'Mainland China', 'Lat': Decimal('31.8257'), 'Long': Decimal('117.2264'), 'Confirmed': None, 'Recovered': None, 'Deaths': 'None'}

binarytrails commented 4 years ago

bump:

---------------------------------------------------------------------------
ValidationError                           Traceback (most recent call last)
<ipython-input-11-4036c1aa3210> in <module>
     18 extra_value = {'name': 'Case', 'type': 'number'}
     19 
---> 20 Flow(
     21       load(f'{BASE_URL}{CONFIRMED}'),
     22       load(f'{BASE_URL}{RECOVERED}'),

~/.local/lib/python3.8/site-packages/dataflows/base/flow.py in results(self, on_error)
     10 
     11     def results(self, on_error=None):
---> 12         return self._chain().results(on_error=on_error)
     13 
     14     def process(self):

~/.local/lib/python3.8/site-packages/dataflows/base/datastream_processor.py in results(self, on_error)
     92     def results(self, on_error=None):
     93         ds = self._process()
---> 94         results = [
     95             list(schema_validator(res.res, res, on_error=on_error))
     96             for res in ds.res_iter

~/.local/lib/python3.8/site-packages/dataflows/base/datastream_processor.py in <listcomp>(.0)
     93         ds = self._process()
     94         results = [
---> 95             list(schema_validator(res.res, res, on_error=on_error))
     96             for res in ds.res_iter
     97         ]

~/.local/lib/python3.8/site-packages/dataflows/base/schema_validator.py in schema_validator(resource, iterator, field_names, on_error)
     44         field_names = [f.name for f in schema.fields]
     45     schema_fields = [f for f in schema.fields if f.name in field_names]
---> 46     for i, row in enumerate(iterator):
     47         try:
     48             for f in schema_fields:

~/.local/lib/python3.8/site-packages/dataflows/processors/dumpers/dumper_base.py in row_counter(self, resource, iterator)
     67     def row_counter(self, resource, iterator):
     68         counter = 0
---> 69         for row in iterator:
     70             counter += 1
     71             yield row

~/.local/lib/python3.8/site-packages/dataflows/processors/dumpers/file_dumper.py in rows_processor(self, resource, writer, temp_file)
     74 
     75     def rows_processor(self, resource, writer, temp_file):
---> 76         for row in resource:
     77             writer.write_row(row)
     78             yield row

~/.local/lib/python3.8/site-packages/dataflows/base/schema_validator.py in schema_validator(resource, iterator, field_names, on_error)
     49                 row[f.name] = f.cast_value(row.get(f.name))
     50         except CastError as e:
---> 51             if not on_error(resource['name'], row, i, e):
     52                 continue
     53 

~/.local/lib/python3.8/site-packages/dataflows/base/schema_validator.py in raise_exception(res_name, row, i, e)
     20 
     21 def raise_exception(res_name, row, i, e):
---> 22     raise ValidationError(res_name, row, i, e)
     23 
     24 

ValidationError: 
ROW: {'Date': datetime.date(2020, 3, 14), 'Province/State': None, 'Country/Region': 'Thailand', 'Lat': Decimal('15.0'), 'Long': Decimal('101.0'), 'Confirmed': None, 'Recovered': None, 'Deaths': 'None'}
----
anuveyatsu commented 4 years ago

This should be fixed now. See the `missingValues` property in the schema, in this commit: https://github.com/datasets/covid-19/commit/c150f41eea3538216d4faf8969be340210c283cd