dask / fastparquet

python implementation of the parquet columnar file format.
Apache License 2.0

To Pandas doesn't work with parquet file - Type Error #159

Closed springcoil closed 6 years ago

springcoil commented 7 years ago

Hi all, I'm loading some parquet files generated by a Spark ETL job.

I get this error when calling parquet_file.to_pandas().


AttributeError                            Traceback (most recent call last)
<ipython-input-9-7098f6946da6> in <module>()
----> 1 profiles.to_pandas()

/home/springcoil/miniconda3/envs/py35/lib/python3.5/site-packages/fastparquet/api.py in to_pandas(self, columns, categories, filters, index, timestamp96)
    332                     self.read_row_group(rg, columns, categories, infile=f,
    333                                         index=index, assign=parts,
--> 334                                         timestamp96=timestamp96)
    335                     start += rg.num_rows
    336         else:

/home/springcoil/miniconda3/envs/py35/lib/python3.5/site-packages/fastparquet/api.py in read_row_group(self, rg, columns, categories, infile, index, assign, timestamp96)
    184                 infile, rg, columns, categories, self.schema, self.cats,
    185                 self.selfmade, index=index, assign=assign,
--> 186                 timestamp96=timestamp96, sep=self.sep)
    187         if ret:
    188             return df

/home/springcoil/miniconda3/envs/py35/lib/python3.5/site-packages/fastparquet/core.py in read_row_group(file, rg, columns, categories, schema_helper, cats, selfmade, index, assign, timestamp96, sep)
    336         raise RuntimeError('Going with pre-allocation!')
    337     read_row_group_arrays(file, rg, columns, categories, schema_helper,
--> 338                           cats, selfmade, assign=assign, timestamp96=timestamp96)
    339 
    340     for cat in cats:

/home/springcoil/miniconda3/envs/py35/lib/python3.5/site-packages/fastparquet/core.py in read_row_group_arrays(file, rg, columns, categories, schema_helper, cats, selfmade, assign, timestamp96)
    313                  selfmade=selfmade, assign=out[name],
    314                  catdef=out[name+'-catdef'] if use else None,
--> 315                  timestamp96=mr)
    316 
    317         if _is_map_like(schema_helper, column):

/home/springcoil/miniconda3/envs/py35/lib/python3.5/site-packages/fastparquet/core.py in read_col(column, schema_helper, infile, use_cat, grab_dict, selfmade, assign, catdef, timestamp96)
    237             skip_nulls = False
    238         defi, rep, val = read_data_page(infile, schema_helper, ph, cmd,
--> 239                                         skip_nulls, selfmade=selfmade)
    240         if rep is not None and assign.dtype.kind != 'O':  # pragma: no cover
    241             # this should never get called

/home/springcoil/miniconda3/envs/py35/lib/python3.5/site-packages/fastparquet/core.py in read_data_page(f, helper, header, metadata, skip_nulls, selfmade)
    103                                            dtype=np.uint8))
    104 
--> 105     repetition_levels = read_rep(io_obj, daph, helper, metadata)
    106 
    107     if skip_nulls and not helper.is_required(metadata.path_in_schema):

/home/springcoil/miniconda3/envs/py35/lib/python3.5/site-packages/fastparquet/core.py in read_rep(io_obj, daph, helper, metadata)
     83             metadata.path_in_schema)
     84         bit_width = encoding.width_from_max_int(max_repetition_level)
---> 85         repetition_levels = read_data(io_obj, daph.repetition_level_encoding,
     86                                       daph.num_values,
     87                                       bit_width)[:daph.num_values]

AttributeError: 'NoneType' object has no attribute 'repetition_level_encoding'

Has anyone seen anything like this before?
martindurant commented 7 years ago

I haven't seen such an error before, I'm afraid. The error suggests that the block has no data-page, which would be very odd - I wonder if it's possible that you have blocks containing no data?

davidcorigliano commented 6 years ago

@martindurant I'm also seeing this error. I'm wondering if you have any further thoughts. Can you elaborate on what you mean by "blocks containing no data"? Appreciate the help.

Traceback (most recent call last):
  File "read_parq.py", line 31, in <module>
    df2 = pfile.to_pandas(['features'])
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/api.py", line 406, in to_pandas
    assign=parts)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/api.py", line 206, in read_row_group_file
    assign=assign, scheme=self.file_scheme)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/core.py", line 284, in read_row_group_file
    sep=sep, scheme=scheme)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/core.py", line 334, in read_row_group
    cats, selfmade, assign=assign)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/core.py", line 311, in read_row_group_arrays
    catdef=out[name+'-catdef'] if use else None)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/core.py", line 235, in read_col
    skip_nulls, selfmade=selfmade)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/core.py", line 103, in read_data_page
    repetition_levels = read_rep(io_obj, daph, helper, metadata)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/core.py", line 83, in read_rep
    repetition_levels = read_data(io_obj, daph.repetition_level_encoding,
AttributeError: 'NoneType' object has no attribute 'repetition_level_encoding'

The schema for this field is:

- spark_schema:
| - features: MAP, OPTIONAL
|   - key_value: REPEATED
|   | - key: BYTE_ARRAY, UTF8, REQUIRED
|     - value: BYTE_ARRAY, UTF8, OPTIONAL

Just getting started with fastparquet - the script itself is very simple:

import glob
import fastparquet

filelist = glob.glob('logs/*.parquet')

pfile = fastparquet.ParquetFile(filelist)

df = pfile.to_pandas(['features'])
martindurant commented 6 years ago

The traceback suggests that parsing of the thrift header of a data chunk failed; the "None" should be the data chunk header. This most likely means that the file is corrupt. How was it produced, and does it load successfully in any other parquet frameworks? Do any other columns load OK? Is this happening for only one of the files (try looping over the contents of filelist and loading each in turn)?
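
For example, a quick loop along these lines (a minimal sketch, reusing the glob from your script) should isolate which file(s) trigger the error:

import glob
import fastparquet

filelist = glob.glob('logs/*.parquet')
for fn in filelist:
    try:
        fastparquet.ParquetFile(fn).to_pandas(['features'])
    except Exception as e:
        # any file reported here fails to load on its own
        print(fn, e)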

If it turns out that you have a valid file, but fastparquet is failing to load it, then I may ask to see the file and debug from there.

davidcorigliano commented 6 years ago

Thank you for the suggestion. 3 of the ~60 files I was trying to load caused this error; after removing them, the load worked. I need to look into why they are corrupt...

Appreciate the help.

davidcorigliano commented 6 years ago

@martindurant - If you are still open to checking out the file I would appreciate it:

To summarize, there seems to be something funky about the "features" column of the "troublesome" file causing fastparquet to generate this error:

(python_env) ~$ python read_parq.py
Traceback (most recent call last):
  File "read_parq.py", line 37, in <module>
    df2 = pfile.to_pandas(['features'])
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/api.py", line 398, in to_pandas
    index=index, assign=parts)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/api.py", line 225, in read_row_group
    scheme=self.file_scheme)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/core.py", line 334, in read_row_group
    cats, selfmade, assign=assign)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/core.py", line 311, in read_row_group_arrays
    catdef=out[name+'-catdef'] if use else None)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/core.py", line 235, in read_col
    skip_nulls, selfmade=selfmade)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/core.py", line 103, in read_data_page
    repetition_levels = read_rep(io_obj, daph, helper, metadata)
  File "/Users/davidcorigliano/.virtualenvs/python_env/lib/python2.7/site-packages/fastparquet/core.py", line 83, in read_rep
    repetition_levels = read_data(io_obj, daph.repetition_level_encoding,
AttributeError: 'NoneType' object has no attribute 'repetition_level_encoding'

I'm able to read other columns from the "troublesome" file with fastparquet, and I'm also able to read in the whole thing (including the "features" field) with Spark, so I don't actually think the file is corrupt. As I mentioned in a previous comment, I'm able to read other files of the same format ("working file" attached).

I appreciate any thoughts/insights into this. Happy to provide more info if it would be helpful. Thanks.

martindurant commented 6 years ago

OK, so: there appear to be multiple dictionary pages, which is not supposed to happen, but I can deal with that. Also, the encoding is "bit-packed (deprecated)", which, as the name suggests, is not supposed to be around anymore. I can maybe code it up, since the spec is well stated, and compare the result against the ground truth given by Spark. I'll get back to you.

davidcorigliano commented 6 years ago

Hey @martindurant - Were you able to find anything here? Appreciate the insights. Thanks a lot for the help.

martindurant commented 6 years ago

Sorry, I have not managed to fix this yet. I did allow for multiple dictionaries, and that works fine, but my implementation of bit-packed reading apparently does not work: I end up at a bad byte location and seg-fault. I don't know when I'll have the chance to look into this further.

pjoneswork commented 6 years ago

I'm seeing the same stack trace:

Traceback (most recent call last):
  File "apps/asana/stats/er/spark_er/run_stats.py", line 58, in <module>
    run(args.experiment_data_bucket, args.experiment_data_key, args.output_bucket, args.output_key)
  File "apps/asana/stats/er/spark_er/run_stats.py", line 21, in run
    experiment_data["data_key"],
  File "/mnt/report-ebs/tmp/paul/ep_test/apps/asana/stats/er/spark_er/s3_helpers.py", line 40, in load_parquet_data_as_dataframe
    data_df = data.compute()
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/dask/base.py", line 99, in compute
    (result,) = compute(self, traverse=False, **kwargs)
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/dask/base.py", line 206, in compute
    results = get(dsk, keys, **kwargs)
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/dask/threaded.py", line 75, in get
    pack_exception=pack_exception, **kwargs)
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/dask/local.py", line 521, in get_async
    raise_exception(exc, tb)
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/dask/local.py", line 290, in execute_task
    result = _execute_task(task, data)
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/dask/local.py", line 271, in _execute_task
    return func(*args2)
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/dask/dataframe/io/parquet.py", line 144, in _read_parquet_row_group
    open=open, assign=views, scheme=scheme)
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/fastparquet/core.py", line 284, in read_row_group_file
    sep=sep, scheme=scheme)
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/fastparquet/core.py", line 334, in read_row_group
    cats, selfmade, assign=assign)
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/fastparquet/core.py", line 311, in read_row_group_arrays
    catdef=out[name+'-catdef'] if use else None)
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/fastparquet/core.py", line 235, in read_col
    skip_nulls, selfmade=selfmade)
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/fastparquet/core.py", line 103, in read_data_page
    repetition_levels = read_rep(io_obj, daph, helper, metadata)
  File "/mnt/report-ebs/tmp/paul/avro/local/lib/python2.7/site-packages/fastparquet/core.py", line 83, in read_rep
    repetition_levels = read_data(io_obj, daph.repetition_level_encoding,
AttributeError: 'NoneType' object has no attribute 'repetition_level_encoding'

But the column doesn't seem to be bit-packed encoded.

$ hadoop jar parquet-tools-1.8.2.jar meta --debug hdfs:///user/paul/tmp_predictors/part-00000-7e3e96bc-9b49-4904-9857-8cb17f604635-c000.snappy.parquet
17/11/14 00:37:55 INFO hadoop.ParquetFileReader: Initiating action with parallelism: 5
17/11/14 00:37:55 INFO hadoop.ParquetFileReader: reading another 1 footers
17/11/14 00:37:55 INFO hadoop.ParquetFileReader: Initiating action with parallelism: 5
file:        hdfs://ip-10-0-14-235.ec2.internal:8020/user/paul/tmp_predictors/part-00000-7e3e96bc-9b49-4904-9857-8cb17f604635-c000.snappy.parquet
creator:     parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c)
extra:       org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"predictors","type":{"type":"array","elementType":"double","containsNull":true},"nullable":true,"metadata":{}}]}

file schema: spark_schema
--------------------------------------------------------------------------------
predictors:  OPTIONAL F:1
.list:       REPEATED F:1
..element:   OPTIONAL DOUBLE R:1 D:3

row group 1: RC:1433744 TS:52617922 OFFSET:4
--------------------------------------------------------------------------------
predictors:
.list:
..element:    DOUBLE SNAPPY DO:0 FPO:4 SZ:7629620/52617922/6.90 VC:41578576 ENC:RLE,PLAIN_DICTIONARY

What can I do to help debug this issue?

martindurant commented 6 years ago

Perhaps fixed by #264?

martindurant commented 6 years ago

@davidcorigliano, @pjoneswork, @springcoil

anderl80 commented 6 years ago

I have the same problem here.

martindurant commented 6 years ago

Are you using 0.1.5?

anderl80 commented 6 years ago

Yes and python-snappy 0.5.1

anderl80 commented 6 years ago

I could isolate some columns that cause the error. Those contain arrays of integers. But other columns are also arrays of integers and they do work with toPandas().

lachinois commented 6 years ago

Oh no, I'm also using Apache Spark and unfortunately, I'm also having that exact same exception but for a list of doubles. It seems like fastparquet just cannot deal with the way Apache Spark writes arrays.

I guess I'll just have to switch to CSV.

martindurant commented 6 years ago

fastparquet does deal with some list types, so if you can produce a sample of the data, I might be able to help.

Just wondering, how do you write lists into CSV?

cmenguy commented 6 years ago

Same issue on my end with some Parquet file written by Spark, with the schema being:

root
  |-- id: string (nullable = true)
  |-- key_value: map (nullable = true)
  |    |-- key: string
  |    |-- value: float (valueContainsNull = true)

The same files can be read fine with Spark for all fields. With FastParquet, reading just the id field works fine, but as soon as I try to read key_value I get the exact same exception as above with the repetition_level_encoding.

martindurant commented 6 years ago

Previously generated files from Spark which I have look like this:

root
| - map_req_req: MAP, REQUIRED
|   - key_value: REPEATED
|   | - key: (type), REQUIRED
|     - value: (type), UTF8, REQUIRED

what do you get with fastparquet's view of your file?

pf = fastparquet.ParquetFile(...)
print(pf.schema.text)
cmenguy commented 6 years ago

Here is the schema printed via FastParquet for the file where I can only read the uuid field. If I use pf.to_pandas(["uuid"]) there is no error, but pf.to_pandas(["uuid", "topics"]) errors out with the stack trace above.

- spark_schema:
  | - uuid: BYTE_ARRAY, UTF8, OPTIONAL
    - topics: MAP, OPTIONAL
      - key_value: REPEATED
      | - key: BYTE_ARRAY, UTF8, REQUIRED
        - value: FLOAT, OPTIONAL

Versus schema for exact same file via Spark where I can read all fields fine:

root
 |-- uuid: string (nullable = true)
 |-- topics: map (nullable = true)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)

And for good measure, what I'm getting through the parquet-tools, which also seems to read all the fields fine once I use the dump command:

$ java -jar target/parquet-tools-1.9.0.jar meta --debug /path/to/my_data.snappy.parquet
file:        file:/path/to/my_data.snappy.parquet
creator:     parquet-mr version 1.8.3 (build aef7230e114214b7cc962a8f3fc5aeed6ce80828)
extra:       org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"uuid","type":"string","nullable":true,"metadata":{}},{"name":"topics","type":{"type":"map","keyType":"string","valueType":"float","valueContainsNull":true},"nullable":true,"metadata":{}}]}

file schema: spark_schema
--------------------------------------------------------------------------------
uuid:        OPTIONAL BINARY O:UTF8 R:0 D:1
topics:      OPTIONAL F:1
.key_value:  REPEATED F:2
..key:       REQUIRED BINARY O:UTF8 R:1 D:2
..value:     OPTIONAL FLOAT R:1 D:3

row group 1: RC:2424 TS:583883 OFFSET:4
--------------------------------------------------------------------------------
uuid:         BINARY SNAPPY DO:0 FPO:4 SZ:79413/101921/1.28 VC:2424 ENC:BIT_PACKED,PLAIN,RLE
topics:
.key_value:
..key:        BINARY SNAPPY DO:0 FPO:79417 SZ:198726/246964/1.24 VC:128972 ENC:PLAIN_DICTIONARY,RLE
..value:      FLOAT SNAPPY DO:0 FPO:278143 SZ:194030/234998/1.21 VC:128972 ENC:PLAIN_DICTIONARY,RLE

Let me know if I can provide anything more to help investigating this.

cmenguy commented 6 years ago

I think I found the issue @martindurant - in read_col, the assumption is made that only the first page header can be a DictionaryPageHeader; there is no further check for it in the subsequent pages of the column, so all other page headers are assumed to be DataPageHeaders.

However, it seems that in some cases you can have multiple DictionaryPageHeaders in a column. In my case, Spark generated a Parquet file where the topics column starts with a DictionaryPageHeader, followed by two DataPageHeaders, and then another DictionaryPageHeader. Since at that point everything is assumed to be a data page and all pages go through read_data_page, accessing header.data_page_header returns None because this field doesn't exist in a DictionaryPageHeader.

I'm not familiar enough with Parquet internals to say why this happens, but since other Parquet frameworks seem to handle this case fine, it looks like a bug in FastParquet. The fix probably involves just moving the check for ph.type == parquet_thrift.PageType.DICTIONARY_PAGE inside the while loop, but I'm not too sure what to do with the multiple dictionaries in that case.
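
Conceptually, the change I mean is to check the page type on every pass of the loop and, as one option, refresh the dictionary whenever a new dictionary page appears, instead of assuming every page after the first is a data page. A toy illustration of that control flow (not fastparquet's actual code):

def read_column_chunk(pages):
    # 'pages' stands in for the decoded page headers/payloads of one column chunk
    dictionary = None
    values = []
    for page in pages:
        if page['type'] == 'DICTIONARY_PAGE':
            # a dictionary page may also appear after data pages
            dictionary = page['values']
        else:
            # dictionary-encoded data page: indices into the current dictionary
            values.extend(dictionary[i] for i in page['indices'])
    return values

# a chunk laid out as dict, data, data, dict - the layout described above
pages = [
    {'type': 'DICTIONARY_PAGE', 'values': ['a', 'b']},
    {'type': 'DATA_PAGE', 'indices': [0, 1, 1]},
    {'type': 'DATA_PAGE', 'indices': [1, 0]},
    {'type': 'DICTIONARY_PAGE', 'values': ['c', 'd']},
]
print(read_column_chunk(pages))  # ['a', 'b', 'b', 'b', 'a']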

martindurant commented 6 years ago

@cmenguy , I had thought that this exact issue came up before and was fixed. Do you think it's possible to produce a test file (with randomised data) of reasonable size, with the structure you describe?
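
For example, something along these lines in PySpark would produce a file with the uuid/topics structure from randomised data (a sketch - the row count, map sizes and output path are arbitrary, and whether the writer actually emits multiple dictionary pages will depend on the data volume and writer settings):

import random
import uuid as uuidlib
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
rows = [(str(uuidlib.uuid4()),
         {"topic_%d" % i: random.random() for i in range(random.randint(0, 50))})
        for _ in range(100000)]
df = spark.createDataFrame(rows, "uuid: string, topics: map<string, float>")
df.coalesce(1).write.mode("overwrite").parquet("randomised_test.parquet", compression="snappy")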

martindurant commented 6 years ago

Perhaps https://github.com/dask/fastparquet/pull/367 solves this? I would appreciate it if you could try with that code.

cmenguy commented 6 years ago

I've tried #367, and it fails with a different error:

>>> import fastparquet
>>> pf = fastparquet.ParquetFile("/path/to/my_data.snappy.parquet")
>>> df = pf.to_pandas(["topics"])
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/lib/python2.7/site-packages/fastparquet/api.py", line 426, in to_pandas
    index=index, assign=parts)
  File "/usr/local/lib/python2.7/site-packages/fastparquet/api.py", line 258, in read_row_group
    scheme=self.file_scheme)
  File "/usr/local/lib/python2.7/site-packages/fastparquet/core.py", line 353, in read_row_group
    cats, selfmade, assign=assign)
  File "/usr/local/lib/python2.7/site-packages/fastparquet/core.py", line 330, in read_row_group_arrays
    catdef=out.get(name+'-catdef', None))
  File "/usr/local/lib/python2.7/site-packages/fastparquet/core.py", line 230, in read_col
    dic2 = np.array(read_dictionary_page(infile, schema_helper, ph, cmd))
  File "/usr/local/lib/python2.7/site-packages/fastparquet/core.py", line 160, in read_dictionary_page
    page_header.dictionary_page_header.num_values)
  File "fastparquet/speedups.pyx", line 163, in fastparquet.speedups.unpack_byte_array
RuntimeError: Ran out of input

I've added a sample file with this issue, which just contains my topics column since it's the only one having multiple dictionary pages in the column. Generated from Spark 2.3.1. multidict-page.snappy.parquet.zip

martindurant commented 6 years ago

I am looking into it, but not sure yet what to do. It seems that after reading the second dictionary page header, we are no longer at a valid data section. Interestingly, the column metadata says:

encoding_stats: [<class 'fastparquet.parquet_thrift.parquet.ttypes.PageEncodingStats'>
count: 1
encoding: 2
page_type: 2
, <class 'fastparquet.parquet_thrift.parquet.ttypes.PageEncodingStats'>
count: 59
encoding: 2
page_type: 0
]

i.e., there is exactly one "type 2" page (these are the dictionaries), not two. However, the data decompresses OK, and that wouldn't happen, I think, if it didn't look like a valid SNAPPY block.

cmenguy commented 6 years ago

I tried comparing with the output of the dump -d -n command of parquet-tools, and it seems that for the topics.key_value.key column the dictionary page, along with the following 58 data pages, is read just fine. The entire column seems to be read correctly.

But the 2nd dictionary page seems to appear right at the end of the 58 pages for topics.key_value.key. Could it be that this 2nd dictionary page is actually for the 2nd column topics.key_value.value and somehow it's being mixed up? It just seems strange that it would be added at the very end, after all the data pages.

cmenguy commented 6 years ago

Actually I'm pretty sure now that this 2nd dictionary is for the second column. num_values in this 2nd dictionary is 27765, and parquet-tools reports the same number for the 2nd column topics.key_value.value. So it's somehow being accessed in the same read_col as the 1st column.

martindurant commented 6 years ago

The second dictionary page appears to surface after 38575 values of key are read, out of a total number of 3807485. That's a pretty big number for 70399 rows (of which, I presume, some are None). Can you verify?
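
From the Spark side, something like this should give the numbers to compare against (a sketch, assuming the file still loads fine there; the path is the one from your earlier comment):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("/path/to/my_data.snappy.parquet")
print(df.count())                              # number of rows
print(df.select(F.explode("topics")).count())  # total number of key/value pairs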

cmenguy commented 6 years ago

@martindurant I messed around with the code a bit and think I found the root cause - could you take a look at #368 and let me know your thoughts? I was able to read the file fine with this fix, and the number of rows matches.

cmenguy commented 6 years ago

Actually, after 1289 rows are read correctly, everything else is null, so most likely some pages are not being read - but at least it's not throwing an exception. It's probably a matter of tuning how num gets computed and the break condition.

cmenguy commented 6 years ago

For anyone hitting the same bug described in this thread: it should be fixed by #367 and #368. Please report here if you find more issues due to how Spark writes files, but the specific error described in this issue should be fixed.

martindurant commented 6 years ago

So we can probably close this?

tao-cao commented 5 years ago

I still have the exact same problem while reading a parquet file (4.3 GB) saved from Spark. Any thoughts on how to fix this issue?

martindurant commented 5 years ago

@tao-cao , are you using the master version of fastparquet? A release should happen in the next few days. If yes, then (as usual) the specifics of your schema will be important.

tao-cao commented 5 years ago

Thanks! I'm using the pip-installed version 0.1.6; is master on git newer than 0.1.6?

martindurant commented 5 years ago

Yes, the fixes mentioned above are more recent. You would need to install directly from git to test (below), or wait for the release:

pip install git+https://github.com/dask/fastparquet
tao-cao commented 5 years ago

@tao-cao , are you using the master version of fastparquet? A release should happen in the next few days. If yes, then (as usual) the specifics of your schema will be important.

@martindurant To give you more info: the whole file has more than 5M rows and about 10 columns. If I save 100 rows as a test dataset, it works fine, but I get the error on the whole file.

spektom commented 5 years ago

We are getting the same error with version 0.2.0 installed using Conda.

/opt/conda/lib/python3.6/site-packages/fastparquet/api.py in to_pandas(self, columns, categories, filters, index)
    432                          for (name, v) in views.items()}
    433                 self.read_row_group_file(rg, columns, categories, index,
--> 434                                          assign=parts)
    435                 start += rg.num_rows
    436         return df

/opt/conda/lib/python3.6/site-packages/fastparquet/api.py in read_row_group_file(self, rg, columns, categories, index, assign)
    238                 fn, rg, columns, categories, self.schema, self.cats,
    239                 open=self.open, selfmade=self.selfmade, index=index,
--> 240                 assign=assign, scheme=self.file_scheme)
    241         if ret:
    242             return df

/opt/conda/lib/python3.6/site-packages/fastparquet/core.py in read_row_group_file(fn, rg, columns, categories, schema_helper, cats, open, selfmade, index, assign, scheme)
    293         return read_row_group(f, rg, columns, categories, schema_helper, cats,
    294                               selfmade=selfmade, index=index, assign=assign,
--> 295                               scheme=scheme)
    296 
    297 

/opt/conda/lib/python3.6/site-packages/fastparquet/core.py in read_row_group(file, rg, columns, categories, schema_helper, cats, selfmade, index, assign, scheme)
    342         raise RuntimeError('Going with pre-allocation!')
    343     read_row_group_arrays(file, rg, columns, categories, schema_helper,
--> 344                           cats, selfmade, assign=assign)
    345 
    346     for cat in cats:

/opt/conda/lib/python3.6/site-packages/fastparquet/core.py in read_row_group_arrays(file, rg, columns, categories, schema_helper, cats, selfmade, assign)
    319         read_col(column, schema_helper, file, use_cat=name+'-catdef' in out,
    320                  selfmade=selfmade, assign=out[name],
--> 321                  catdef=out.get(name+'-catdef', None))
    322 
    323         if _is_map_like(schema_helper, column):

/opt/conda/lib/python3.6/site-packages/fastparquet/core.py in read_col(column, schema_helper, infile, use_cat, grab_dict, selfmade, assign, catdef)
    233             skip_nulls = False
    234         defi, rep, val = read_data_page(infile, schema_helper, ph, cmd,
--> 235                                         skip_nulls, selfmade=selfmade)
    236         if rep is not None and assign.dtype.kind != 'O':  # pragma: no cover
    237             # this should never get called

/opt/conda/lib/python3.6/site-packages/fastparquet/core.py in read_data_page(f, helper, header, metadata, skip_nulls, selfmade)
    101                                            dtype=np.uint8))
    102 
--> 103     repetition_levels = read_rep(io_obj, daph, helper, metadata)
    104 
    105     if skip_nulls and not helper.is_required(metadata.path_in_schema):

/opt/conda/lib/python3.6/site-packages/fastparquet/core.py in read_rep(io_obj, daph, helper, metadata)
     81         else:
     82             bit_width = encoding.width_from_max_int(max_repetition_level)
---> 83             repetition_levels = read_data(io_obj, daph.repetition_level_encoding,
     84                                           daph.num_values,
     85                                           bit_width)[:daph.num_values]

AttributeError: 'NoneType' object has no attribute 'repetition_level_encoding'
martindurant commented 5 years ago

@spektom, did you try with the master version as directed above, or try to change things on the Spark side (see the discussion)?

spektom commented 5 years ago

@martindurant The version installed from git master works! Thanks.

martindurant commented 5 years ago

Excellent; I will schedule a release for when I return from vacation.