aliyun / aliyun-odps-python-sdk

ODPS Python SDK and data analysis framework
http://pyodps.readthedocs.io
Apache License 2.0
434 stars 97 forks source link

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb2 in position 23: invalid start byte — 这里能不能加个ignore的选项 (Could an option to ignore decode errors be added here?) #253

Closed pengwork closed 1 week ago

pengwork commented 1 week ago
---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
Cell In[23], line 23
     20 n_process = multiprocessing.cpu_count()
     21 with o.execute_sql(SQL).open_reader(tunnel=True) as reader:
     22     # n_process should be number of processes to use
---> 23     pd_df = reader.to_pandas(n_process=n_process)
     24 data=pd_df
     25 data.columns

File /home/pai/lib/python3.9/site-packages/odps/models/readers.py:170, in TunnelRecordReader.to_pandas(self, start, count, columns, n_process)
    166     return super(TunnelRecordReader, self).to_pandas(
    167         start=start, count=count, columns=columns
    168     )
    169 else:
--> 170     return self._to_pandas_with_processes(
    171         start=start, count=count, columns=columns, n_process=n_process
    172     )

File /home/pai/lib/python3.9/site-packages/odps/models/readers.py:60, in TunnelReaderMixin._to_pandas_with_processes(self, start, count, columns, n_process)
     58 if any(not d[2] for d in splits):
     59     exc_info = next(d[1] for d in splits if not d[2])
---> 60     six.reraise(*exc_info)
     61 return pd.concat([d[1] for d in splits]).reset_index(drop=True)

File /home/pai/lib/python3.9/site-packages/odps/lib/six.py:703, in reraise(tp, value, tb)
    701     if value.__traceback__ is not tb:
    702         raise value.with_traceback(tb)
--> 703     raise value
    704 finally:
    705     value = None

File /home/pai/lib/python3.9/site-packages/odps/models/instance.py:79, in _read_instance_split()
     75     session = instance_tunnel.create_download_session(
     76         instance=instance_id, download_id=download_id
     77     )
     78     with session.open_record_reader(start, count, columns=columns) as reader:
---> 79         conn.send((idx, reader.to_pandas(), True))
     80 except:
     81     conn.send((idx, sys.exc_info(), False))

File /home/pai/lib/python3.9/site-packages/odps/readers.py:202, in to_pandas()
    199 def to_pandas(self, start=None, count=None, **kw):
    200     import pandas  # noqa: F401
--> 202     return self.to_result_frame(start=start, count=count, **kw).values

File /home/pai/lib/python3.9/site-packages/odps/readers.py:178, in to_result_frame()
    176 offset_iter = itertools.cycle(compat.irange(read_row_batch_size))
    177 data = [None] * read_row_batch_size
--> 178 for offset, rec in zip(
    179     offset_iter, self._iter(start=start, end=end, **iter_kw)
    180 ):
    181     data[offset] = rec
    182     if offset != read_row_batch_size - 1:

File /home/pai/lib/python3.9/site-packages/odps/readers.py:115, in _iter()
    113 for i in range(step):
    114     try:
--> 115         record = next(self)
    116     except StopIteration:
    117         return

File /home/pai/lib/python3.9/site-packages/odps/tunnel/io/reader_c.pyx:459, in odps.tunnel.io.reader_c.TunnelRecordReader.__next__()
    457 class TunnelRecordReader(BaseTunnelRecordReader, AbstractRecordReader):
    458     def __next__(self):
--> 459         record = self.read()
    460         if record is None:
    461             raise StopIteration

File /home/pai/lib/python3.9/site-packages/odps/tunnel/io/reader_c.pyx:358, in odps.tunnel.io.reader_c.BaseTunnelRecordReader.read()
    356     return 0
    357 
--> 358 cpdef read(self):
    359     cdef:
    360         int index

File /home/pai/lib/python3.9/site-packages/odps/tunnel/io/reader_c.pyx:413, in odps.tunnel.io.reader_c.BaseTunnelRecordReader.read()
    411 i = index - 1
    412 if self._column_setters[i] != NULL:
--> 413     self._column_setters[i](self, rec_list, i)
    414 else:
    415     data_type = self._schema_snapshot._col_types[i]

File /home/pai/lib/python3.9/site-packages/odps/tunnel/io/reader_c.pyx:303, in odps.tunnel.io.reader_c.BaseTunnelRecordReader._set_string()
    301 cdef int _set_string(self, list record, int i) except? -1:
    302     cdef object val = self._read_string()
--> 303     self._set_record_list_value(record, i, val)
    304     return 0
    305 

File /home/pai/lib/python3.9/site-packages/odps/tunnel/io/reader_c.pyx:298, in odps.tunnel.io.reader_c.BaseTunnelRecordReader._set_record_list_value()
    296 
    297     cdef int _set_record_list_value(self, list record, int i, object value) except? -1:
--> 298         record[i] = self._schema_snapshot.validate_value(i, value, MAX_READ_SIZE_LIMIT)
    299         return 0
    300 

File /home/pai/lib/python3.9/site-packages/odps/src/types_c.pyx:280, in odps.src.types_c.SchemaSnapshot.validate_value()
    278 if vfun != NULL:
    279     try:
--> 280         return vfun(val, max_field_size)
    281     except TypeError:
    282         pass

File /home/pai/lib/python3.9/site-packages/odps/src/types_c.pyx:68, in odps.src.types_c._validate_string()
     66 if isinstance(val, bytes):
     67     s_size = len(<bytes> val)
---> 68     u_val = (<bytes> val).decode('utf-8')
     69 elif isinstance(val, unicode):
     70     u_val = <unicode> val

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb2 in position 23: invalid start byte
wjsi commented 1 week ago

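Set `options.tunnel.string_as_binary = True` (with `from odps import options`) before opening the reader. String columns are then returned as raw bytes instead of being decoded as UTF-8, so the read no longer fails on invalid bytes, and you can decode the values yourself, for example with `value.decode("utf-8", errors="ignore")`.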