Closed pengwork closed 1 week ago
---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
Cell In[23], line 23
     20 n_process = multiprocessing.cpu_count()
     21 with o.execute_sql(SQL).open_reader(tunnel=True) as reader:
     22 # n_process should be number of processes to use
---> 23 pd_df = reader.to_pandas(n_process=n_process)
     24 data=pd_df
     25 data.columns

File /home/pai/lib/python3.9/site-packages/odps/models/readers.py:170, in TunnelRecordReader.to_pandas(self, start, count, columns, n_process)
    166 return super(TunnelRecordReader, self).to_pandas(
    167 start=start, count=count, columns=columns
    168 )
    169 else:
--> 170 return self._to_pandas_with_processes(
    171 start=start, count=count, columns=columns, n_process=n_process
    172 )

File /home/pai/lib/python3.9/site-packages/odps/models/readers.py:60, in TunnelReaderMixin._to_pandas_with_processes(self, start, count, columns, n_process)
     58 if any(not d[2] for d in splits):
     59 exc_info = next(d[1] for d in splits if not d[2])
---> 60 six.reraise(*exc_info)
     61 return pd.concat([d[1] for d in splits]).reset_index(drop=True)

File /home/pai/lib/python3.9/site-packages/odps/lib/six.py:703, in reraise(tp, value, tb)
    701 if value.__traceback__ is not tb:
    702 raise value.with_traceback(tb)
--> 703 raise value
    704 finally:
    705 value = None

File /home/pai/lib/python3.9/site-packages/odps/models/instance.py:79, in _read_instance_split()
     75 session = instance_tunnel.create_download_session(
     76 instance=instance_id, download_id=download_id
     77 )
     78 with session.open_record_reader(start, count, columns=columns) as reader:
---> 79 conn.send((idx, reader.to_pandas(), True))
     80 except:
     81 conn.send((idx, sys.exc_info(), False))

File /home/pai/lib/python3.9/site-packages/odps/readers.py:202, in to_pandas()
    199 def to_pandas(self, start=None, count=None, **kw):
    200 import pandas # noqa: F401
--> 202 return self.to_result_frame(start=start, count=count, **kw).values

File /home/pai/lib/python3.9/site-packages/odps/readers.py:178, in to_result_frame()
    176 offset_iter = itertools.cycle(compat.irange(read_row_batch_size))
    177 data = [None] * read_row_batch_size
--> 178 for offset, rec in zip(
    179 offset_iter, self._iter(start=start, end=end, **iter_kw)
    180 ):
    181 data[offset] = rec
    182 if offset != read_row_batch_size - 1:

File /home/pai/lib/python3.9/site-packages/odps/readers.py:115, in _iter()
    113 for i in range(step):
    114 try:
--> 115 record = next(self)
    116 except StopIteration:
    117 return

File /home/pai/lib/python3.9/site-packages/odps/tunnel/io/reader_c.pyx:459, in odps.tunnel.io.reader_c.TunnelRecordReader.__next__()
    457 class TunnelRecordReader(BaseTunnelRecordReader, AbstractRecordReader):
    458 def __next__(self):
--> 459 record = self.read()
    460 if record is None:
    461 raise StopIteration

File /home/pai/lib/python3.9/site-packages/odps/tunnel/io/reader_c.pyx:358, in odps.tunnel.io.reader_c.BaseTunnelRecordReader.read()
    356 return 0
    357
--> 358 cpdef read(self):
    359 cdef:
    360 int index

File /home/pai/lib/python3.9/site-packages/odps/tunnel/io/reader_c.pyx:413, in odps.tunnel.io.reader_c.BaseTunnelRecordReader.read()
    411 i = index - 1
    412 if self._column_setters[i] != NULL:
--> 413 self._column_setters[i](self, rec_list, i)
    414 else:
    415 data_type = self._schema_snapshot._col_types[i]

File /home/pai/lib/python3.9/site-packages/odps/tunnel/io/reader_c.pyx:303, in odps.tunnel.io.reader_c.BaseTunnelRecordReader._set_string()
    301 cdef int _set_string(self, list record, int i) except? -1:
    302 cdef object val = self._read_string()
--> 303 self._set_record_list_value(record, i, val)
    304 return 0
    305

File /home/pai/lib/python3.9/site-packages/odps/tunnel/io/reader_c.pyx:298, in odps.tunnel.io.reader_c.BaseTunnelRecordReader._set_record_list_value()
    296
    297 cdef int _set_record_list_value(self, list record, int i, object value) except? -1:
--> 298 record[i] = self._schema_snapshot.validate_value(i, value, MAX_READ_SIZE_LIMIT)
    299 return 0
    300

File /home/pai/lib/python3.9/site-packages/odps/src/types_c.pyx:280, in odps.src.types_c.SchemaSnapshot.validate_value()
    278 if vfun != NULL:
    279 try:
--> 280 return vfun(val, max_field_size)
    281 except TypeError:
    282 pass

File /home/pai/lib/python3.9/site-packages/odps/src/types_c.pyx:68, in odps.src.types_c._validate_string()
     66 if isinstance(val, bytes):
     67 s_size = len(<bytes> val)
---> 68 u_val = (<bytes> val).decode('utf-8')
     69 elif isinstance(val, unicode):
     70 u_val = <unicode> val

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb2 in position 23: invalid start byte
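解决方法:在调用 open_reader / to_pandas 读取数据之前,先设置 options.tunnel.string_as_binary = True。设置后,string 类型的列会以 bytes 返回,不再按 UTF-8 解码,因此不会触发上面的 UnicodeDecodeError;需要文本时,再按数据的实际编码自行解码。
from odps import options
options.tunnel.string_as_binary = True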