heavyai / pymapd

Python client for OmniSci GPU-accelerated SQL engine and analytics platform
https://pymapd.readthedocs.io/en/latest/
Apache License 2.0
111 stars 50 forks source link

UnicodeDecodeError when running simple SQL query #345

Open kcpevey opened 3 years ago

kcpevey commented 3 years ago

I'm getting an error when I'm trying to run a simple SQL query:

import pymapd
client = pymapd.connect(user="mapd", password="HyperInteractive", host="metis.mapd.com", dbname="mapd", port=443, protocol='https')
sql = """
SELECT *
FROM demo_vote_clean
LIMIT 1000;
"""
client.execute(sql)

The first time I tried to run this, it worked just fine, but then I reran and got this error:

---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-54-43ff97a5c562> in <module>
----> 1 client.execute(sql)

~/.conda/envs/omni_hv/lib/python3.7/site-packages/pymapd/connection.py in execute(self, operation, parameters)
    388         """
    389         c = Cursor(self)
--> 390         return c.execute(operation, parameters=parameters)
    391 
    392     def cursor(self):

~/.conda/envs/omni_hv/lib/python3.7/site-packages/pymapd/cursor.py in execute(self, operation, parameters)
    116                 nonce=None,
    117                 first_n=-1,
--> 118                 at_most_n=-1,
    119             )
    120         except T.TOmniSciException as e:

~/.conda/envs/omni_hv/lib/python3.7/site-packages/omnisci/thrift/OmniSci.py in sql_execute(self, session, query, column_format, nonce, first_n, at_most_n)
   1743         """
   1744         self.send_sql_execute(session, query, column_format, nonce, first_n, at_most_n)
-> 1745         return self.recv_sql_execute()
   1746 
   1747     def send_sql_execute(self, session, query, column_format, nonce, first_n, at_most_n):

~/.conda/envs/omni_hv/lib/python3.7/site-packages/omnisci/thrift/OmniSci.py in recv_sql_execute(self)
   1767             raise x
   1768         result = sql_execute_result()
-> 1769         result.read(iprot)
   1770         iprot.readMessageEnd()
   1771         if result.success is not None:

~/.conda/envs/omni_hv/lib/python3.7/site-packages/omnisci/thrift/OmniSci.py in read(self, iprot)
   9827                 if ftype == TType.STRUCT:
   9828                     self.success = TQueryResult()
-> 9829                     self.success.read(iprot)
   9830                 else:
   9831                     iprot.skip(ftype)

~/.conda/envs/omni_hv/lib/python3.7/site-packages/omnisci/thrift/ttypes.py in read(self, iprot)
   1228                 if ftype == TType.STRUCT:
   1229                     self.row_set = TRowSet()
-> 1230                     self.row_set.read(iprot)
   1231                 else:
   1232                     iprot.skip(ftype)

~/.conda/envs/omni_hv/lib/python3.7/site-packages/omnisci/thrift/ttypes.py in read(self, iprot)
   1132                     for _i79 in range(_size75):
   1133                         _elem80 = TColumn()
-> 1134                         _elem80.read(iprot)
   1135                         self.columns.append(_elem80)
   1136                     iprot.readListEnd()

~/.conda/envs/omni_hv/lib/python3.7/site-packages/omnisci/thrift/ttypes.py in read(self, iprot)
    770                 if ftype == TType.STRUCT:
    771                     self.data = TColumnData()
--> 772                     self.data.read(iprot)
    773                 else:
    774                     iprot.skip(ftype)

~/.conda/envs/omni_hv/lib/python3.7/site-packages/omnisci/thrift/ttypes.py in read(self, iprot)
    672                     (_etype29, _size26) = iprot.readListBegin()
    673                     for _i30 in range(_size26):
--> 674                         _elem31 = iprot.readString().decode('utf-8') if sys.version_info[0] == 2 else iprot.readString()
    675                         self.str_col.append(_elem31)
    676                     iprot.readListEnd()

~/.conda/envs/omni_hv/lib/python3.7/site-packages/thrift/protocol/TJSONProtocol.py in readString(self)
    487 
    488     def readString(self):
--> 489         return self.readJSONString(False)
    490 
    491     def readBinary(self):

~/.conda/envs/omni_hv/lib/python3.7/site-packages/thrift/protocol/TJSONProtocol.py in readJSONString(self, skipContext)
    322                 while ord(self.reader.peek()) >= 0x80:
    323                     utf8_bytes.append(ord(self.reader.read()))
--> 324                 character = utf8_bytes.decode('utf8')
    325             string.append(character)
    326 

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb1 in position 1: invalid start byte

At some point I changed the LIMIT from 1000 to 500; again it worked once and then never again.

Even if I restart my kernel, I can't get it to work again.

xmnlab commented 3 years ago

It is failing to return the following data:

1. Probably Doña Ana County

string: ['D'] , utf8_bytes: bytearray(b'o\xf1')
********************************************************************************
['D', 'o', 'a', ' ', 'A', 'n', 'a', ' ', 'C', 'o', 'u', 'n', 't', 'y']

2. 

string: ['L', 'o', 'p'] , utf8_bytes: bytearray(b'e\xb1')
********************************************************************************
['L', 'o', 'p', 'e', 'o', ' ', 'C', 'D', 'P']

3. 

string: ['C'] , utf8_bytes: bytearray(b'a\xb1')
********************************************************************************
['C', 'a', 'o', 'n', ' ', 'C', 'D', 'P']

where \xb1 would be U+00B1 PLUS-MINUS SIGN (https://charbase.com/00b1-unicode-plus-minus-sign), and \xf1 would be U+00F1 LATIN SMALL LETTER N WITH TILDE (https://charbase.com/00f1-unicode-latin-small-letter-n-with-tilde)