Getting different behavior between windows and linux with egde case string conversion

hb2638 commented 2 years ago

Please first make sure you have looked at:

Documentation: https://github.com/mkleehammer/pyodbc/wiki
Other issues

Environment

To diagnose, we usually need to know the following, including version numbers. On Windows, be sure to specify 32-bit Python or 64-bit:

Python: 3.9 64 bit
pyodbc: 4.0.32
OS Windows: Windows 10 Pro 10.0.19044.1586
OS LInux: Debian GNU/Linux 10 (buster) Docker Image
DB: Microsoft SQL Server 2019
driver Windows: ODBC Driver 17 for SQL Server 2017.177.02.01 ( Microsoft ODBC Driver for SQL Server Version 17.07.0002 )
driver Linux: msodbcsql17/buster,now 17.9.1.1-1 amd64

Issue

All the tests below pass on windows but test_bytes_to_str and test_str fail on Linux and I can't explain why. Do you have any ideas why it's failing on Linux but not Windows? Is it something in Python? PyOdbc? The OS? The driver? A lot of moving parts :^(

I'm hoping for a consistent behavior between the two... And I know... VARCHAR is not unicode.

import unittest
import pyodbc
import sys
CONN_STR = None

#EXPECTED = r"\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ `‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
EXPECTED = bytes.fromhex('5c22232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e2060c281e2809ac692e2809ee280a6e280a0e280a1cb86e280b0c5a0e280b9c592c28dc5bdc28fc290e28098e28099e2809ce2809de280a2e28093e28094cb9ce284a2c5a1e280bac593c29dc5bec5b820c2a1c2a2c2a3c2a4c2a5c2a6c2a7c2a8c2a9c2aac2abc2acc2adc2aec2afc2b0c2b1c2b2c2b3c2b4c2b5c2b6c2b7c2b8c2b9c2bac2bbc2bcc2bdc2bec2bfc380c381c382c383c384c385c386c387c388c389c38ac38bc38cc38dc38ec38fc390c391c392c393c394c395c396c397c398c399c39ac39bc39cc39dc39ec39fc3a0c3a1c3a2c3a3c3a4c3a5c3a6c3a7c3a8c3a9c3aac3abc3acc3adc3aec3afc3b0c3b1c3b2c3b3c3b4c3b5c3b6c3b7c3b8c3b9c3bac3bbc3bcc3bdc3bec3bf').decode()

class PyodbcEncodingTestCase(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.maxDiff = None

    def setUp(self) -> None:
        super().setUp()
        self.conn = pyodbc.connect(CONN_STR)
        self.cursor = self.conn.cursor()

    def tearDown(self) -> None:
        super().tearDown()
        self.cursor.close()
        self.conn.close()

    def test_bytes_to_str(self):
        """ This fails in linux """
        row = self.cursor.execute("DECLARE @OUTPUT VARCHAR(MAX)= CONCAT(?, '' COLLATE SQL_Latin1_General_CP1_CI_AS);SELECT @OUTPUT;", [EXPECTED.encode()]).fetchone()
        actual = row[0]
        expected_windows = '\\"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ `Â\x81â€šÆ’â€žâ€¦â€\xa0â€¡Ë†â€°Å\xa0â€¹Å’Â\x8dÅ½Â\x8fÂ\x90â€˜â€™â€œâ€\x9dâ€¢â€“â€”Ëœâ„¢Å¡â€ºÅ“Â\x9dÅ¾Å¸ Â¡Â¢Â£Â¤Â¥Â¦Â§Â¨Â©ÂªÂ«Â¬Â\xadÂ®Â¯Â°Â±Â²Â³Â´ÂµÂ¶Â·Â¸Â¹ÂºÂ»Â¼Â½Â¾Â¿Ã€Ã\x81Ã‚ÃƒÃ„Ã…Ã†Ã‡ÃˆÃ‰ÃŠÃ‹ÃŒÃ\x8dÃŽÃ\x8fÃ\x90Ã‘Ã’Ã“Ã”Ã•Ã–Ã—Ã˜Ã™ÃšÃ›ÃœÃ\x9dÃžÃŸÃ\xa0Ã¡Ã¢Ã£Ã¤Ã¥Ã¦Ã§Ã¨Ã©ÃªÃ«Ã¬Ã\xadÃ®Ã¯Ã°Ã±Ã²Ã³Ã´ÃµÃ¶Ã·Ã¸Ã¹ÃºÃ»Ã¼Ã½Ã¾Ã¿'
        self.assertEqual(expected_windows, actual)

    def test_str(self):
        """ This fails in linux """
        row = self.cursor.execute("DECLARE @INPUT VARCHAR(MAX)=?;SELECT CONCAT(@INPUT, '' COLLATE SQL_Latin1_General_CP1_CI_AS);", [EXPECTED]).fetchone()
        actual = row[0]
        self.assertEqual(EXPECTED, actual)

    def test_curr_db_master(self):
        row = self.cursor.execute("SELECT DB_NAME(DB_ID())").fetchone()
        actual = row[0]
        expected = "master"
        self.assertEqual(expected, actual)

    def test_hex(self):
        actual = '5c22232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e2060c281e2809ac692e2809ee280a6e280a0e280a1cb86e280b0c5a0e280b9c592c28dc5bdc28fc290e28098e28099e2809ce2809de280a2e28093e28094cb9ce284a2c5a1e280bac593c29dc5bec5b820c2a1c2a2c2a3c2a4c2a5c2a6c2a7c2a8c2a9c2aac2abc2acc2adc2aec2afc2b0c2b1c2b2c2b3c2b4c2b5c2b6c2b7c2b8c2b9c2bac2bbc2bcc2bdc2bec2bfc380c381c382c383c384c385c386c387c388c389c38ac38bc38cc38dc38ec38fc390c391c392c393c394c395c396c397c398c399c39ac39bc39cc39dc39ec39fc3a0c3a1c3a2c3a3c3a4c3a5c3a6c3a7c3a8c3a9c3aac3abc3acc3adc3aec3afc3b0c3b1c3b2c3b3c3b4c3b5c3b6c3b7c3b8c3b9c3bac3bbc3bcc3bdc3bec3bf'
        self.assertEqual(EXPECTED.encode().hex(), actual)

    def test_sys_default_encoding(self):
        self.assertEqual(sys.getdefaultencoding(), "utf-8")

    def test_input(self):
        row = self.cursor.execute("DECLARE @INPUT VARCHAR(MAX)=?;SELECT CAST(@INPUT AS VARBINARY(MAX))", [EXPECTED]).fetchone()
        actual = row[0]
        expected_windows = '5c22232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e20608182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f20a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebfc0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedfe0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff'
        self.assertEqual(expected_windows, actual.hex())

    def test_output(self):
        row = self.cursor.execute("SELECT CAST(? AS VARBINARY(MAX))", [EXPECTED]).fetchone()
        actual = row[0]
        expected_windows = '5c00220023002400250026002700280029002a002b002c002d002e002f0030003100320033003400350036003700380039003a003b003c003d003e003f0040004100420043004400450046004700480049004a004b004c004d004e004f0050005100520053005400550056005700580059005a005b005c005d005e005f0060006100620063006400650066006700680069006a006b006c006d006e006f0070007100720073007400750076007700780079007a007b007c007d007e002000600081001a2092011e20262020202120c60230206001392052018d007d018f009000182019201c201d20222013201420dc02222161013a2053019d007e0178012000a100a200a300a400a500a600a700a800a900aa00ab00ac00ad00ae00af00b000b100b200b300b400b500b600b700b800b900ba00bb00bc00bd00be00bf00c000c100c200c300c400c500c600c700c800c900ca00cb00cc00cd00ce00cf00d000d100d200d300d400d500d600d700d800d900da00db00dc00dd00de00df00e000e100e200e300e400e500e600e700e800e900ea00eb00ec00ed00ee00ef00f000f100f200f300f400f500f600f700f800f900fa00fb00fc00fd00fe00ff00'
        self.assertEqual(expected_windows, actual.hex())

    def test_bytes(self):
        row = self.cursor.execute("DECLARE @OUTPUT VARBINARY(MAX)=?;SELECT @OUTPUT;", [EXPECTED.encode()]).fetchone()
        actual = row[0]
        self.assertEqual(EXPECTED.encode(), actual)

if __name__ == '__main__':
    unittest.main()

gordthompson commented 2 years ago

I can replicate the two test failures on Linux (Xubuntu 20.04). FWIW, those tests pass if I change

row = self.cursor.execute("DECLARE @OUTPUT VARCHAR(MAX)= …

to

row = self.cursor.execute("DECLARE @OUTPUT NVARCHAR(MAX)= …

@hb2638 - If you make that change on Windows does it cause those tests to start failing?

v-chojas commented 2 years ago

On Windows the narrow charset is usually CP1252 or similar (depends on the language etc.) but on Linux it's usually UTF-8. That may explain the differences you're seeing.

hb2638 commented 2 years ago

On Windows the narrow charset is usually CP1252 or similar (depends on the language etc.) but on Linux it's usually UTF-8. That may explain the differences you're seeing.

I guess the issue is in the Microsoft SQL ODBC driver.. That's the only thing that's different between Linux and Windows. I couldn't find anything specific to Windows in the pyodbc code.

Do you know if I can change the encoding to latin1 iso-8859-1 in pyodbc? I see there's a setdecoding and setencoding but I can't find any documentation on it.

v-chojas commented 2 years ago

There is unlikely to be an issue. The driver uses the environment encoding; this is the same behaviour as on Windows. You can change the environment encoding in Linux, but you may need to install CP1252 first - see https://ereimer.net/programs/charsets-cp1252-utf8.htm for more information.

hb2638 commented 2 years ago

I can replicate the two test failures on Linux (Xubuntu 20.04). FWIW, those tests pass if I change
row = self.cursor.execute("DECLARE @OUTPUT VARCHAR(MAX)= …
to
row = self.cursor.execute("DECLARE @OUTPUT NVARCHAR(MAX)= …
@hb2638 - If you make that change on Windows does it cause those tests to start failing?

The NVARCHAR works on linux for me too.

hb2638 commented 2 years ago

There is unlikely to be an issue. The driver uses the environment encoding; this is the same behaviour as on Windows. You can change the environment encoding in Linux, but you may need to install CP1252 first - see https://ereimer.net/programs/charsets-cp1252-utf8.htm for more information.

Thx! I'll give that a try. I'll close this because I feel more confident that the issue is in the driver.

hb2638 commented 2 years ago

I was able to get the localce CP1252 locale installed but I still couldn't get the strings to tie out, so I ended up using "FOR JSON" to make SQL do the UTF8 conversion before sending it over the wire

    def test_output_json(self):
        row = self.cursor.execute("DECLARE @OUTPUT VARCHAR(MAX)= CONCAT(?, '' COLLATE SQL_Latin1_General_CP1_CI_AS);SELECT * FROM (SELECT @OUTPUT AS OUTPUT) TMP FOR JSON AUTO", [EXPECTED]).fetchone()
        actual = json.loads(row[0])[0]["OUTPUT"]
        expected_windows = '5c22232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e2060c281e2809ac692e2809ee280a6e280a0e280a1cb86e280b0c5a0e280b9c592c28dc5bdc28fc290e28098e28099e2809ce2809de280a2e28093e28094cb9ce284a2c5a1e280bac593c29dc5bec5b820c2a1c2a2c2a3c2a4c2a5c2a6c2a7c2a8c2a9c2aac2abc2acc2adc2aec2afc2b0c2b1c2b2c2b3c2b4c2b5c2b6c2b7c2b8c2b9c2bac2bbc2bcc2bdc2bec2bfc380c381c382c383c384c385c386c387c388c389c38ac38bc38cc38dc38ec38fc390c391c392c393c394c395c396c397c398c399c39ac39bc39cc39dc39ec39fc3a0c3a1c3a2c3a3c3a4c3a5c3a6c3a7c3a8c3a9c3aac3abc3acc3adc3aec3afc3b0c3b1c3b2c3b3c3b4c3b5c3b6c3b7c3b8c3b9c3bac3bbc3bcc3bdc3bec3bf'
        self.assertEqual(expected_windows, actual.encode().hex())

gordthompson commented 2 years ago

I can replicate the two test failures on Linux (Xubuntu 20.04). FWIW, those tests pass if I change
row = self.cursor.execute("DECLARE @OUTPUT VARCHAR(MAX)= …
to
row = self.cursor.execute("DECLARE @OUTPUT NVARCHAR(MAX)= …
@hb2638 - If you make that change on Windows does it cause those tests to start failing?

For the record, the original tests as modified above also passed on Windows.

hb2638 commented 2 years ago

Hi @gordthompson Switching from VARCHAR to NVARCHAR works in both OSes.

mkleehammer / pyodbc

Getting different behavior between windows and linux with egde case string conversion #1042

Environment

Issue