Tests with Unicode path entries are failing

These tests are failing for me:

test_check_archiveentry_using_python_testtar
test_check_archiveentry_with_unicode_and_binary_entries_zip
test_check_archiveentry_with_unicode_entries_and_name_zip

A few details about my setup:

Mac OSX 10.13
Python 3.7.2
libarchive 3.3.3

I included the full log at the bottom of this issue.

I did a little digging. The paths render identically and are canonically equivalent in Unicode, but they are not the same sequence of code points: the tests expect a slightly different byte sequence than the one actually present in the archive. Below is a test I wrote that demonstrates the problem. (To run test_load, add it to one of the Python files in tests/ that already has json, unicodedata, join and data_dir in scope, then run py.test.)

def test_load():
    """Demonstrate why the Unicode path tests fail.

    Two UTF-8 byte sequences are compared: ``b"u\\xcc\\x88"`` ("u" followed
    by U+0308 COMBINING DIAERESIS, the decomposed spelling of "ü") and
    ``b"\\xc3\\xbc"`` (the single precomposed code point U+00FC). They
    normalize to the same string under both NFC and NFD, yet the decoded
    strings do not compare equal.

    The test then checks which spelling each data file contains. The
    assertion that the JSON fixture contains the decomposed spelling fails,
    and that failure is the point of the demonstration.

    Relies on ``unicodedata``, ``json``, ``join`` and ``data_dir`` being in
    scope in the test module this function is pasted into.
    """

    # Decomposed vs. precomposed spelling of the same character: equal after
    # Unicode normalization, but different code points and different bytes.
    good_sequence = b"u\xcc\x88" # decomposed form; found in unicode.zip
    wrong_sequence = b"\xc3\xbc" # precomposed form; found in unicode.zip.json

    good_str = good_sequence.decode("UTF-8")
    wrong_str = wrong_sequence.decode("UTF-8")

    # Unequal as raw strings, equal once both are normalized to the same form.
    assert good_str != wrong_str
    for mode in ("NFC", "NFD"):
        assert unicodedata.normalize(mode, good_str) == unicodedata.normalize(mode, wrong_str)

    # The raw archive bytes contain the decomposed form only.
    with open(join(data_dir, "unicode.zip"), "rb") as f:
        zipfile_bytes = f.read()
    assert good_sequence in zipfile_bytes
    assert wrong_sequence not in zipfile_bytes

    # The expected-values fixture contains the precomposed form instead, so
    # the first assertion below fails and the second is never reached.
    with open(join(data_dir, "unicode.zip.json"), encoding='UTF-8') as ex:
        x = json.load(ex)
    # repr() of the loaded structure, re-encoded, gives a byte string that can
    # be searched for either spelling.
    encoded_json_bytes = repr(x).encode("UTF-8")
    assert good_sequence in encoded_json_bytes
    assert wrong_sequence not in encoded_json_bytes

=================================== FAILURES ===================================
__________________________________ test_load ___________________________________

    def test_load():
        """Exhibits why the tests are busted."""

        # These byte sequences are unicode-equivalent, but not byte-for-byte
        # equivalent.
        good_sequence = b"u\xcc\x88" # present in unicode.zip
        wrong_sequence = b"\xc3\xbc" # present in unicode.zip.json

        good_str = good_sequence.decode("UTF-8")
        wrong_str = wrong_sequence.decode("UTF-8")

        assert good_str != wrong_str
        for mode in ("NFC", "NFD"):
            assert unicodedata.normalize(mode, good_str) == unicodedata.normalize(mode, wrong_str)

        # This file has the good sequence, not the bad one
        with open(join(data_dir, "unicode.zip"), "rb") as f:
            zipfile_bytes = f.read()
        assert good_sequence in zipfile_bytes
        assert wrong_sequence not in zipfile_bytes

        # Oops! This fails. The JSON has the bad sequence (not the good one)
        with open(join(data_dir, "unicode.zip.json"), encoding='UTF-8') as ex:
            x = json.load(ex)
        encoded_json_bytes = repr(x).encode("UTF-8")
>       assert good_sequence in encoded_json_bytes
E       assert b'u\xcc\x88' in b"[{'gid': 1000, 'isblk': False, 'ischr': False, 'isdev': False, 'isdir': True, 'isfifo': False, 'islnk': False, 'isre...e, 'linkpath': None, 'mode': 'rw-r--r--', 'mtime': 1268678259, 'path': 'a/gr\xc3\xbcn.png', 'size': 362, 'uid': 1000}]"

Log:

============================= test session starts ==============================
platform darwin -- Python 3.7.2, pytest-4.2.1, py-1.7.0, pluggy-0.8.1
rootdir: opensource/python-libarchive-c, inifile:
collected 29 items

tests/test_atime_mtime_ctime.py ........                                 [ 27%]
tests/test_convert.py .                                                  [ 31%]
tests/test_entry.py ..F.F.F                                              [ 55%]
tests/test_errors.py ....                                                [ 68%]
tests/test_rwx.py .......                                                [ 93%]
tests/test_security_flags.py ..                                          [100%]

=================================== FAILURES ===================================
_________________ test_check_archiveentry_using_python_testtar _________________

    def test_check_archiveentry_using_python_testtar():
>       check_entries(join(data_dir, 'testtar.tar'))

tests/test_entry.py:63: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

test_file = 'opensource/python-libarchive-c/tests/data/testtar.tar'
regen = False, ignore = []

    def check_entries(test_file, regen=False, ignore=''):
        ignore = ignore.split()
        fixture_file = test_file + '.json'
        if regen:
            entries = list(get_entries(test_file))
            with open(fixture_file, 'w', encoding='UTF-8') as ex:
                json.dump(entries, ex, indent=2, sort_keys=True)
        with open(fixture_file, encoding='UTF-8') as ex:
            expected = json.load(ex)
        actual = list(get_entries(test_file))
        for e1, e2 in zip(actual, expected):
            for key in ignore:
                e1.pop(key)
                e2.pop(key)
>           assert e1 == e2
E           AssertionError: assert {'gid': 100, ...': False, ...} == {'gid': 100, '...': False, ...}
E             Omitting 14 identical items, use -vv to show
E             Differing items:
E             {'path': 'pax/umlauts-ÄÖÜäöüß'} != {'path': 'pax/umlauts-ÄÖÜäöüß'}
E             Use -v to get the full diff

tests/test_entry.py:96: AssertionError
------------------------------ Captured log call -------------------------------
ffi.py                      88 WARNING  Pathname can't be converted from UTF-8 to current locale.
_________ test_check_archiveentry_with_unicode_and_binary_entries_zip __________

    def test_check_archiveentry_with_unicode_and_binary_entries_zip():
>       check_entries(join(data_dir, 'unicode.zip'))

tests/test_entry.py:71: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

test_file = 'opensource/python-libarchive-c/tests/data/unicode.zip'
regen = False, ignore = []

    def check_entries(test_file, regen=False, ignore=''):
        ignore = ignore.split()
        fixture_file = test_file + '.json'
        if regen:
            entries = list(get_entries(test_file))
            with open(fixture_file, 'w', encoding='UTF-8') as ex:
                json.dump(entries, ex, indent=2, sort_keys=True)
        with open(fixture_file, encoding='UTF-8') as ex:
            expected = json.load(ex)
        actual = list(get_entries(test_file))
        for e1, e2 in zip(actual, expected):
            for key in ignore:
                e1.pop(key)
                e2.pop(key)
>           assert e1 == e2
E           AssertionError: assert {'gid': 1000,...': False, ...} == {'gid': 1000, ...': False, ...}
E             Omitting 14 identical items, use -vv to show
E             Differing items:
E             {'path': 'a/grün.png'} != {'path': 'a/grün.png'}
E             Use -v to get the full diff

tests/test_entry.py:96: AssertionError
__________ test_check_archiveentry_with_unicode_entries_and_name_zip ___________

    def test_check_archiveentry_with_unicode_entries_and_name_zip():
>       check_entries(join(data_dir, '\ud504\ub85c\uadf8\ub7a8.zip'))

tests/test_entry.py:79: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

test_file = 'opensource/python-libarchive-c/tests/data/프로그램.zip'
regen = False, ignore = []

    def check_entries(test_file, regen=False, ignore=''):
        ignore = ignore.split()
        fixture_file = test_file + '.json'
        if regen:
            entries = list(get_entries(test_file))
            with open(fixture_file, 'w', encoding='UTF-8') as ex:
                json.dump(entries, ex, indent=2, sort_keys=True)
        with open(fixture_file, encoding='UTF-8') as ex:
            expected = json.load(ex)
        actual = list(get_entries(test_file))
        for e1, e2 in zip(actual, expected):
            for key in ignore:
                e1.pop(key)
                e2.pop(key)
>           assert e1 == e2
E           AssertionError: assert {'gid': 502, ...': False, ...} == {'gid': 502, '...': False, ...}
E             Omitting 14 identical items, use -vv to show
E             Differing items:
E             {'path': '프로그램.txt'} != {'path': '프로그램.txt'}
E             Use -v to get the full diff

tests/test_entry.py:96: AssertionError
=============================== warnings summary ===============================
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
  opensource/python-libarchive-c/tests/__init__.py:86: DeprecationWarning: deprecated in favor of stat.filemode
    mode = tarfile.filemode(entry.mode)[1:]

-- Docs: https://docs.pytest.org/en/latest/warnings.html
=============== 3 failed, 26 passed, 13 warnings in 0.54 seconds ===============

Changaco / python-libarchive-c

Tests with Unicode path entries are failing #81