I included the full log at the bottom of this issue.
I did a little digging, and while the paths render equivalently and are unicode-equivalent, the tests are actually looking for a slightly different byte sequence than the one present in the file. Below is a test I wrote that demonstrates the problem. (To run test_load, add it to one of the python files in tests/ and run py.test.)
def test_load():
    """Exhibits why the tests are busted.

    The archive stores the u-umlaut as NFD (``u`` + combining diaeresis,
    b"u\\xcc\\x88") while the JSON fixture stores it as NFC (precomposed
    b"\\xc3\\xbc").  The two render identically and compare equal after
    unicode normalization, but are different byte sequences, so a
    byte-for-byte comparison of entry paths fails.
    """
    # These byte sequences are unicode-equivalent, but not byte-for-byte
    # equivalent.
    good_sequence = b"u\xcc\x88"   # NFD form, present in unicode.zip
    wrong_sequence = b"\xc3\xbc"   # NFC form, present in unicode.zip.json
    good_str = good_sequence.decode("UTF-8")
    wrong_str = wrong_sequence.decode("UTF-8")
    # Raw strings differ...
    assert good_str != wrong_str
    # ...but normalize to the same thing under both NFC and NFD.
    for mode in ("NFC", "NFD"):
        assert unicodedata.normalize(mode, good_str) == unicodedata.normalize(mode, wrong_str)
    # This file has the good sequence, not the bad one
    with open(join(data_dir, "unicode.zip"), "rb") as f:
        zipfile_bytes = f.read()
    assert good_sequence in zipfile_bytes
    assert wrong_sequence not in zipfile_bytes
    # Oops! This fails. The JSON has the bad sequence (not the good one)
    with open(join(data_dir, "unicode.zip.json"), encoding='UTF-8') as ex:
        x = json.load(ex)
    encoded_json_bytes = repr(x).encode("UTF-8")
    assert good_sequence in encoded_json_bytes
    assert wrong_sequence not in encoded_json_bytes
=================================== FAILURES ===================================
__________________________________ test_load ___________________________________
def test_load():
"""Exhibits why the tests are busted."""
# These byte sequences are unicode-equivalent, but not byte-for-byte
# equivalent.
good_sequence = b"u\xcc\x88" # present in unicode.zip
wrong_sequence = b"\xc3\xbc" # present in unicode.zip.json
good_str = good_sequence.decode("UTF-8")
wrong_str = wrong_sequence.decode("UTF-8")
assert good_str != wrong_str
for mode in ("NFC", "NFD"):
assert unicodedata.normalize(mode, good_str) == unicodedata.normalize(mode, wrong_str)
# This file has the good sequence, not the bad one
with open(join(data_dir, "unicode.zip"), "rb") as f:
zipfile_bytes = f.read()
assert good_sequence in zipfile_bytes
assert wrong_sequence not in zipfile_bytes
# Oops! This fails. The JSON has the bad sequence (not the good one)
with open(join(data_dir, "unicode.zip.json"), encoding='UTF-8') as ex:
x = json.load(ex)
encoded_json_bytes = repr(x).encode("UTF-8")
> assert good_sequence in encoded_json_bytes
E assert b'u\xcc\x88' in b"[{'gid': 1000, 'isblk': False, 'ischr': False, 'isdev': False, 'isdir': True, 'isfifo': False, 'islnk': False, 'isre...e, 'linkpath': None, 'mode': 'rw-r--r--', 'mtime': 1268678259, 'path': 'a/gr\xc3\xbcn.png', 'size': 362, 'uid': 1000}]"
Log:
============================= test session starts ==============================
platform darwin -- Python 3.7.2, pytest-4.2.1, py-1.7.0, pluggy-0.8.1
rootdir: opensource/python-libarchive-c, inifile:
collected 29 items
tests/test_atime_mtime_ctime.py ........ [ 27%]
tests/test_convert.py . [ 31%]
tests/test_entry.py ..F.F.F [ 55%]
tests/test_errors.py .... [ 68%]
tests/test_rwx.py ....... [ 93%]
tests/test_security_flags.py .. [100%]
=================================== FAILURES ===================================
_________________ test_check_archiveentry_using_python_testtar _________________
def test_check_archiveentry_using_python_testtar():
> check_entries(join(data_dir, 'testtar.tar'))
tests/test_entry.py:63:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test_file = 'opensource/python-libarchive-c/tests/data/testtar.tar'
regen = False, ignore = []
def check_entries(test_file, regen=False, ignore=''):
ignore = ignore.split()
fixture_file = test_file + '.json'
if regen:
entries = list(get_entries(test_file))
with open(fixture_file, 'w', encoding='UTF-8') as ex:
json.dump(entries, ex, indent=2, sort_keys=True)
with open(fixture_file, encoding='UTF-8') as ex:
expected = json.load(ex)
actual = list(get_entries(test_file))
for e1, e2 in zip(actual, expected):
for key in ignore:
e1.pop(key)
e2.pop(key)
> assert e1 == e2
E AssertionError: assert {'gid': 100, ...': False, ...} == {'gid': 100, '...': False, ...}
E Omitting 14 identical items, use -vv to show
E Differing items:
E {'path': 'pax/umlauts-ÄÖÜäöüß'} != {'path': 'pax/umlauts-ÄÖÜäöüß'}
E Use -v to get the full diff
tests/test_entry.py:96: AssertionError
------------------------------ Captured log call -------------------------------
ffi.py 88 WARNING Pathname can't be converted from UTF-8 to current locale.
_________ test_check_archiveentry_with_unicode_and_binary_entries_zip __________
def test_check_archiveentry_with_unicode_and_binary_entries_zip():
> check_entries(join(data_dir, 'unicode.zip'))
tests/test_entry.py:71:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test_file = 'opensource/python-libarchive-c/tests/data/unicode.zip'
regen = False, ignore = []
def check_entries(test_file, regen=False, ignore=''):
ignore = ignore.split()
fixture_file = test_file + '.json'
if regen:
entries = list(get_entries(test_file))
with open(fixture_file, 'w', encoding='UTF-8') as ex:
json.dump(entries, ex, indent=2, sort_keys=True)
with open(fixture_file, encoding='UTF-8') as ex:
expected = json.load(ex)
actual = list(get_entries(test_file))
for e1, e2 in zip(actual, expected):
for key in ignore:
e1.pop(key)
e2.pop(key)
> assert e1 == e2
E AssertionError: assert {'gid': 1000,...': False, ...} == {'gid': 1000, ...': False, ...}
E Omitting 14 identical items, use -vv to show
E Differing items:
E {'path': 'a/grün.png'} != {'path': 'a/grün.png'}
E Use -v to get the full diff
tests/test_entry.py:96: AssertionError
__________ test_check_archiveentry_with_unicode_entries_and_name_zip ___________
def test_check_archiveentry_with_unicode_entries_and_name_zip():
> check_entries(join(data_dir, '\ud504\ub85c\uadf8\ub7a8.zip'))
tests/test_entry.py:79:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test_file = 'opensource/python-libarchive-c/tests/data/프로그램.zip'
regen = False, ignore = []
def check_entries(test_file, regen=False, ignore=''):
ignore = ignore.split()
fixture_file = test_file + '.json'
if regen:
entries = list(get_entries(test_file))
with open(fixture_file, 'w', encoding='UTF-8') as ex:
json.dump(entries, ex, indent=2, sort_keys=True)
with open(fixture_file, encoding='UTF-8') as ex:
expected = json.load(ex)
actual = list(get_entries(test_file))
for e1, e2 in zip(actual, expected):
for key in ignore:
e1.pop(key)
e2.pop(key)
> assert e1 == e2
E AssertionError: assert {'gid': 502, ...': False, ...} == {'gid': 502, '...': False, ...}
E Omitting 14 identical items, use -vv to show
E Differing items:
E {'path': '프로그램.txt'} != {'path': '프로그램.txt'}
E Use -v to get the full diff
tests/test_entry.py:96: AssertionError
=============================== warnings summary ===============================
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
tests/test_entry.py::test_check_ArchiveEntry_against_TarInfo
opensource/python-libarchive-c/tests/__init__.py:86: DeprecationWarning: deprecated in favor of stat.filemode
mode = tarfile.filemode(entry.mode)[1:]
-- Docs: https://docs.pytest.org/en/latest/warnings.html
=============== 3 failed, 26 passed, 13 warnings in 0.54 seconds ===============
These tests are failing for me:
A few details about my setup:
I included the full log at the bottom of this issue.
I did a little digging, and while the paths render equivalently and are unicode-equivalent, the tests are actually looking for a slightly different byte sequence than the one present in the file. Below is a test I wrote that demonstrates the problem. (To run `test_load`, add it to one of the python files in `tests/` and run `py.test`.)

Log: