When reading a tar archive that includes a file with a bad header (such as a checksum mismatch), `getmembers` simply stops listing the members at that file, without reporting an error, and ignores the files that come after it (Edit: unless `ignore_zeros=True` is set).
I would expect instead that getmembers lists all members, and extractfile raises a TarError when trying to extract a file with an invalid header (such as a bad chksum or typeflag).
import os
import contextlib
import subprocess
from io import BytesIO
from tarfile import TarFile, TarInfo
from tempfile import TemporaryDirectory
# generate a tar file in memory
bio = BytesIO()
with TarFile(mode="w", fileobj=bio, errorlevel=2) as tf:
ti = TarInfo()
ti.size = 3
for name, data in (("foo", b"123"), ("bar", b"456"), ("quz", b"789")):
ti.name = name
tf.addfile(ti, BytesIO(data))
# break the checksum of the second file 'bar'
assert b"\x00006425\x00" in bio.getvalue()
broken = bio.getvalue().replace(b"\x00006425\x00", b"\x00106425\x00")
# try to read the tar file
with TarFile(fileobj=BytesIO(broken), errorlevel=2) as tf:
for ti in tf.getmembers():
print(repr(ti.name))
with tf.extractfile(ti) as fh:
print(repr(fh.read()))
# => only "foo" is extracted
with TemporaryDirectory() as td:
with contextlib.chdir(td):
with TarFile(fileobj=BytesIO(broken), errorlevel=2) as tf:
tf.extractall()
print(os.listdir())
# => again only "foo" is extracted
os.unlink("foo")
with TarFile(fileobj=BytesIO(broken), errorlevel=2) as tf:
tf.extractall(filter="data")
print(os.listdir())
# => filter doesn't change anything
with open("test.tar", "wb") as fh:
fh.write(broken)
subprocess.run(["tar", "tvf", "test.tar"], check=False)
# => GNU tar 1.34 correctly identifies error and continues processing
Output:
'foo'
b'123'
['foo']
['foo']
-rw-r--r-- 0/0 3 1970-01-01 00:00 foo
tar: Skipping to next header
-rw-r--r-- 0/0 3 1970-01-01 00:00 quz
tar: Exiting with failure status due to previous errors
Setting ignore_zeros=True allows the third file in the example above (quz, the one after the broken file) to be extracted, however there is still no indication at all that a file was skipped.
Bug report
Bug description:
When reading a tar archive that includes a file with a bad header (such as a checksum mismatch), `getmembers` simply stops listing the members at that file, without reporting an error, and ignoring the files that come after it (Edit: unless `ignore_zeros=True` is set).
I would expect instead that `getmembers` lists all members, and `extractfile` raises a `TarError` when trying to extract a file with an invalid header (such as a bad chksum or typeflag).
Output:
CPython versions tested on:
3.12, 3.13
Operating systems tested on:
Linux, Windows