reading tar file contents without untarring it, in python script

you can use getmembers()

>>> import  tarfile
>>> tar = tarfile.open("test.tar")
>>> tar.getmembers()

After that, you can use extractfile() to extract the members as file object. Just an example

import tarfile,os
import sys
os.chdir("/tmp/foo")
tar = tarfile.open("test.tar")
for member in tar.getmembers():
    f=tar.extractfile(member)
    content=f.read()
    print "%s has %d newlines" %(member, content.count("\n"))
    print "%s has %d spaces" % (member,content.count(" "))
    print "%s has %d characters" % (member, len(content))
    sys.exit()
tar.close()

With the file object f in the above example, you can use read(), readlines() etc.


you need to use the tarfile module. Specifically, you use an instance of the class TarFile to access the file, and then access the names with TarFile.getnames()

 |  getnames(self)
 |      Return the members of the archive as a list of their names. It has
 |      the same order as the list returned by getmembers().

If instead you want to read the content, then you use this method

 |  extractfile(self, member)
 |      Extract a member from the archive as a file object. `member' may be
 |      a filename or a TarInfo object. If `member' is a regular file, a
 |      file-like object is returned. If `member' is a link, a file-like
 |      object is constructed from the link's target. If `member' is none of
 |      the above, None is returned.
 |      The file-like object is read-only and provides the following
 |      methods: read(), readline(), readlines(), seek() and tell()

Previously, this post showed an example of "dict(zip(()"'ing the member names and members lists together, this is silly and causes excessive reads of the archive, to accomplish the same, we can use dictionary comprehension:

index = {i.name: i for i in my_tarfile.getmembers()}

More info on how to use tarfile

Extract a tarfile member

#!/usr/bin/env python3
import tarfile

my_tarfile = tarfile.open('/path/to/mytarfile.tar')

print(my_tarfile.extractfile('./path/to/file.png').read())

Index a tar file

#!/usr/bin/env python3
import tarfile
import pprint

my_tarfile = tarfile.open('/path/to/mytarfile.tar')

index = my_tarfile.getnames()  # a list of strings, each members name
# or
# index = {i.name: i for i in my_tarfile.getmembers()}

pprint.pprint(index)

Index, read, dynamic extra a tar file

#!/usr/bin/env python3

import tarfile
import base64
import textwrap
import random

# note, indexing a tar file requires reading it completely once
# if we want to do anything after indexing it, it must be a file
# that can be seeked (not a stream), so here we open a file we
# can seek
my_tarfile = tarfile.open('/path/to/mytar.tar')


# tarfile.getmembers is similar to os.stat kind of, it will
# give you the member names (i.name) as well as TarInfo attributes:
#
# chksum,devmajor,devminor,gid,gname,linkname,linkpath,
# mode,mtime,name,offset,offset_data,path,pax_headers,
# size,sparse,tarfile,type,uid,uname
#
# here we use a dictionary comprehension to index all TarInfo
# members by the member name
index = {i.name: i for i in my_tarfile.getmembers()}

print(index.keys())

# pick your member
# note: if you can pick your member before indexing the tar file,
# you don't need to index it to read that file, you can directly
# my_tarfile.extractfile(name)
# or my_tarfile.getmember(name)

# pick your filename from the index dynamically
my_file_name = random.choice(index.keys())

my_file_tarinfo = index[my_file_name]
my_file_size = my_file_tarinfo.size
my_file_buf = my_tarfile.extractfile( 
    my_file_name
    # or my_file_tarinfo
)

print('file_name: {}'.format(my_file_name))
print('file_size: {}'.format(my_file_size))
print('----- BEGIN FILE BASE64 -----'
print(
    textwrap.fill(
        base64.b64encode(
            my_file_buf.read()
        ).decode(),
        72
    )
)
print('----- END FILE BASE64 -----'

tarfile with duplicate members

in the case that we have a tar that was created strangely, in this example by appending many versions of the same file to the same tar archive, we can work with that carefully, I've annotated which members contain what text, lets say we want the fourth (index 3) member, "capturetheflag\n"

tar -tf mybadtar.tar 
mymember.txt  # "version 1\n"
mymember.txt  # "version 1\n"
mymember.txt  # "version 2\n"
mymember.txt  # "capturetheflag\n"
mymember.txt  # "version 3\n"
#!/usr/bin/env python3

import tarfile
my_tarfile = tarfile.open('mybadtar.tar')

# >>> my_tarfile.getnames()
# ['mymember.txt', 'mymember.txt', 'mymember.txt', 'mymember.txt', 'mymember.txt']

# if we use extracfile on a name, we get the last entry, I'm not sure how python is smart enough to do this, it must read the entire tar file and buffer every valid member and return the last one

# >>> my_tarfile.extractfile('mymember.txt').read()
# b'version 3\n'

# >>> my_tarfile.extractfile(my_tarfile.getmembers()[3]).read()
# b'capturetheflag\n'

Alternatively we can iterate over the tar file #!/usr/bin/env python3

import tarfile
my_tarfile = tarfile.open('mybadtar.tar')
# note, if we do anything to the tarfile object that will 
# cause a full read, the tarfile.next() method will return none,
# so call next in a loop as the first thing you do if you want to
# iterate

while True:
    my_member = my_tarfile.next()
    if not my_member:
        break
    print((my_member.offset, mytarfile.extractfile(my_member).read,))

# (0, b'version 1\n')
# (1024, b'version 1\n')
# (2048, b'version 2\n')
# (3072, b'capturetheflag\n')
# (4096, b'version 3\n')