Persisting sha256 hash objects?
I need a Python/C/C++/Java implementation which can pause hashing progress and store that progress in a file in such a way that the progress is recoverable from that file at a later stage.
No matter in what language it is written from above listed, it should work properly in Python. It is recommended that you may provide that to work well with "hashlib" however it is not necessary. Also, if such a thing already exist, a link to that is sufficient.
For an idea, what your implementation should achieve.
import hashlib
import hashpersist #THIS IS NEEDED.
sha256 = hashlib.sha256("Hello ")
hashpersist.save_state(sha256, open('test_file', 'w'))
sha256_recovered = hashpersist.load_state(open('test_file', 'r'))
sha256_recovered.update("World")
print sha256_recovered.hexdigest()
This should give the same output as we had done simple hashing of "Hello World" with standard sha256 function.
a591a6d40bf420404a011733cfb7b190d62c65bf0bcda32b57b277d9ad9f146e
It turned out to be easier than I thought to rewrite hashlib to be resumable, at least, the SHA-256 portion. I spent some time playing with C code that uses the OpenSSL crypto library, but then I realised that I don't need all that stuff, I can just use ctypes.
rehash.py
#! /usr/bin/env python
''' A resumable implementation of SHA-256 using ctypes with the OpenSSL crypto library
Written by PM 2Ring 2014.11.13
'''
from ctypes import *
SHA_LBLOCK = 16
SHA256_DIGEST_LENGTH = 32
class SHA256_CTX(Structure):
_fields_ = [
("h", c_long * 8),
("Nl", c_long),
("Nh", c_long),
("data", c_long * SHA_LBLOCK),
("num", c_uint),
("md_len", c_uint)
]
HashBuffType = c_ubyte * SHA256_DIGEST_LENGTH
#crypto = cdll.LoadLibrary("libcrypto.so")
crypto = cdll.LoadLibrary("libeay32.dll" if os.name == "nt" else "libssl.so")
class sha256(object):
digest_size = SHA256_DIGEST_LENGTH
def __init__(self, datastr=None):
self.ctx = SHA256_CTX()
crypto.SHA256_Init(byref(self.ctx))
if datastr:
self.update(datastr)
def update(self, datastr):
crypto.SHA256_Update(byref(self.ctx), datastr, c_int(len(datastr)))
#Clone the current context
def _copy_ctx(self):
ctx = SHA256_CTX()
pointer(ctx)[0] = self.ctx
return ctx
def copy(self):
other = sha256()
other.ctx = self._copy_ctx()
return other
def digest(self):
#Preserve context in case we get called before hashing is
# really finished, since SHA256_Final() clears the SHA256_CTX
ctx = self._copy_ctx()
hashbuff = HashBuffType()
crypto.SHA256_Final(hashbuff, byref(self.ctx))
self.ctx = ctx
return str(bytearray(hashbuff))
def hexdigest(self):
return self.digest().encode('hex')
#Tests
def main():
import cPickle
import hashlib
data = ("Nobody expects ", "the spammish ", "imposition!")
print "rehash\n"
shaA = sha256(''.join(data))
print shaA.hexdigest()
print repr(shaA.digest())
print "digest size =", shaA.digest_size
print
shaB = sha256()
shaB.update(data[0])
print shaB.hexdigest()
#Test pickling
sha_pickle = cPickle.dumps(shaB, -1)
print "Pickle length:", len(sha_pickle)
shaC = cPickle.loads(sha_pickle)
shaC.update(data[1])
print shaC.hexdigest()
#Test copying. Note that copy can be pickled
shaD = shaC.copy()
shaC.update(data[2])
print shaC.hexdigest()
#Verify against hashlib.sha256()
print "\nhashlib\n"
shaD = hashlib.sha256(''.join(data))
print shaD.hexdigest()
print repr(shaD.digest())
print "digest size =", shaD.digest_size
print
shaE = hashlib.sha256(data[0])
print shaE.hexdigest()
shaE.update(data[1])
print shaE.hexdigest()
#Test copying. Note that hashlib copy can NOT be pickled
shaF = shaE.copy()
shaF.update(data[2])
print shaF.hexdigest()
if __name__ == '__main__':
main()
resumable_SHA-256.py
#! /usr/bin/env python
''' Resumable SHA-256 hash for large files using the OpenSSL crypto library
The hashing process may be interrupted by Control-C (SIGINT) or SIGTERM.
When a signal is received, hashing continues until the end of the
current chunk, then the current file position, total file size, and
the sha object is saved to a file. The name of this file is formed by
appending '.hash' to the name of the file being hashed.
Just re-run the program to resume hashing. The '.hash' file will be deleted
once hashing is completed.
Written by PM 2Ring 2014.11.14
'''
import cPickle as pickle
import os
import signal
import sys
import rehash
quit = False
blocksize = 1<<16 # 64kB
blocksperchunk = 1<<8
chunksize = blocksize * blocksperchunk
def handler(signum, frame):
global quit
print "\nGot signal %d, cleaning up." % signum
quit = True
def do_hash(fname, filesize):
hashname = fname + '.hash'
if os.path.exists(hashname):
with open(hashname, 'rb') as f:
pos, fsize, sha = pickle.load(f)
if fsize != filesize:
print "Error: file size of '%s' doesn't match size recorded in '%s'" % (fname, hashname)
print "%d != %d. Aborting" % (fsize, filesize)
exit(1)
else:
pos, fsize, sha = 0, filesize, rehash.sha256()
finished = False
with open(fname, 'rb') as f:
f.seek(pos)
while not (quit or finished):
for _ in xrange(blocksperchunk):
block = f.read(blocksize)
if block == '':
finished = True
break
sha.update(block)
pos += chunksize
sys.stderr.write(" %6.2f%% of %d\r" % (100.0 * pos / fsize, fsize))
if finished or quit:
break
if quit:
with open(hashname, 'wb') as f:
pickle.dump((pos, fsize, sha), f, -1)
elif os.path.exists(hashname):
os.remove(hashname)
return (not quit), pos, sha.hexdigest()
def main():
if len(sys.argv) != 2:
print "Resumable SHA-256 hash of a file."
print "Usage:\npython %s filename\n" % sys.argv[0]
exit(1)
fname = sys.argv[1]
filesize = os.path.getsize(fname)
signal.signal(signal.SIGINT, handler)
signal.signal(signal.SIGTERM, handler)
finished, pos, hexdigest = do_hash(fname, filesize)
if finished:
print "%s %s" % (hexdigest, fname)
else:
print "sha-256 hash of '%s' incomplete" % fname
print "%s" % hexdigest
print "%d / %d bytes processed." % (pos, filesize)
if __name__ == '__main__':
main()
demo
import rehash
import pickle
sha=rehash.sha256("Hello ")
s=pickle.dumps(sha.ctx)
sha=rehash.sha256()
sha.ctx=pickle.loads(s)
sha.update("World")
print sha.hexdigest()
output
a591a6d40bf420404a011733cfb7b190d62c65bf0bcda32b57b277d9ad9f146e
edit
I've just made a minor edit to allow rehash
to work on Windows, too, although I've only tested it on WinXP. The libeay32.dll
can be in the current directory, or somewhere in the system library search path, eg WINDOWS\system32
. My rather ancient (and mostly unused) XP installation couldn't find the .dll, even though it's used by OpenOffice and Avira. So I just copied it from the Avira folder to system32. And now it works perfectly. :)