move hashing functions to module dedup.hashing
author Helmut Grohne <helmut@subdivi.de>
Thu, 21 Feb 2013 16:10:54 +0000 (17:10 +0100)
committer Helmut Grohne <helmut@subdivi.de>
Thu, 21 Feb 2013 16:10:54 +0000 (17:10 +0100)
dedup/__init__.py [new file with mode: 0644]
dedup/__init__.pyc [new file with mode: 0644]
dedup/hashing.py [new file with mode: 0644]
dedup/hashing.pyc [new file with mode: 0644]
importpkg.py

diff --git a/dedup/__init__.py b/dedup/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/dedup/__init__.pyc b/dedup/__init__.pyc
new file mode 100644 (file)
index 0000000..8f09308
Binary files /dev/null and b/dedup/__init__.pyc differ
diff --git a/dedup/hashing.py b/dedup/hashing.py
new file mode 100644 (file)
index 0000000..1283c7e
--- /dev/null
@@ -0,0 +1,108 @@
+class HashBlacklist(object):
+    """Turn a hashlib-like object into a hash that returns None for some
+    blacklisted hashes instead of the real hash value.
+
+    We only work with hexdigests here, so digest() disappears. The methods
+    copy and update as well as the name attribute keep working as expected.
+    """
+    def __init__(self, hashobj, blacklist=()):
+        """
+        @param hashobj: a hashlib-like object
+        @param blacklist: an object providing __contains__.
+            hexdigest values which are contained in the blacklist
+            are turned into None values
+        """
+        self.hashobj = hashobj
+        self.blacklist = blacklist
+        self.update = self.hashobj.update
+
+    @property
+    def name(self):
+        return self.hashobj.name
+
+    def hexdigest(self):
+        digest = self.hashobj.hexdigest()
+        if digest in self.blacklist:
+            return None
+        return digest
+
+    def copy(self):
+        return HashBlacklist(self.hashobj.copy(), self.blacklist)
+
+class DecompressedHash(object):
+    """Apply a decompression function before the hash. This class provides the
+    hashlib interface (update, hexdigest, copy) excluding digest and name."""
+    def __init__(self, decompressor, hashobj):
+        """
+        @param decompressor: a decompression object like bz2.BZ2Decompressor or
+            lzma.LZMADecompressor. It has to provide methods decompress and
+            copy as well as an unused_data attribute. It may provide a flush
+            method.
+        @param hashobj: a hashlib-like obj providing methods update, hexdigest
+            and copy
+        """
+        self.decompressor = decompressor
+        self.hashobj = hashobj
+
+    def update(self, data):
+        self.hashobj.update(self.decompressor.decompress(data))
+
+    def hexdigest(self):
+        if not hasattr(self.decompressor, "flush"):
+            return self.hashobj.hexdigest()
+        tmpdecomp = self.decompressor.copy()
+        data = tmpdecomp.flush()
+        tmphash = self.hashobj.copy()
+        tmphash.update(data)
+        return tmphash.hexdigest()
+
+    def copy(self):
+        return DecompressedHash(self.decompressor.copy(), self.hashobj.copy())
+
+class SuppressingHash(object):
+    """A hash that silences exceptions from the update and hexdigest methods of
+    a hashlib-like object. If an exception has occurred, hexdigest always
+    returns None."""
+    def __init__(self, hashobj, exceptions=()):
+        """
+        @param hashobj: a hashlib-like object providing methods update, copy
+            and hexdigest. If a name attribute is present, it is mirrored as
+            well.
+        @type exceptions: tuple
+        @param exceptions: exception classes to be suppressed
+        """
+        self.hashobj = hashobj
+        self.exceptions = exceptions
+        if hasattr(hashobj, "name"):
+            self.name = hashobj.name
+
+    def update(self, data):
+        if self.hashobj:
+            try:
+                self.hashobj.update(data)
+            except self.exceptions:
+                self.hashobj = None
+
+    def hexdigest(self):
+        if self.hashobj:
+            try:
+                return self.hashobj.hexdigest()
+            except self.exceptions:
+                self.hashobj = None
+        return None
+
+    def copy(self):
+        if self.hashobj:
+            return SuppressingHash(self.hashobj.copy(), self.exceptions)
+        return SuppressingHash(None, self.exceptions)
+
+def hash_file(hashobj, filelike, blocksize=65536):
+    """Feed the entire contents from the given filelike to the given hashobj.
+    @param hashobj: hashlib-like object providing an update method
+    @param filelike: file-like object providing read(size)
+    """
+    data = filelike.read(blocksize)
+    while data:
+        hashobj.update(data)
+        data = filelike.read(blocksize)
+    return hashobj
diff --git a/dedup/hashing.pyc b/dedup/hashing.pyc
new file mode 100644 (file)
index 0000000..7d2383a
Binary files /dev/null and b/dedup/hashing.pyc differ
index a45720a..eb3b3ec 100755 (executable)
@@ -18,6 +18,8 @@ from debian.debian_support import version_compare
 from debian import deb822
 import lzma
 
+from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
+
 class ArReader(object):
     global_magic = b"!<arch>\n"
     file_magic = b"`\n"
@@ -103,19 +105,6 @@ class MultiHash(object):
         for hasher in self.hashes:
             hasher.update(data)
 
-class HashBlacklist(object):
-    def __init__(self, hasher, blacklist=set()):
-        self.hasher = hasher
-        self.blacklist = blacklist
-        self.update = self.hasher.update
-        self.name = hasher.name
-
-    def hexdigest(self):
-        digest = self.hasher.hexdigest()
-        if digest in self.blacklist:
-            return None
-        return digest
-
 class GzipDecompressor(object):
     def __init__(self):
         self.inbuffer = b""
@@ -175,50 +164,6 @@ class GzipDecompressor(object):
             new.decompressor = self.decompressor.copy()
         return new
 
-class DecompressedHash(object):
-    def __init__(self, decompressor, hashobj):
-        self.decompressor = decompressor
-        self.hashobj = hashobj
-
-    def update(self, data):
-        self.hashobj.update(self.decompressor.decompress(data))
-
-    def hexdigest(self):
-        if not hasattr(self.decompressor, "flush"):
-            return self.hashobj.hexdigest()
-        tmpdecomp = self.decompressor.copy()
-        data = tmpdecomp.flush()
-        tmphash = self.hashobj.copy()
-        tmphash.update(data)
-        return tmphash.hexdigest()
-
-class SuppressingHash(object):
-    def __init__(self, hashobj, exceptions=()):
-        self.hashobj = hashobj
-        self.exceptions = exceptions
-
-    def update(self, data):
-        if self.hashobj:
-            try:
-                self.hashobj.update(data)
-            except self.exceptions:
-                self.hashobj = None
-
-    def hexdigest(self):
-        if self.hashobj:
-            try:
-                return self.hashobj.hexdigest()
-            except self.exceptions:
-                self.hashobj = None
-        return None
-
-def hash_file(hashobj, filelike, blocksize=65536):
-    data = filelike.read(blocksize)
-    while data:
-        hashobj.update(data)
-        data = filelike.read(blocksize)
-    return hashobj
-
 boring_sha512_hashes = set((
     # ""
     "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",