importpkg: move library-like parts to dedup.debpkg
authorHelmut Grohne <helmut@subdivi.de>
Mon, 2 Sep 2013 07:30:05 +0000 (09:30 +0200)
committerHelmut Grohne <helmut@subdivi.de>
Mon, 2 Sep 2013 07:30:05 +0000 (09:30 +0200)
dedup/debpkg.py [new file with mode: 0644]
importpkg.py

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
new file mode 100644 (file)
index 0000000..d8cc22f
--- /dev/null
@@ -0,0 +1,55 @@
+from debian import deb822
+
+from dedup.hashing import hash_file
+
+def process_control(control_contents):
+    """Parses the contents of a control file from a control.tar.gz of a Debian
+    package and returns a dictionary containing the fields relevant to dedup.
+    @type control_contents: bytes
+    @rtype: {str: object}
+    """
+    control = deb822.Packages(control_contents)
+    package = control["package"].encode("ascii")
+    try:
+        source = control["source"].encode("ascii").split()[0]
+    except KeyError:
+        source = package
+    version = control["version"].encode("ascii")
+    architecture = control["architecture"].encode("ascii")
+
+    depends = set(dep[0]["name"].encode("ascii")
+                  for dep in control.relations.get("depends", ())
+                  if len(dep) == 1)
+    return dict(package=package, source=source, version=version,
+                architecture=architecture, depends=depends)
+
+class MultiHash(object):
+    def __init__(self, *hashes):
+        self.hashes = hashes
+
+    def update(self, data):
+        for hasher in self.hashes:
+            hasher.update(data)
+
+def get_tar_hashes(tar, hash_functions):
+    """Given a TarFile read all regular files and compute all of the given hash
+    functions on each file.
+    @type tar: tarfile.TarFile
+    @param hash_functions: a sequence of parameter-less functions each creating a
+            new hashlib-like object
+    @rtype: gen((str, int, {str: str}))
+    @returns: an iterable of (filename, filesize, hashes) tuples where
+            hashes is a dict mapping hash function names to hash values
+    """
+
+    for elem in tar:
+        if not elem.isreg(): # excludes hard links as well
+            continue
+        hasher = MultiHash(*[func() for func in hash_functions])
+        hasher = hash_file(hasher, tar.extractfile(elem))
+        hashes = {}
+        for hashobj in hasher.hashes:
+            hashvalue = hashobj.hexdigest()
+            if hashvalue:
+                hashes[hashobj.name] = hashvalue
+        yield (elem.name, elem.size, hashes)
index 1334dd6..54f6181 100755 (executable)
@@ -11,24 +11,16 @@ import sys
 import tarfile
 import zlib
 
-from debian import deb822
 import lzma
 import yaml
 
 from dedup.arreader import ArReader
+from dedup.debpkg import process_control, get_tar_hashes
 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
-    HashedStream, hash_file
+    HashedStream
 from dedup.compression import GzipDecompressor, DecompressedStream
 from dedup.image import GIFHash, PNGHash
 
-class MultiHash(object):
-    def __init__(self, *hashes):
-        self.hashes = hashes
-
-    def update(self, data):
-        for hasher in self.hashes:
-            hasher.update(data)
-
 boring_sha512_hashes = set((
     # ""
     "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
@@ -57,37 +49,7 @@ def gifhash():
     hashobj.name = "gif_sha512"
     return hashobj
 
-def get_hashes(tar):
-    for elem in tar:
-        if not elem.isreg(): # excludes hard links as well
-            continue
-        hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(),
-                           gifhash())
-        hasher = hash_file(hasher, tar.extractfile(elem))
-        hashes = {}
-        for hashobj in hasher.hashes:
-            hashvalue = hashobj.hexdigest()
-            if hashvalue:
-                hashes[hashobj.name] = hashvalue
-        yield (elem.name, elem.size, hashes)
-
-def process_control(control_contents):
-    control = deb822.Packages(control_contents)
-    package = control["package"].encode("ascii")
-    try:
-        source = control["source"].encode("ascii").split()[0]
-    except KeyError:
-        source = package
-    version = control["version"].encode("ascii")
-    architecture = control["architecture"].encode("ascii")
-
-    depends = set(dep[0]["name"].encode("ascii")
-                  for dep in control.relations.get("depends", ())
-                  if len(dep) == 1)
-    return dict(package=package, source=source, version=version,
-                architecture=architecture, depends=depends)
-
-def process_package(filelike):
+def process_package(filelike, hash_functions):
     af = ArReader(filelike)
     af.read_magic()
     state = "start"
@@ -123,7 +85,7 @@ def process_package(filelike):
             continue
         if state != "control_file":
             raise ValueError("missing control file")
-        for name, size, hashes in get_hashes(tf):
+        for name, size, hashes in get_tar_hashes(tf, hash_functions):
             try:
                 name = name.decode("utf8")
             except UnicodeDecodeError:
@@ -133,9 +95,9 @@ def process_package(filelike):
         yield "commit"
         break
 
-def process_package_with_hash(filelike, sha256hash):
+def process_package_with_hash(filelike, hash_functions, sha256hash):
     hstream = HashedStream(filelike, hashlib.sha256())
-    for elem in process_package(hstream):
+    for elem in process_package(hstream, hash_functions):
         if elem == "commit":
             while hstream.read(4096):
                 pass
@@ -150,10 +112,11 @@ def main():
     parser.add_option("-H", "--hash", action="store",
                       help="verify that stdin hash given sha256 hash")
     options, args = parser.parse_args()
+    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
     if options.hash:
-        gen = process_package_with_hash(sys.stdin, options.hash)
+        gen = process_package_with_hash(sys.stdin, hash_functions, options.hash)
     else:
-        gen = process_package(sys.stdin)
+        gen = process_package(sys.stdin, hash_functions)
     yaml.safe_dump_all(gen, sys.stdout)
 
 if __name__ == "__main__":