push more functionality into DebExtractor
author Helmut Grohne <helmut@subdivi.de>
Sun, 1 May 2016 12:31:56 +0000 (14:31 +0200)
committer Helmut Grohne <helmut@subdivi.de>
Sun, 1 May 2016 12:31:56 +0000 (14:31 +0200)
The handle_ar_member and handle_ar_end methods now have a default
implementation adding further handlers handle_debversion,
handle_control_tar and handle_data_tar.

In that process two additional bugs were fixed:
 * decompress_tar was wrongly passing errors="surrogateescape" for
   Python 2.x even though that's only supported for Python 3.x.
 * The use of decompress actually passes the extension as unicode.

dedup/compression.py
dedup/debpkg.py
importpkg.py

index 5df6613..7f6dc99 100644 (file)
@@ -156,10 +156,10 @@ class DecompressedStream(object):
             self.closed = True
 
 decompressors = {
-    '.gz':   GzipDecompressor,
-    '.bz2':  bz2.BZ2Decompressor,
-    '.lzma': lzma.LZMADecompressor,
-    '.xz':   lzma.LZMADecompressor,
+    u'.gz':   GzipDecompressor,
+    u'.bz2':  bz2.BZ2Decompressor,
+    u'.lzma': lzma.LZMADecompressor,
+    u'.xz':   lzma.LZMADecompressor,
 }
 
 def decompress(filelike, extension):
@@ -168,7 +168,7 @@ def decompress(filelike, extension):
                      close().
     @param extension: permitted values are "", ".gz", ".bz2", ".lzma", and
                       ".xz"
-    @type extension: str
+    @type extension: unicode
     @returns: a read-only byte-stream with the decompressed contents of the
               original filelike. It supports read(size) and close(). If the
               original supports seek(pos) and tell(), then it also supports
index 04773de..ba0b7c9 100644 (file)
@@ -1,6 +1,10 @@
+import sys
+import tarfile
+
 from debian import deb822
 
 from dedup.arreader import ArReader
+from dedup.compression import decompress
 from dedup.hashing import hash_file
 
 def process_control(control_contents):
@@ -55,9 +59,37 @@ def get_tar_hashes(tar, hash_functions):
                 hashes[hashobj.name] = hashvalue
         yield (elem.name, elem.size, hashes)
 
+if sys.version_info.major >= 3:
+    def opentar(filelike):
+        return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
+                            errors="surrogateescape")
+
+    def decodetarname(name):
+        """Decoded name of a tarinfo.
+        @raises UnicodeDecodeError:
+        """
+        try:
+            name.encode("utf8", "strict")
+        except UnicodeEncodeError as e:
+            if e.reason == "surrogates not allowed":
+                name.encode("utf8", "surrogateescape").decode("utf8", "strict")
+        return name
+else:
+    def opentar(filelike):
+        return tarfile.open(fileobj=filelike, mode="r|")
+
+    def decodetarname(name):
+        """Decoded name of a tarinfo.
+        @raises UnicodeDecodeError:
+        """
+        return name.decode("utf8")
+
 class DebExtractor(object):
     "Base class for extracting desired features from a Debian package."
 
+    def __init__(self):
+        self.arstate = "start"
+
     def process(self, filelike):
         """Process a Debian package.
         @param filelike: is a file-like object containing the contents of the
@@ -76,11 +108,58 @@ class DebExtractor(object):
 
     def handle_ar_member(self, name, filelike):
         """Handle an ar archive member of the Debian package.
+        If you replace this method, you must also replace handle_ar_end and
+        none of the methods handle_debversion, handle_control_tar or
+        handle_data_tar are called.
         @type name: bytes
         @param name: is the name of the member
         @param filelike: is a file-like object containing the contents of the
                          member and can be read once without seeks.
         """
+        if self.arstate == "start":
+            if name != b"debian-binary":
+                raise ValueError("debian-binary not found")
+            version = filelike.read()
+            self.handle_debversion(version)
+            if not version.startswith(b"2."):
+                raise ValueError("debian version not recognized")
+            self.arstate = "version"
+        elif self.arstate == "version":
+            if name.startswith(b"control.tar"):
+                filelike = decompress(filelike, name[11:].decode("ascii"))
+                self.handle_control_tar(opentar(filelike))
+                self.arstate = "control"
+            elif not name.startswith(b"_"):
+                raise ValueError("unexpected ar member %r" % name)
+        elif self.arstate == "control":
+            if name.startswith(b"data.tar"):
+                filelike = decompress(filelike, name[8:].decode("ascii"))
+                self.handle_data_tar(opentar(filelike))
+                self.arstate = "data"
+            elif not name.startswith(b"_"):
+                raise ValueError("unexpected ar member %r" % name)
+        else:
+            assert self.arstate == "data"
 
     def handle_ar_end(self):
         "Handle the end of the ar archive of the Debian package."
+        if self.arstate != "data":
+            raise ValueError("data.tar not found")
+
+    def handle_debversion(self, version):
+        """Handle the debian-binary member of the Debian package.
+        @type version: bytes
+        @param version: The full contents of the ar member.
+        """
+
+    def handle_control_tar(self, tarfileobj):
+        """Handle the control.tar member of the Debian package.
+        @type tarfileobj: tarfile.TarFile
+        @param tarfile: is opened for streaming reads
+        """
+
+    def handle_data_tar(self, tarfileobj):
+        """Handle the data.tar member of the Debian package.
+        @type tarfileobj: tarfile.TarFile
+        @param tarfile: is opened for streaming reads
+        """
index e8cc2fa..933ec80 100755 (executable)
@@ -8,15 +8,15 @@ And finally a document consisting of the string "commit" is emitted."""
 import hashlib
 import optparse
 import sys
-import tarfile
 import zlib
 
 import yaml
 
-from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
+from dedup.debpkg import DebExtractor, decodetarname, get_tar_hashes, \
+        process_control
 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
         HashBlacklistContent
-from dedup.compression import GzipDecompressor, decompress
+from dedup.compression import GzipDecompressor
 from dedup.image import GIFHash, PNGHash
 
 boring_content = set(("", "\n"))
@@ -42,33 +42,6 @@ def gifhash():
     hashobj.name = "gif_sha512"
     return hashobj
 
-if sys.version_info.major >= 3:
-    def decompress_tar(filelike, extension):
-        filelike = decompress(filelike, extension.decode("ascii"))
-        return tarfile.open(fileobj=filelike, mode="r|")
-
-    def decodetarname(name):
-        """Decoded name of a tarinfo.
-        @raises UnicodeDecodeError:
-        """
-        try:
-            name.encode("utf8", "strict")
-        except UnicodeEncodeError as e:
-            if e.reason == "surrogates not allowed":
-                name.encode("utf8", "surrogateescape").decode("utf8", "strict")
-        return name
-else:
-    def decompress_tar(filelike, extension):
-        filelike = decompress(filelike, extension.decode("ascii"))
-        return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
-                            errors="surrogateescape")
-
-    def decodetarname(name):
-        """Decoded name of a tarinfo.
-        @raises UnicodeDecodeError:
-        """
-        return name.decode("utf8")
-
 class ProcessingFinished(Exception):
     pass
 
@@ -76,40 +49,27 @@ class ImportpkgExtractor(DebExtractor):
     hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
 
     def __init__(self, callback):
-        self.state = "start"
+        DebExtractor.__init__(self)
         self.callback = callback
 
-    def handle_ar_member(self, name, filelike):
-        if name.startswith(b"control.tar"):
-            if self.state != "start":
-                raise ValueError("unexpected control.tar")
-            self.state = "control"
-            tf = decompress_tar(filelike, name[11:])
-            for elem in tf:
-                if elem.name not in ("./control", "control"):
-                    continue
-                if self.state != "control":
-                    raise ValueError("duplicate control file")
-                self.state = "control_file"
-                self.callback(process_control(tf.extractfile(elem).read()))
-                break
-        elif name.startswith(b"data.tar"):
-            if self.state != "control_file":
-                raise ValueError("missing control file")
-            self.state = "data"
-            tf = decompress_tar(filelike, name[8:])
-            for name, size, hashes in get_tar_hashes(tf, self.hash_functions):
-                try:
-                    name = decodetarname(name)
-                except UnicodeDecodeError:
-                    print("warning: skipping filename with encoding error")
-                    continue # skip files with non-utf8 encoding for now
-                self.callback(dict(name=name, size=size, hashes=hashes))
-            raise ProcessingFinished()
-
-    def handle_ar_end(self):
-        if self.state != "data":
-            raise ValueError("data.tar not found")
+    def handle_control_tar(self, tarfileobj):
+        for elem in tarfileobj:
+            if elem.name not in ("./control", "control"):
+                continue
+            self.callback(process_control(tarfileobj.extractfile(elem).read()))
+            return
+        raise ValueError("missing control file")
+
+    def handle_data_tar(self, tarfileobj):
+        for name, size, hashes in get_tar_hashes(tarfileobj,
+                                                 self.hash_functions):
+            try:
+                name = decodetarname(name)
+            except UnicodeDecodeError:
+                print("warning: skipping filename with encoding error")
+                continue # skip files with non-utf8 encoding for now
+            self.callback(dict(name=name, size=size, hashes=hashes))
+        raise ProcessingFinished()
 
 def main():
     parser = optparse.OptionParser()