add a class DebExtractor for guiding feature extraction
authorHelmut Grohne <helmut@subdivi.de>
Tue, 19 Apr 2016 20:48:02 +0000 (22:48 +0200)
committerHelmut Grohne <helmut@subdivi.de>
Tue, 19 Apr 2016 20:55:37 +0000 (22:55 +0200)
It is supposed to separate the parsing of Debian packages (understanding
how the format works) from the actual feature extraction. Its goal is to
simplify writing custom extractors for different feature sets.

dedup/debpkg.py
importpkg.py

index 8f2121b..04773de 100644 (file)
@@ -1,5 +1,6 @@
 from debian import deb822
 
+from dedup.arreader import ArReader
 from dedup.hashing import hash_file
 
 def process_control(control_contents):
@@ -53,3 +54,33 @@ def get_tar_hashes(tar, hash_functions):
             if hashvalue:
                 hashes[hashobj.name] = hashvalue
         yield (elem.name, elem.size, hashes)
+
+class DebExtractor(object):
+    "Base class for extracting desired features from a Debian package."
+
+    def process(self, filelike):
+        """Process a Debian package.
+        @param filelike: is a file-like object containing the contents of the
+                         Debian package and can be read once without seeks.
+        """
+        af = ArReader(filelike)
+        af.read_magic()
+        while True:
+            try:
+                name = af.read_entry()
+            except EOFError:
+                break
+            else:
+                self.handle_ar_member(name, af)
+        self.handle_ar_end()
+
+    def handle_ar_member(self, name, filelike):
+        """Handle an ar archive member of the Debian package.
+        @type name: bytes
+        @param name: is the name of the member
+        @param filelike: is a file-like object containing the contents of the
+                         member and can be read once without seeks.
+        """
+
+    def handle_ar_end(self):
+        "Handle the end of the ar archive of the Debian package."
index f72cf03..0798f13 100755 (executable)
@@ -14,8 +14,7 @@ import zlib
 import lzma
 import yaml
 
-from dedup.arreader import ArReader
-from dedup.debpkg import process_control, get_tar_hashes
+from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
         HashBlacklistContent
 from dedup.compression import GzipDecompressor, DecompressedStream
@@ -54,42 +53,46 @@ def decompress_tar(filelike, extension):
     return tarfile.open(fileobj=filelike,
                         mode="r|" + extension[1:].decode("ascii"))
 
-def process_package(filelike, hash_functions, callback):
-    af = ArReader(filelike)
-    af.read_magic()
-    state = "start"
-    while True:
-        try:
-            name = af.read_entry()
-        except EOFError:
-            raise ValueError("data.tar not found")
+class ProcessingFinished(Exception):
+    pass
+
+class ImportpkgExtractor(DebExtractor):
+    def __init__(self, hash_functions, callback):
+        self.state = "start"
+        self.hash_functions = hash_functions
+        self.callback = callback
+
+    def handle_ar_member(self, name, filelike):
         if name.startswith(b"control.tar"):
-            if state != "start":
+            if self.state != "start":
                 raise ValueError("unexpected control.tar")
-            state = "control"
-            tf = decompress_tar(af, name[11:])
+            self.state = "control"
+            tf = decompress_tar(filelike, name[11:])
             for elem in tf:
                 if elem.name not in ("./control", "control"):
                     continue
-                if state != "control":
+                if self.state != "control":
                     raise ValueError("duplicate control file")
-                state = "control_file"
-                callback(process_control(tf.extractfile(elem).read()))
+                self.state = "control_file"
+                self.callback(process_control(tf.extractfile(elem).read()))
                 break
-            continue
         elif name.startswith(b"data.tar"):
-            if state != "control_file":
+            if self.state != "control_file":
                 raise ValueError("missing control file")
-            state = "data"
-            tf = decompress_tar(af, name[8:])
-            for name, size, hashes in get_tar_hashes(tf, hash_functions):
+            self.state = "data"
+            tf = decompress_tar(filelike, name[8:])
+            for name, size, hashes in get_tar_hashes(tf, self.hash_functions):
                 try:
                     name = name.decode("utf8")
                 except UnicodeDecodeError:
                     print("warning: skipping filename with encoding error")
                     continue # skip files with non-utf8 encoding for now
-                callback(dict(name=name, size=size, hashes=hashes))
-            break
+                self.callback(dict(name=name, size=size, hashes=hashes))
+            raise ProcessingFinished()
+
+    def handle_ar_end(self):
+        if self.state != "data":
+            raise ValueError("data.tar not found")
 
 def main():
     parser = optparse.OptionParser()
@@ -105,7 +108,12 @@ def main():
     dumper.open()
     if options.hash:
         stdin = HashedStream(stdin, hashlib.sha256())
-    process_package(stdin, hash_functions, dumper.represent)
+    try:
+        ImportpkgExtractor(hash_functions, dumper.represent).process(stdin)
+    except ProcessingFinished:
+        pass
+    else:
+        raise RuntimeError("unexpected termination of extractor")
     if options.hash:
         stdin.validate(options.hash)
     dumper.represent("commit")