2 """This tool reads a Debian package from stdin and emits a yaml stream on
3 stdout. It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
13 from urllib.request import urlopen
15 from urllib import urlopen
19 from dedup.debpkg import DebExtractor, decodetarname, get_tar_hashes, \
21 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
23 from dedup.compression import GzipDecompressor
24 from dedup.image import GIFHash, PNGHash
# Contents handed to HashBlacklistContent as its blacklist: the empty string
# and a lone newline. Presumably hashes of such trivial content are withheld
# by the wrapper -- confirm against HashBlacklistContent's definition.
boring_content = {"", "\n"}
def sha512_nontrivial():
    """Return a new SHA-512 hash object wrapped in HashBlacklistContent.

    The wrapper receives ``boring_content`` as its second argument;
    presumably it withholds a digest for those contents (confirm against
    the definition of HashBlacklistContent).
    """
    digest = hashlib.sha512()
    return HashBlacklistContent(digest, boring_content)
32 hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
33 hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
34 hashobj.name = "gzip_sha512"
35 return HashBlacklistContent(hashobj, boring_content)
38 hashobj = PNGHash(hashlib.sha512())
39 hashobj = SuppressingHash(hashobj, (ValueError,))
40 hashobj.name = "png_sha512"
44 hashobj = GIFHash(hashlib.sha512())
45 hashobj = SuppressingHash(hashobj, (ValueError,))
46 hashobj.name = "gif_sha512"
class ProcessingFinished(Exception):
    """Raised by ImportpkgExtractor.handle_data_tar after the data tarball
    has been walked, to signal that extraction is complete."""
class ImportpkgExtractor(DebExtractor):
    """DebExtractor that reports package metadata and per-file hashes.

    Results are delivered through ``callback``: the value returned by
    ``process_control`` for the control file, and one dict with the keys
    ``name``, ``size`` and ``hashes`` for each entry that ``get_tar_hashes``
    yields for the data tarball.
    """

    # Hash object factories handed to get_tar_hashes for every data member.
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]

    def __init__(self, callback):
        """Store ``callback``, a callable invoked once per emitted result."""
        DebExtractor.__init__(self)
        self.callback = callback

    def handle_control_tar(self, tarfileobj):
        """Locate the control file in ``tarfileobj`` and report its content.

        Only the first member named ``./control`` or ``control`` is used.

        Raises:
            ValueError: if the tarball contains no such member.
        """
        for elem in tarfileobj:
            if elem.name not in ("./control", "control"):
                continue
            self.callback(process_control(tarfileobj.extractfile(elem).read()))
            return
        raise ValueError("missing control file")

    def handle_data_tar(self, tarfileobj):
        """Report name, size and hashes for the entries of the data tarball.

        Raises:
            ProcessingFinished: always, once every entry has been handled;
                the caller treats this as the normal completion signal.
        """
        for name, size, hashes in get_tar_hashes(tarfileobj,
                                                 self.hash_functions):
            try:
                name = decodetarname(name)
            except UnicodeDecodeError:
                # stdout carries the yaml stream, so diagnostics must go to
                # stderr to avoid corrupting the output. sys.stderr.write is
                # used instead of print(file=...) to stay valid on python2.
                sys.stderr.write(
                    "warning: skipping filename with encoding error\n")
                continue # skip files with non-utf8 encoding for now
            self.callback(dict(name=name, size=size, hashes=hashes))
        raise ProcessingFinished()
80 stdin = sys.stdin.buffer
81 except AttributeError: # python2
83 parser = argparse.ArgumentParser()
84 parser.add_argument("-H", "--hash", action="store",
85 help="verify that stdin hash given sha256 hash")
86 parser.add_argument("input", nargs='?', default=stdin, type=urlopen,
87 help="read from this location instead of stdin")
88 args = parser.parse_args()
89 dumper = yaml.SafeDumper(sys.stdout)
92 stdin = HashedStream(stdin, hashlib.sha256())
94 ImportpkgExtractor(dumper.represent).process(args.input)
95 except ProcessingFinished:
98 raise RuntimeError("unexpected termination of extractor")
100 stdin.validate(args.hash)
101 dumper.represent("commit")
104 if __name__ == "__main__":