fedad733d985200c5484ed5631656f7623cddfee
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2 """This tool reads a Debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import argparse
9 import hashlib
10 import sys
11 import zlib
12
13 import yaml
14
15 from dedup.debpkg import DebExtractor, decodetarname, get_tar_hashes, \
16         process_control
17 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
18         HashBlacklistContent
19 from dedup.compression import GzipDecompressor
20 from dedup.image import GIFHash, PNGHash
21
# File contents handed to HashBlacklistContent as its blacklist by the hash
# factories in this module: the empty string and a lone newline.
# NOTE(review): these are str literals; under python3 the hashed data is
# presumably bytes -- confirm that HashBlacklistContent still matches them.
boring_content = {"", "\n"}
23
def sha512_nontrivial():
    """Return a fresh sha512 hash object wrapped in HashBlacklistContent,
    using boring_content as the blacklist."""
    plain_sha512 = hashlib.sha512()
    return HashBlacklistContent(plain_sha512, boring_content)
26
def gziphash():
    """Return a fresh hash object named "gzip_sha512".

    The object is a sha512 behind a DecompressedHash using a GzipDecompressor,
    wrapped in a SuppressingHash for ValueError and zlib.error, and finally
    wrapped in HashBlacklistContent with boring_content as the blacklist.
    """
    decompressed = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    suppressed = SuppressingHash(decompressed, (ValueError, zlib.error))
    suppressed.name = "gzip_sha512"
    return HashBlacklistContent(suppressed, boring_content)
32
def pnghash():
    """Return a fresh hash object named "png_sha512": a PNGHash over sha512
    wrapped in a SuppressingHash for ValueError."""
    suppressed = SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,))
    suppressed.name = "png_sha512"
    return suppressed
38
def gifhash():
    """Return a fresh hash object named "gif_sha512": a GIFHash over sha512
    wrapped in a SuppressingHash for ValueError."""
    suppressed = SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,))
    suppressed.name = "gif_sha512"
    return suppressed
44
class ProcessingFinished(Exception):
    """Raised by ImportpkgExtractor.handle_data_tar after it has gone through
    the data tar; main() treats it as the expected way for extraction to
    end."""
47
class ImportpkgExtractor(DebExtractor):
    """DebExtractor that reports package metadata and file hashes to a
    callback.

    The callback receives the result of process_control() for the control
    file, and then one dict with the keys "name", "size" and "hashes" for
    each tuple yielded by get_tar_hashes() on the data tar.
    """

    # Callables handed to get_tar_hashes(); each returns a fresh hash object.
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]

    def __init__(self, callback):
        """Store callback, a callable taking the single object to emit."""
        DebExtractor.__init__(self)
        self.callback = callback

    def handle_control_tar(self, tarfileobj):
        """Pass the processed control file from the control tar to the
        callback.

        Raises ValueError if the tar has no member named "./control" or
        "control".
        """
        for elem in tarfileobj:
            if elem.name not in ("./control", "control"):
                continue
            self.callback(process_control(tarfileobj.extractfile(elem).read()))
            return
        raise ValueError("missing control file")

    def handle_data_tar(self, tarfileobj):
        """Report name, size and hashes of the data tar contents to the
        callback.

        Entries whose name fails decodetarname() with UnicodeDecodeError are
        skipped with a warning on stderr.

        Always raises ProcessingFinished once the tar has been consumed.
        """
        for name, size, hashes in get_tar_hashes(tarfileobj,
                                                 self.hash_functions):
            try:
                name = decodetarname(name)
            except UnicodeDecodeError:
                # main() writes the yaml stream to stdout, so the warning must
                # go to stderr to avoid corrupting that stream. write() is
                # used instead of print(..., file=...) to stay valid python2.
                sys.stderr.write(
                    "warning: skipping filename with encoding error\n")
                continue # skip files with non-utf8 encoding for now
            self.callback(dict(name=name, size=size, hashes=hashes))
        raise ProcessingFinished()
73
def main():
    """Read a Debian package from stdin and dump it as yaml documents to
    stdout.

    With -H/--hash, stdin is wrapped in a HashedStream over sha256 and
    validated against the given value before the final "commit" document is
    emitted.

    Raises RuntimeError if the extractor returns without raising
    ProcessingFinished.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-H", "--hash", action="store",
                        help="verify that stdin has the given sha256 hash")
    args = parser.parse_args()
    try:
        # python3: use the underlying binary stream of stdin
        stdin = sys.stdin.buffer
    except AttributeError: # python2
        stdin = sys.stdin
    dumper = yaml.SafeDumper(sys.stdout)
    dumper.open()
    if args.hash:
        stdin = HashedStream(stdin, hashlib.sha256())
    try:
        ImportpkgExtractor(dumper.represent).process(stdin)
    except ProcessingFinished:
        # raised by handle_data_tar: the expected way for processing to end
        pass
    else:
        raise RuntimeError("unexpected termination of extractor")
    if args.hash:
        # validate before "commit" so a mismatch leaves the stream uncommitted
        stdin.validate(args.hash)
    dumper.represent("commit")
    dumper.close()
97
# Run the importer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()