92c474ea960cc13fcc022decd032613b0ad7fa1d
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2 """This tool reads a Debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import argparse
9 import hashlib
10 import sys
11 import zlib
12 try:
13     from urllib.request import urlopen
14 except ImportError:
15     from urllib import urlopen
16
17 import yaml
18
19 from dedup.debpkg import DebExtractor, decodetarname, get_tar_hashes, \
20         process_control
21 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
22         HashBlacklistContent
23 from dedup.compression import GzipDecompressor
24 from dedup.image import GIFHash, PNGHash
25
# Contents handed to HashBlacklistContent as its blacklist (the empty file and
# a lone newline). These are bytes literals because hashlib objects are fed
# bytes on Python 3, where a str literal never compares equal to bytes; on
# Python 2 b"" is the same as "", so nothing changes there.
# NOTE(review): presumably HashBlacklistContent compares the raw hashed data
# against this set -- confirm in dedup.hashing.
boring_content = set((b"", b"\n"))
27
def sha512_nontrivial():
    """Return a new sha512 hash wrapped in HashBlacklistContent.

    The wrapper is given boring_content as its blacklist.
    """
    digest = hashlib.sha512()
    return HashBlacklistContent(digest, boring_content)
30
def gziphash():
    """Return a new hash object named "gzip_sha512".

    A sha512 hash is combined with a GzipDecompressor in a DecompressedHash,
    wrapped in a SuppressingHash for ValueError and zlib.error (presumably so
    that input which is not valid gzip produces no hash instead of an error --
    confirm in dedup.hashing), and finally wrapped in HashBlacklistContent
    with boring_content as the blacklist.
    """
    decompressed = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    suppressed = SuppressingHash(decompressed, (ValueError, zlib.error))
    suppressed.name = "gzip_sha512"
    return HashBlacklistContent(suppressed, boring_content)
36
def pnghash():
    """Return a new hash object named "png_sha512".

    A PNGHash built on sha512 is wrapped in a SuppressingHash for ValueError
    (presumably raised for input that is not a valid PNG -- confirm in
    dedup.image).
    """
    suppressed = SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,))
    suppressed.name = "png_sha512"
    return suppressed
42
def gifhash():
    """Return a new hash object named "gif_sha512".

    A GIFHash built on sha512 is wrapped in a SuppressingHash for ValueError
    (presumably raised for input that is not a valid GIF -- confirm in
    dedup.image).
    """
    suppressed = SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,))
    suppressed.name = "gif_sha512"
    return suppressed
48
class ProcessingFinished(Exception):
    """Signal that the data tar has been handled completely.

    ImportpkgExtractor.handle_data_tar raises it after its loop and main()
    catches it as the expected way for processing to end.
    """
51
class ImportpkgExtractor(DebExtractor):
    """DebExtractor that reports package metadata and per-file hashes.

    Every result is handed to the callback given at construction time: the
    value returned by process_control() for the control file, and one dict
    with the keys "name", "size" and "hashes" for each entry yielded by
    get_tar_hashes() on the data tar.
    """

    # Zero-argument functions, each returning a new hash object; the list is
    # passed to get_tar_hashes() in handle_data_tar.
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]

    def __init__(self, callback):
        """Store callback, which is called with one argument per result."""
        DebExtractor.__init__(self)
        self.callback = callback

    def handle_control_tar(self, tarfileobj):
        """Pass the processed control file from tarfileobj to the callback.

        Only the first member named "./control" or "control" is used.
        Raises ValueError if no such member exists.
        """
        for elem in tarfileobj:
            if elem.name not in ("./control", "control"):
                continue
            self.callback(process_control(tarfileobj.extractfile(elem).read()))
            return
        raise ValueError("missing control file")

    def handle_data_tar(self, tarfileobj):
        """Pass one dict per entry of get_tar_hashes(tarfileobj) to the
        callback.

        Entries whose name makes decodetarname() raise UnicodeDecodeError are
        skipped with a warning on stderr. Always ends by raising
        ProcessingFinished.
        """
        for name, size, hashes in get_tar_hashes(tarfileobj,
                                                 self.hash_functions):
            try:
                name = decodetarname(name)
            except UnicodeDecodeError:
                # main() points the callback at a yaml dumper writing to
                # sys.stdout, so the warning must go to stderr to keep the
                # yaml stream intact. write() is used because the print
                # function with file= is not available on Python 2 here.
                sys.stderr.write(
                    "warning: skipping filename with encoding error\n")
                continue # skip files with non-utf8 encoding for now
            self.callback(dict(name=name, size=size, hashes=hashes))
        raise ProcessingFinished()
77
def main():
    """Process one Debian package and write the yaml stream to stdout.

    The package is read from stdin, or from the location given as the
    optional positional argument (opened with urlopen). With -H/--hash the
    input is wrapped in a HashedStream using sha256 and validate() is called
    with the given value after extraction (presumably raising on a mismatch
    -- confirm in dedup.hashing). The final yaml document is the string
    "commit".

    Raises RuntimeError if the extractor returns without raising
    ProcessingFinished.
    """
    try:
        stdin = sys.stdin.buffer
    except AttributeError: # python2
        stdin = sys.stdin
    parser = argparse.ArgumentParser()
    parser.add_argument("-H", "--hash", action="store",
                        help="verify that the input has the given sha256 hash")
    # argparse applies type= only to string values, so the stdin default is
    # used as is and only an explicitly given location goes through urlopen.
    parser.add_argument("input", nargs='?', default=stdin, type=urlopen,
                        help="read from this location instead of stdin")
    args = parser.parse_args()
    dumper = yaml.SafeDumper(sys.stdout)
    dumper.open()
    if args.hash:
        args.input = HashedStream(args.input, hashlib.sha256())
    try:
        ImportpkgExtractor(dumper.represent).process(args.input)
    except ProcessingFinished:
        # Expected: handle_data_tar raises this after the last file.
        pass
    else:
        raise RuntimeError("unexpected termination of extractor")
    if args.hash:
        args.input.validate(args.hash)
    # "commit" is only emitted once extraction (and validation, if requested)
    # went through without an exception.
    dumper.represent("commit")
    dumper.close()
103
# Run the importer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()