drop support for Python 2.x
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python3
2 """This tool reads a Debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import argparse
9 import hashlib
10 import sys
11 import urllib.request
12 import zlib
13
14 import yaml
15
16 from dedup.debpkg import DebExtractor, get_tar_hashes
17 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
18         HashBlacklistContent
19 from dedup.compression import GzipDecompressor
20 from dedup.image import GIFHash, PNGHash
21
# Contents considered too trivial to be worth hashing (empty file, lone
# newline).  These must be bytes: hashlib objects only accept bytes-like
# input, and in Python 3 a bytes value never compares equal to a str, so a
# str blacklist could never match.
# NOTE(review): assumes HashBlacklistContent compares the raw bytes it was
# fed against this collection -- confirm against dedup.hashing.
boring_content = {b"", b"\n"}
23
def sha512_nontrivial():
    """Create a sha512 hash object wrapped in HashBlacklistContent.

    The wrapper receives boring_content as its blacklist.
    """
    plain_sha512 = hashlib.sha512()
    return HashBlacklistContent(plain_sha512, boring_content)
26
def gziphash():
    """Create the hash object reported under the name "gzip_sha512".

    A sha512 is combined with a GzipDecompressor via DecompressedHash, then
    wrapped in SuppressingHash for ValueError and zlib.error (presumably so
    that input which is not valid gzip does not abort processing), and
    finally wrapped in HashBlacklistContent with boring_content.
    """
    decompressed = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    tolerant = SuppressingHash(decompressed, (ValueError, zlib.error))
    tolerant.name = "gzip_sha512"
    return HashBlacklistContent(tolerant, boring_content)
32
def pnghash():
    """Create the hash object reported under the name "png_sha512".

    A PNGHash built on sha512 is wrapped in SuppressingHash for ValueError
    (presumably raised for input that is not a usable PNG image).
    """
    wrapped = SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,))
    wrapped.name = "png_sha512"
    return wrapped
38
def gifhash():
    """Create the hash object reported under the name "gif_sha512".

    A GIFHash built on sha512 is wrapped in SuppressingHash for ValueError
    (presumably raised for input that is not a usable GIF image).
    """
    wrapped = SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,))
    wrapped.name = "gif_sha512"
    return wrapped
44
class ProcessingFinished(Exception):
    """Raised by the extractor once the data tar has been fully handled."""
47
class ImportpkgExtractor(DebExtractor):
    """Extractor that reports package metadata and per-file hashes.

    Every record is passed to the callback supplied at construction time:
    one mapping describing the package (from the control information) and
    one mapping per file found in the data tar.
    """

    # Zero-argument hash factories handed to get_tar_hashes.
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]

    def __init__(self, callback):
        """Remember callback, which is called with each emitted record."""
        DebExtractor.__init__(self)
        self.callback = callback

    def handle_control_info(self, info):
        """Emit the package metadata record.

        info must support item access for "package", "version",
        "architecture" and optionally "source", and provide a relations
        mapping.  The source name defaults to the package name when no
        "source" field is present.
        """
        try:
            # Only the first word is used; anything after it (presumably a
            # parenthesized version) is dropped.
            source = info["source"].split()[0]
        except KeyError:
            source = info["package"]
        # deb822 currently returns :any dependencies raw. see #670679
        deprelations = info.relations.get("depends", []) + \
                       info.relations.get("pre-depends", [])
        # Relations with alternatives (len != 1) are ignored; a ":qualifier"
        # suffix on the package name is stripped.
        depends = {dep[0]["name"].split(':', 1)[0]
                   for dep in deprelations if len(dep) == 1}
        self.callback({
            "package": info["package"],
            "source": source,
            "version": info["version"],
            "architecture": info["architecture"],
            "depends": depends,
        })

    def handle_data_tar(self, tarfileobj):
        """Emit one record per file of the data tar, then stop processing.

        Raises:
            ProcessingFinished: always, after all files have been reported,
                to signal regular completion to the caller.
        """
        for name, size, hashes in get_tar_hashes(tarfileobj,
                                                 self.hash_functions):
            try:
                name.encode("utf8", "strict")
            except UnicodeEncodeError:
                # The warning must not go to stdout: that stream carries the
                # generated yaml documents and would be corrupted by it.
                print("warning: skipping filename with encoding error",
                      file=sys.stderr)
                continue # skip files with non-utf8 encoding for now
            self.callback({"name": name, "size": size, "hashes": hashes})
        raise ProcessingFinished()
79
def main():
    """Read a Debian package and write the yaml stream to stdout.

    The package is read from stdin unless a location is given on the command
    line, in which case it is opened with urllib.request.urlopen.  With
    -H/--hash the sha256 of everything read is validated before the final
    "commit" document is written, so a failed validation leaves the stream
    without a commit.

    Raises:
        RuntimeError: if the extractor returns without raising
            ProcessingFinished.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-H", "--hash", action="store",
                        help="verify that stdin has the given sha256 hash")
    # argparse applies type only to string values, so the non-string default
    # is passed through unchanged rather than being handed to urlopen.
    parser.add_argument("input", nargs='?', default=sys.stdin.buffer,
                        type=urllib.request.urlopen,
                        help="read from this location instead of stdin")
    args = parser.parse_args()
    source = args.input
    try:
        dumper = yaml.SafeDumper(sys.stdout)
        dumper.open()
        stream = source
        if args.hash:
            stream = HashedStream(source, hashlib.sha256())
        try:
            ImportpkgExtractor(dumper.represent).process(stream)
        except ProcessingFinished:
            pass
        else:
            raise RuntimeError("unexpected termination of extractor")
        if args.hash:
            stream.validate(args.hash)
        dumper.represent("commit")
        dumper.close()
    finally:
        # Release a response opened by urlopen; stdin is left alone.
        if source is not sys.stdin.buffer:
            source.close()
102
# Run the importer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()