webapp.py: fuse two sql queries in get_details
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python3
2 """This tool reads a Debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import argparse
9 import hashlib
10 import sys
11 import urllib.request
12 import zlib
13
14 import yaml
15
16 from dedup.debpkg import DebExtractor, get_tar_hashes
17 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
18         HashBlacklistContent
19 from dedup.compression import GzipDecompressor
20 from dedup.image import GIFHash, PNGHash
21
# Contents handed to HashBlacklistContent by the hash factories below:
# the empty file and a file holding a single newline.
boring_content = {b"", b"\n"}
23
def sha512_nontrivial():
    """Return a fresh sha512 hash object wrapped in HashBlacklistContent.

    The wrapper is configured with boring_content (presumably so that those
    trivial contents yield no hash -- confirm in dedup.hashing).
    """
    digest = hashlib.sha512()
    return HashBlacklistContent(digest, boring_content)
26
def gziphash():
    """Return a fresh hash object chain named "gzip_sha512".

    From the inside out: a sha512 object fed through a GzipDecompressor by
    DecompressedHash, a SuppressingHash given ValueError and zlib.error, and
    a HashBlacklistContent configured with boring_content.
    """
    decompressed = DecompressedHash(GzipDecompressor(), hashlib.sha512(),
                                    "gzip_sha512")
    # presumably input that is not valid gzip raises one of these -- confirm
    tolerant = SuppressingHash(decompressed, (ValueError, zlib.error))
    return HashBlacklistContent(tolerant, boring_content)
32
def pnghash():
    """Return a fresh PNGHash over sha512, wrapped in a SuppressingHash that
    is given ValueError as the exception type to suppress."""
    image_hash = PNGHash(hashlib.sha512())
    return SuppressingHash(image_hash, (ValueError,))
35
def gifhash():
    """Return a fresh GIFHash over sha512, wrapped in a SuppressingHash that
    is given ValueError as the exception type to suppress."""
    image_hash = GIFHash(hashlib.sha512())
    return SuppressingHash(image_hash, (ValueError,))
38
class ProcessingFinished(Exception):
    """Raised by ImportpkgExtractor.handle_data_tar once every entry of the
    data tar has been reported; main() treats it as regular completion."""
41
class ImportpkgExtractor(DebExtractor):
    """DebExtractor that reports package metadata and per-file hashes.

    Every piece of information is handed to the callback supplied at
    construction time as a plain dict: first one dict describing the package,
    then one dict per file of the data tar.
    """

    # Factories passed to get_tar_hashes for each run over a data tar.
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]

    def __init__(self, callback):
        """Store callback, the callable invoked with each emitted dict."""
        DebExtractor.__init__(self)
        self.callback = callback

    def handle_control_info(self, info):
        """Emit the package metadata dict built from the control data.

        The source name falls back to the package name when no "source"
        field is present. Only dependency relations consisting of a single
        alternative are recorded, with any ":qualifier" suffix removed.
        """
        try:
            # Keep only the first word of the field (presumably to drop a
            # trailing "(version)" -- confirm against the control format).
            source = info["source"].split()[0]
        except KeyError:
            source = info["package"]
        # deb822 currently returns :any dependencies raw. see #670679
        deprelations = info.relations.get("depends", []) + \
                       info.relations.get("pre-depends", [])
        depends = {dep[0]["name"].split(':', 1)[0]
                   for dep in deprelations if len(dep) == 1}
        self.callback(dict(package=info["package"], source=source,
                           version=info["version"],
                           architecture=info["architecture"], depends=depends))

    def handle_data_tar(self, tarfileobj):
        """Emit one dict (name, size, hashes) per entry of the data tar.

        Raises:
            ProcessingFinished: always, after the last entry was reported.
        """
        for name, size, hashes in get_tar_hashes(tarfileobj,
                                                 self.hash_functions):
            try:
                name.encode("utf8", "strict")
            except UnicodeEncodeError:
                # stdout carries the yaml stream written by main(), so the
                # diagnostic must go to stderr to keep that stream parseable.
                print("warning: skipping filename with encoding error",
                      file=sys.stderr)
                continue # skip files with non-utf8 encoding for now
            self.callback(dict(name=name, size=size, hashes=hashes))
        raise ProcessingFinished()
73
def main():
    """Read a Debian package and write the yaml document stream to stdout.

    The package is read from stdin unless a location is given on the command
    line, in which case it is opened with urllib.request.urlopen. With
    -H/--hash the input is wrapped in a HashedStream over sha256 and
    validated against the given value before the final "commit" document is
    written.

    Raises:
        RuntimeError: if the extractor returns without raising
            ProcessingFinished.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-H", "--hash", action="store",
                        help="verify that the input has the given sha256 hash")
    # argparse applies `type` only to string defaults, so the binary stdin
    # stream is used as is when no location is given.
    parser.add_argument("input", nargs='?', default=sys.stdin.buffer,
                        type=urllib.request.urlopen,
                        help="read from this location instead of stdin")
    args = parser.parse_args()
    dumper = yaml.CSafeDumper(sys.stdout)
    dumper.open()
    if args.hash:
        args.input = HashedStream(args.input, hashlib.sha256())
    try:
        ImportpkgExtractor(dumper.represent).process(args.input)
    except ProcessingFinished:
        # Raised by handle_data_tar after the last file: regular completion.
        pass
    else:
        raise RuntimeError("unexpected termination of extractor")
    if args.hash:
        # Runs before "commit" is written (presumably raises on a digest
        # mismatch -- confirm in dedup.hashing).
        args.input.validate(args.hash)
    dumper.represent("commit")
    dumper.close()


if __name__ == "__main__":
    main()