use urlopen from urllib2 on py2
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2 """This tool reads a Debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import argparse
9 import hashlib
10 import sys
11 import zlib
12 try:
13     from urllib.request import urlopen
14 except ImportError:
15     from urllib2 import urlopen
16
17 import yaml
18
19 from dedup.debpkg import DebExtractor, decodetarname, get_tar_hashes
20 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
21         HashBlacklistContent
22 from dedup.compression import GzipDecompressor
23 from dedup.image import GIFHash, PNGHash
24
# Contents handed as blacklist to HashBlacklistContent by the hash factories
# below: the empty string and a lone newline.
# NOTE(review): these are native str literals; on Python 3 hashed data is
# usually bytes, so these entries may never match there -- confirm against
# dedup.hashing.HashBlacklistContent.
boring_content = {"", "\n"}
26
def sha512_nontrivial():
    """Return a new sha512 hash object wrapped in HashBlacklistContent.

    boring_content is passed as the blacklist; presumably the wrapper
    withholds a hash for exactly those contents (confirm in dedup.hashing).
    """
    digest = hashlib.sha512()
    return HashBlacklistContent(digest, boring_content)
29
def gziphash():
    """Return a new hash object computing sha512 through a GzipDecompressor.

    The DecompressedHash is wrapped in a SuppressingHash for ValueError and
    zlib.error (presumably so that data which is not valid gzip does not
    abort the import -- confirm in dedup.hashing), labelled "gzip_sha512"
    and finally wrapped in HashBlacklistContent with boring_content.
    """
    decompressed = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    tolerant = SuppressingHash(decompressed, (ValueError, zlib.error))
    tolerant.name = "gzip_sha512"
    return HashBlacklistContent(tolerant, boring_content)
35
def pnghash():
    """Return a new PNGHash over sha512, labelled "png_sha512".

    The PNGHash is wrapped in a SuppressingHash for ValueError (presumably
    raised for data that is not a valid PNG -- confirm in dedup.image).
    """
    tolerant = SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,))
    tolerant.name = "png_sha512"
    return tolerant
41
def gifhash():
    """Return a new GIFHash over sha512, labelled "gif_sha512".

    The GIFHash is wrapped in a SuppressingHash for ValueError (presumably
    raised for data that is not a valid GIF -- confirm in dedup.image).
    """
    tolerant = SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,))
    tolerant.name = "gif_sha512"
    return tolerant
47
class ProcessingFinished(Exception):
    """Raised by ImportpkgExtractor.handle_data_tar once the data tar has
    been handled; main() treats it as the regular way for extraction to end."""
50
class ImportpkgExtractor(DebExtractor):
    """DebExtractor that reports package metadata and file hashes via a
    callback.

    The callback receives one dict built from the control information and one
    dict per entry yielded by get_tar_hashes for the data tar.
    """

    # Factories passed to get_tar_hashes for every data tar.
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]

    def __init__(self, callback):
        """Store callback, the callable that receives every produced dict."""
        DebExtractor.__init__(self)
        self.callback = callback

    def handle_control_info(self, info):
        """Pass package, source, version, architecture and depends of the
        control information to the callback.

        info is indexed by field name and must offer a relations mapping
        (presumably a deb822 package paragraph -- confirm in dedup.debpkg).
        """
        try:
            # Only the first whitespace-separated token of the field is kept.
            source = info["source"].split()[0]
        except KeyError:
            # No source field: fall back to the binary package name.
            source = info["package"]
        # deb822 currently returns :any dependencies raw. see #670679
        deprelations = info.relations.get("depends", []) + \
                       info.relations.get("pre-depends", [])
        # Only relations with exactly one entry are recorded; anything after
        # the first colon (e.g. ":any") is cut off the name.
        depends = set(dep[0]["name"].split(u':', 1)[0]
                      for dep in deprelations if len(dep) == 1)
        self.callback(dict(package=info["package"], source=source,
                           version=info["version"],
                           architecture=info["architecture"], depends=depends))

    def handle_data_tar(self, tarfileobj):
        """Pass name, size and hashes of each data tar entry to the callback.

        Entries whose name cannot be decoded are skipped with a warning on
        stderr.

        Raises ProcessingFinished after the last entry; this method never
        returns normally.
        """
        for name, size, hashes in get_tar_hashes(tarfileobj,
                                                 self.hash_functions):
            try:
                name = decodetarname(name)
            except UnicodeDecodeError:
                # stdout carries the yaml stream, so the warning has to go to
                # stderr to keep that stream parseable.
                sys.stderr.write(
                    "warning: skipping filename with encoding error\n")
                continue # skip files with non-utf8 encoding for now
            self.callback(dict(name=name, size=size, hashes=hashes))
        raise ProcessingFinished()
82
def main():
    """Read a Debian package and write the resulting yaml stream to stdout.

    The package is read from stdin unless a location is given as positional
    argument, in which case argparse opens it with urlopen. With -H/--hash
    the input is wrapped in a HashedStream and its validate method is called
    with the given value after extraction. The final "commit" document is
    only written when extraction and the optional validation raised nothing.

    Raises RuntimeError if the extractor returns without raising
    ProcessingFinished.
    """
    try:
        stdin = sys.stdin.buffer
    except AttributeError: # python2
        stdin = sys.stdin
    parser = argparse.ArgumentParser()
    parser.add_argument("-H", "--hash", action="store",
                        help="verify that the input has the given sha256 hash")
    # argparse applies type= only to string values, so the stdin default is
    # used as is while a location from the command line goes through urlopen.
    parser.add_argument("input", nargs='?', default=stdin, type=urlopen,
                        help="read from this location instead of stdin")
    args = parser.parse_args()
    dumper = yaml.SafeDumper(sys.stdout)
    dumper.open()
    if args.hash:
        args.input = HashedStream(args.input, hashlib.sha256())
    try:
        ImportpkgExtractor(dumper.represent).process(args.input)
    except ProcessingFinished:
        # Expected: handle_data_tar raises this after the last entry.
        pass
    else:
        raise RuntimeError("unexpected termination of extractor")
    if args.hash:
        args.input.validate(args.hash)
    dumper.represent("commit")
    dumper.close()
108
# Run the importer only when this file is executed as a script.
if __name__ == "__main__":
    main()