importpkg: don't blacklist boring gzip_sha512 hashes
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2 """This tool reads a debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import hashlib
9 import optparse
10 import sys
11 import tarfile
12 import zlib
13
14 from debian import deb822
15 import lzma
16 import yaml
17
18 from dedup.arreader import ArReader
19 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
20     HashedStream, hash_file
21 from dedup.compression import GzipDecompressor, DecompressedStream
22 from dedup.image import GIFHash, PNGHash
23
class MultiHash(object):
    """Fan a single stream of data out to several hash objects at once."""

    def __init__(self, *hashes):
        # Keep the individual hash objects accessible via self.hashes so
        # callers can collect each digest separately after hashing.
        self.hashes = hashes

    def update(self, data):
        """Feed the given data chunk to every underlying hash object."""
        for hashobj in self.hashes:
            hashobj.update(data)
31
# sha512 hex digests of file contents that are too trivial to be
# interesting for deduplication purposes.
boring_sha512_hashes = {
    # digest of the empty file ""
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # digest of a file containing a single newline "\n"
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09",
}
37
def sha512_nontrivial():
    """Construct a sha512 hash object whose digest is suppressed for the
    boring (trivial) contents listed in boring_sha512_hashes."""
    basehash = hashlib.sha512()
    return HashBlacklist(basehash, boring_sha512_hashes)
40
def gziphash():
    """Construct a hash object computing the sha512 of gzip-decompressed
    content.  Inputs that fail to decompress simply produce no digest
    instead of raising."""
    inner = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    # Turn decompression failures into an empty digest rather than errors.
    result = SuppressingHash(inner, (ValueError, zlib.error))
    result.name = "gzip_sha512"
    # Boring hashes are deliberately NOT blacklisted here, so that gzip
    # specific issues stay visible.
    return result
47
def pnghash():
    """Construct a hash object computing the sha512 of decoded PNG image
    data.  Non-PNG input yields no digest instead of raising."""
    inner = PNGHash(hashlib.sha512())
    result = SuppressingHash(inner, (ValueError,))
    result.name = "png_sha512"
    return result
53
def gifhash():
    """Construct a hash object computing the sha512 of decoded GIF image
    data.  Non-GIF input yields no digest instead of raising."""
    inner = GIFHash(hashlib.sha512())
    result = SuppressingHash(inner, (ValueError,))
    result.name = "gif_sha512"
    return result
59
def get_hashes(tar):
    """Hash every regular file in the given tarfile object.
    @yields: (name, size, hashes) triples where hashes maps a hash
        function name to its hex digest
    """
    for entry in tar:
        # Skip everything that is not a regular file; this excludes
        # hard links as well.
        if not entry.isreg():
            continue
        multihash = MultiHash(sha512_nontrivial(), gziphash(), pnghash(),
                              gifhash())
        multihash = hash_file(multihash, tar.extractfile(entry))
        digests = {}
        for hashobj in multihash.hashes:
            digest = hashobj.hexdigest()
            # Suppressed or blacklisted hashes report an empty digest
            # and are omitted from the result.
            if digest:
                digests[hashobj.name] = digest
        yield (entry.name, entry.size, digests)
73
def process_control(control_contents):
    """Extract metadata from a binary package control file.
    @param control_contents: the raw contents of the control file
    @returns: a dict with keys package, source, version, architecture
        and depends (a set of simple dependency names)
    """
    control = deb822.Packages(control_contents)
    package = control["package"].encode("ascii")
    version = control["version"].encode("ascii")
    architecture = control["architecture"].encode("ascii")
    try:
        # The Source field may carry a version in parentheses; keep only
        # the source package name.
        source = control["source"].encode("ascii").split()[0]
    except KeyError:
        # Without an explicit Source field the source name equals the
        # binary package name.
        source = package

    # Only record simple dependencies, i.e. those without alternatives.
    depends = set(rel[0]["name"].encode("ascii")
                  for rel in control.relations.get("depends", ())
                  if len(rel) == 1)
    return dict(package=package, source=source, version=version,
                architecture=architecture, depends=depends)
89
def process_package(filelike):
    """Parse a debian binary package (an ar archive) from the given
    file-like object and generate yaml-serializable documents: first the
    control metadata dict, then one dict per file in the data tarball,
    and finally the string "commit".
    @param filelike: stream containing the .deb; read strictly sequentially
    @raises ValueError: on malformed packages (missing or duplicated
        members, missing control file, or no data.tar at all)
    """
    af = ArReader(filelike)
    af.read_magic()
    # Simple state machine: start -> control -> control_file. It ensures
    # control.tar.gz is seen exactly once and before any data.tar member.
    state = "start"
    while True:
        try:
            name = af.read_entry()
        except EOFError:
            # Ran out of ar members without finding a data.tar variant.
            raise ValueError("data.tar not found")
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            # "r|gz" opens the tar in streaming mode, required because af
            # only supports sequential reads.
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                yield process_control(tf.extractfile(elem).read())
                break
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            # tarfile cannot stream xz here, so decompress explicitly and
            # hand it the plain tar stream.
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        elif name == "data.tar":
            tf = tarfile.open(fileobj=af, mode="r|")
        else:
            # Ignore unrelated ar members (e.g. debian-binary).
            continue
        if state != "control_file":
            raise ValueError("missing control file")
        for name, size, hashes in get_hashes(tf):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                print("warning: skipping filename with encoding error")
                continue # skip files with non-utf8 encoding for now
            yield dict(name=name, size=size, hashes=hashes)
        yield "commit"
        break
135
def process_package_with_hash(filelike, sha256hash):
    """Like process_package, but additionally verify that the whole input
    stream has the given sha256 hex digest.
    @raises ValueError: if the digest of the input does not match
    """
    hstream = HashedStream(filelike, hashlib.sha256())
    for document in process_package(hstream):
        if document != "commit":
            yield document
            continue
        # Drain any trailing input so the digest covers the entire file,
        # then verify it before committing.
        while hstream.read(4096):
            pass
        if hstream.hexdigest() != sha256hash:
            raise ValueError("hash sum mismatch")
        yield document
        break
147
def main():
    """Command line entry point: read a .deb from stdin and write the
    yaml document stream to stdout.  With -H/--hash, additionally verify
    the sha256 digest of the input."""
    parser = optparse.OptionParser()
    parser.add_option("-H", "--hash", action="store",
                      # fixed typo: "stdin hash given" -> "stdin has the given"
                      help="verify that stdin has the given sha256 hash")
    options, args = parser.parse_args()
    if options.hash:
        gen = process_package_with_hash(sys.stdin, options.hash)
    else:
        gen = process_package(sys.stdin)
    # Emit every generated document as part of a single yaml stream.
    yaml.safe_dump_all(gen, sys.stdout)
158
# Allow use both as a standalone script and as an importable module.
if __name__ == "__main__":
    main()