support hashing gif images
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2 """This tool reads a debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import hashlib
9 import optparse
10 import sys
11 import tarfile
12 import zlib
13
14 from debian import deb822
15 import lzma
16 import yaml
17
18 from dedup.arreader import ArReader
19 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
20     HashedStream, hash_file
21 from dedup.compression import GzipDecompressor, DecompressedStream
22 from dedup.image import GIFHash, PNGHash
23
class MultiHash(object):
    """Fan one stream of data out to several hash objects.

    The wrapped objects are kept, in the order given, in ``self.hashes``.
    """

    def __init__(self, *hashes):
        """Remember the given hash objects as a tuple."""
        self.hashes = hashes

    def update(self, data):
        """Pass ``data`` to the ``update`` method of every wrapped hash."""
        for member in self.hashes:
            member.update(data)
31
# SHA-512 hex digests of trivial contents; each entry is labelled with the
# data it is the digest of.
boring_sha512_hashes = set([
    # "" (no data at all)
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce"
    "47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # "\n" (a single newline)
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc1569"
    "7b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09",
])
37
def sha512_nontrivial():
    """Return a new SHA-512 hasher wrapped in a HashBlacklist.

    The blacklist is constructed with ``boring_sha512_hashes``.
    """
    plain = hashlib.sha512()
    return HashBlacklist(plain, boring_sha512_hashes)
40
def gziphash():
    """Build the hasher named "gzip_sha512".

    A SHA-512 fed through a GzipDecompressor (via DecompressedHash) is wrapped
    in a SuppressingHash constructed with ValueError and zlib.error, given its
    name, and finally wrapped in a HashBlacklist built from
    ``boring_sha512_hashes``.
    """
    suppressed = (ValueError, zlib.error)
    inner = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    wrapped = SuppressingHash(inner, suppressed)
    wrapped.name = "gzip_sha512"
    return HashBlacklist(wrapped, boring_sha512_hashes)
46
def pnghash():
    """Build the hasher named "png_sha512".

    A PNGHash over a fresh SHA-512 is wrapped in a SuppressingHash that is
    constructed with ValueError, and the wrapper is given its name.
    """
    wrapped = SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,))
    wrapped.name = "png_sha512"
    return wrapped
52
def gifhash():
    """Build the hasher named "gif_sha512".

    A GIFHash over a fresh SHA-512 is wrapped in a SuppressingHash that is
    constructed with ValueError, and the wrapper is given its name.
    """
    wrapped = SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,))
    wrapped.name = "gif_sha512"
    return wrapped
58
def get_hashes(tar):
    """Yield ``(name, size, hashes)`` for each regular file found in ``tar``.

    ``tar`` is iterated for its members and ``tar.extractfile`` supplies the
    content of each one. ``hashes`` maps a hasher's ``name`` to its hex
    digest; hashers whose ``hexdigest()`` is falsy are left out.
    """
    for member in tar:
        if member.isreg():  # false for hard links as well
            combined = hash_file(
                MultiHash(sha512_nontrivial(), gziphash(), pnghash(),
                          gifhash()),
                tar.extractfile(member))
            found = {}
            for part in combined.hashes:
                digest = part.hexdigest()
                if not digest:
                    continue
                found[part.name] = digest
            yield (member.name, member.size, found)
72
def process_control(control_contents):
    """Extract the package metadata used by this tool from a control file.

    ``control_contents`` is handed to ``deb822.Packages``. The result is a
    dict with the keys ``package``, ``source``, ``version``, ``architecture``
    (all ASCII-encoded) and ``depends``.

    ``source`` is the first whitespace-separated word of the Source field, or
    the package name when there is no Source field. ``depends`` is the set of
    names of those Depends entries that consist of exactly one alternative.
    """
    fields = deb822.Packages(control_contents)

    def ascii_field(key):
        # Raises KeyError when the field is absent.
        return fields[key].encode("ascii")

    package = ascii_field("package")
    try:
        source = ascii_field("source").split()[0]
    except KeyError:
        source = package
    version = ascii_field("version")
    architecture = ascii_field("architecture")

    depends = set()
    for alternatives in fields.relations.get("depends", ()):
        if len(alternatives) == 1:
            depends.add(alternatives[0]["name"].encode("ascii"))

    return {
        "package": package,
        "source": source,
        "version": version,
        "architecture": architecture,
        "depends": depends,
    }
88
def process_package(filelike):
    """Read a Debian package from ``filelike`` and yield its description.

    The ar members are visited in order. The generator yields:

    * the dict returned by process_control() for the ``./control`` member of
      ``control.tar.gz``,
    * one ``dict(name=..., size=..., hashes=...)`` per entry produced by
      get_hashes() for the data tarball (``data.tar.gz``, ``data.tar.bz2``,
      ``data.tar.xz`` or ``data.tar``), skipping names that do not decode as
      UTF-8,
    * and finally the string ``"commit"``.

    Warnings about skipped names go to stderr: stdout is where the yaml
    stream is written, so printing there would corrupt the output.

    Raises ValueError when no data tarball is found, when ``control.tar.gz``
    shows up more than once, when ``./control`` occurs twice, or when the data
    tarball is reached before a control file has been seen.
    """
    af = ArReader(filelike)
    af.read_magic()
    # "start" -> "control" (control.tar.gz opened) -> "control_file"
    # (./control found and yielded).
    state = "start"
    while True:
        try:
            name = af.read_entry()
        except EOFError:
            # Ran out of ar members without ever reaching a data tarball.
            raise ValueError("data.tar not found")
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                yield process_control(tf.extractfile(elem).read())
                break
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            # Decompress through DecompressedStream and give tarfile the
            # resulting plain tar stream.
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        elif name == "data.tar":
            tf = tarfile.open(fileobj=af, mode="r|")
        else:
            # Any other ar member is ignored.
            continue
        if state != "control_file":
            raise ValueError("missing control file")
        for filename, size, hashes in get_hashes(tf):
            try:
                filename = filename.decode("utf8")
            except UnicodeDecodeError:
                # Must not go to stdout, which carries the yaml stream.
                sys.stderr.write(
                    "warning: skipping filename with encoding error\n")
                continue # skip files with non-utf8 encoding for now
            yield dict(name=filename, size=size, hashes=hashes)
        yield "commit"
        break
134
def process_package_with_hash(filelike, sha256hash):
    """Like process_package(), but verify the SHA-256 of the whole input.

    ``filelike`` is wrapped in a HashedStream. Every element is passed
    through unchanged until ``"commit"`` arrives; at that point the rest of
    the stream is read to the end and the hex digest is compared with
    ``sha256hash``. ``"commit"`` is only yielded when they are equal.

    Raises ValueError("hash sum mismatch") when the digests differ.
    """
    stream = HashedStream(filelike, hashlib.sha256())
    for item in process_package(stream):
        if item != "commit":
            yield item
            continue
        # Consume whatever is left so the digest covers the entire input.
        while stream.read(4096):
            pass
        if stream.hexdigest() != sha256hash:
            raise ValueError("hash sum mismatch")
        yield item
        return
146
def main():
    """Read a package from stdin and dump its yaml description to stdout.

    With ``-H``/``--hash`` the input is additionally checked against the
    given sha256 hex digest via process_package_with_hash(); otherwise
    process_package() is used directly.
    """
    parser = optparse.OptionParser()
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin has given sha256 hash")
    # Positional arguments are accepted by the parser but not used.
    options, _ = parser.parse_args()
    if options.hash:
        gen = process_package_with_hash(sys.stdin, options.hash)
    else:
        gen = process_package(sys.stdin)
    # The generator is consumed lazily, one yaml document per element.
    yaml.safe_dump_all(gen, sys.stdout)

if __name__ == "__main__":
    main()