split the import phase to a yaml stream
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2 """This tool reads a debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import hashlib
9 import sys
10 import tarfile
11 import zlib
12
13 from debian import deb822
14 import lzma
15 import yaml
16
17 from dedup.arreader import ArReader
18 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
19 from dedup.compression import GzipDecompressor, DecompressedStream
20 from dedup.image import ImageHash
21
class MultiHash(object):
    """Fan a stream of update() calls out to several hash objects at once.

    The wrapped objects stay accessible via the .hashes attribute so that
    callers can collect each individual digest afterwards.
    """
    def __init__(self, *hashes):
        # Store the wrapped hash objects in the order given.
        self.hashes = hashes

    def update(self, data):
        """Feed data to every wrapped hash object."""
        for hashobj in self.hashes:
            hashobj.update(data)
# sha512 hexdigests of file contents that carry no useful dedup signal;
# HashBlacklist consults this set to suppress reporting them.
boring_sha512_hashes = set((
    # ""
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # "\n"
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))
35
def sha512_nontrivial():
    """Return a sha512 hasher that reports no digest for boring contents."""
    hasher = hashlib.sha512()
    return HashBlacklist(hasher, boring_sha512_hashes)
38
def gziphash():
    """Return a hasher over the decompressed content of gzip data.

    Decompression failures are suppressed so that non-gzip input simply
    produces no digest; boring digests are blacklisted as well.
    """
    inner = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    guarded = SuppressingHash(inner, (ValueError, zlib.error))
    guarded.name = "gzip_sha512"
    return HashBlacklist(guarded, boring_sha512_hashes)
44
def imagehash():
    """Return a hasher for image contents; non-images yield no digest.

    ValueError raised by the image decoder is suppressed, so the hasher
    silently reports nothing for files it cannot handle.
    """
    guarded = SuppressingHash(ImageHash(hashlib.sha512()), (ValueError,))
    guarded.name = "image_sha512"
    return guarded
50
def get_hashes(tar):
    """Yield (name, size, digests) for each regular member of the tar.

    digests maps hasher name to hexdigest; hashers that report a falsy
    digest (suppressed error or blacklisted content) are left out.
    Non-regular members, including hard links, are skipped.
    """
    for entry in tar:
        # isreg() is False for hard links, so those are excluded too.
        if not entry.isreg():
            continue
        multihash = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
        multihash = hash_file(multihash, tar.extractfile(entry))
        digests = {}
        # hexdigest() may be stateful on these wrappers, so call it once
        # per hasher and keep only truthy results.
        for hashobj in multihash.hashes:
            digest = hashobj.hexdigest()
            if digest:
                digests[hashobj.name] = digest
        yield (entry.name, entry.size, digests)
63
def process_package(filelike):
    """Parse a Debian binary package (an ar archive) read from filelike.

    Generator yielding, in order:
      1. one dict of package metadata (package, source, version,
         architecture, depends) extracted from ./control in control.tar.gz,
      2. one dict per regular file in the data tar (name, size, hashes),
      3. the literal string "commit" as an end-of-package marker.

    Raises ValueError when archive members are missing or out of order.
    """
    af = ArReader(filelike)
    af.read_magic()
    # State machine over the ar members, in the order a .deb requires:
    #   start -> control (tar seen) -> control_file (./control parsed)
    #   -> finished (data tar processed)
    state = "start"
    while state not in ("finished", "skipped"):
        try:
            name = af.read_entry()
        except EOFError:
            # NOTE(review): the loop condition guarantees state is never
            # "finished" here, so this guard always fires and the branch
            # below it is effectively dead -- confirm intent upstream.
            if state != "finished":
                raise ValueError("data.tar not found")
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            # Streaming ("r|gz") mode: members must be consumed in order.
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                control = tf.extractfile(elem).read()
                control = deb822.Packages(control)
                # Python 2: encode unicode field values to ascii bytes.
                package = control["package"].encode("ascii")
                try:
                    # Source field may carry a version in parentheses;
                    # keep only the source package name.
                    source = control["source"].encode("ascii").split()[0]
                except KeyError:
                    # No Source field: source package equals binary package.
                    source = package
                version = control["version"].encode("ascii")
                architecture = control["architecture"].encode("ascii")

                # Keep only unconditional dependencies (no alternatives).
                depends = control.relations.get("depends", [])
                depends = set(dep[0]["name"].encode("ascii")
                              for dep in depends if len(dep) == 1)
                yield dict(package=package, source=source, version=version,
                           architecture=architecture, depends=depends)
                break
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            # tarfile (this version) lacks xz support in stream mode;
            # decompress manually and read the result as a plain tar.
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        else:
            # Ignore other ar members (e.g. debian-binary).
            continue
        # Reaching here means a data tar was found; the control file
        # must already have been seen.
        if state != "control_file":
            raise ValueError("missing control file")
        for name, size, hashes in get_hashes(tf):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                print("warning: skipping filename with encoding error")
                continue # skip files with non-utf8 encoding for now
            yield dict(name=name, size=size, hashes=hashes)
        state = "finished"
        yield "commit"
122
def main():
    """Read a .deb from stdin and stream its yaml documents to stdout."""
    documents = process_package(sys.stdin)
    yaml.safe_dump_all(documents, sys.stdout)

if __name__ == "__main__":
    main()