importpkg: simplify state logic
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2 """This tool reads a debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import hashlib
9 import sys
10 import tarfile
11 import zlib
12
13 from debian import deb822
14 import lzma
15 import yaml
16
17 from dedup.arreader import ArReader
18 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
19 from dedup.compression import GzipDecompressor, DecompressedStream
20 from dedup.image import ImageHash
21
class MultiHash(object):
    """Fan a single stream of data out to several hash objects.

    The wrapped objects are kept, in the order given, in the ``hashes``
    attribute so that callers can query each of them afterwards.
    """

    def __init__(self, *hashes):
        """Remember the given hash objects as a tuple in ``self.hashes``."""
        self.hashes = hashes

    def update(self, data):
        """Pass data on to the update method of every wrapped hash object."""
        for member in self.hashes:
            member.update(data)
29
# sha512 hex digests of contents that are too trivial to be worth recording.
# Each entry is annotated with the content it is the digest of.
boring_sha512_hashes = {
    # the empty string ""
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # a single newline "\n"
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09",
}
35
def sha512_nontrivial():
    """Return a fresh sha512 hash object wrapped in a HashBlacklist.

    The blacklist is constructed with boring_sha512_hashes; presumably it
    withholds those digests from the result (see dedup.hashing to confirm).
    """
    plain_sha512 = hashlib.sha512()
    return HashBlacklist(plain_sha512, boring_sha512_hashes)
38
def gziphash():
    """Return the hash object that is reported as "gzip_sha512".

    A sha512 hash is combined with a GzipDecompressor in a DecompressedHash.
    That object is wrapped in a SuppressingHash constructed with ValueError
    and zlib.error, and finally in a HashBlacklist constructed with
    boring_sha512_hashes. The name is attached to the SuppressingHash layer.
    """
    decompressed = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    tolerant = SuppressingHash(decompressed, (ValueError, zlib.error))
    tolerant.name = "gzip_sha512"
    return HashBlacklist(tolerant, boring_sha512_hashes)
44
def imagehash():
    """Return the hash object that is reported as "image_sha512".

    An ImageHash built on sha512 is wrapped in a SuppressingHash constructed
    with ValueError. Unlike the other hash constructors in this file, no
    HashBlacklist is applied here.
    """
    image_sha512 = ImageHash(hashlib.sha512())
    tolerant = SuppressingHash(image_sha512, (ValueError,))
    tolerant.name = "image_sha512"
    return tolerant
50
def get_hashes(tar):
    """Generate a (name, size, hashes) triple for each regular file in tar.

    tar is iterated for its members and must provide extractfile for them.
    hashes maps the name attribute of each hash object to its hexdigest;
    hash objects whose hexdigest is false (e.g. None or empty) are left out.
    """
    for member in tar:
        if member.isreg():  # false for hard links as well
            multi = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
            multi = hash_file(multi, tar.extractfile(member))
            # Compute each digest once, then keep only the non-empty ones.
            digests = ((hashobj, hashobj.hexdigest())
                       for hashobj in multi.hashes)
            hashes = {hashobj.name: value
                      for hashobj, value in digests if value}
            yield (member.name, member.size, hashes)
63
def process_control(control_contents):
    """Turn the contents of a control file into a dict of package metadata.

    control_contents is passed to deb822.Packages as is. The returned dict
    has the keys "package", "source", "version", "architecture" and
    "depends". All field values are encoded as ascii. "source" is the first
    whitespace separated word of the Source field and falls back to the
    package name when that field is absent. "depends" is a set with the names
    of those Depends relations that consist of exactly one element.

    A missing Package, Version or Architecture field raises KeyError.
    """
    control = deb822.Packages(control_contents)

    def ascii_field(key):
        return control[key].encode("ascii")

    package = ascii_field("package")
    try:
        source_field = ascii_field("source")
    except KeyError:
        source = package
    else:
        # Only the first word is wanted; anything after it is dropped.
        source = source_field.split()[0]
    version = ascii_field("version")
    architecture = ascii_field("architecture")

    depends = set()
    for relation in control.relations.get("depends", ()):
        if len(relation) != 1:
            continue
        depends.add(relation[0]["name"].encode("ascii"))

    return {
        "package": package,
        "source": source,
        "version": version,
        "architecture": architecture,
        "depends": depends,
    }
79
def process_package(filelike):
    """Generate the documents describing the package read from filelike.

    The archive members are visited one after another using ArReader. The
    following items are yielded in this order:

     * the dict returned by process_control for the ./control member of
       control.tar.gz,
     * a dict with the keys "name", "size" and "hashes" for every entry
       reported by get_hashes for data.tar.gz, data.tar.bz2 or data.tar.xz,
     * the string "commit".

    Archive members with any other name are skipped. Entries whose name
    cannot be decoded as utf8 are skipped with a warning on stderr.

    Raises ValueError if control.tar.gz is encountered anywhere but before
    all other recognized members, if a data.tar member is encountered before
    a ./control file was processed, or if the archive ends without a
    data.tar member.
    """
    af = ArReader(filelike)
    af.read_magic()
    # "start" -> "control" (control.tar.gz seen) -> "control_file" (./control
    # processed). The data.tar member is only accepted in the last state.
    state = "start"
    while True:
        try:
            name = af.read_entry()
        except EOFError:
            raise ValueError("data.tar not found")
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                state = "control_file"
                yield process_control(tf.extractfile(elem).read())
                # Stop at the first ./control, so a second one is never seen.
                break
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        else:
            continue
        if state != "control_file":
            raise ValueError("missing control file")
        for name, size, hashes in get_hashes(tf):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                # main() dumps the yaml stream to stdout, so the warning has
                # to go to stderr to keep that stream parseable.
                sys.stderr.write(
                    "warning: skipping filename with encoding error\n")
                continue # skip files with non-utf8 encoding for now
            yield dict(name=name, size=size, hashes=hashes)
        yield "commit"
        break
123
def main():
    """Read a package from stdin and dump its documents as yaml to stdout."""
    documents = process_package(sys.stdin)
    yaml.safe_dump_all(documents, sys.stdout)

# Only run the conversion when executed as a script, not when imported.
if __name__ == "__main__":
    main()