importpkg: split process_package to process_control
[~helmut/debian-dedup.git] / importpkg.py
#!/usr/bin/python
"""This tool reads a debian package from stdin and emits a yaml stream on
stdout.  It does not access a database. Therefore it can be run in parallel and
on multiple machines. The generated yaml contains multiple documents. The first
document contains package metadata. Then a document is emitted for each file.
And finally a document consisting of the string "commit" is emitted."""
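
# A minimal sketch of how the emitted stream could be consumed downstream.
# This is illustrative only and not part of this tool: the file name
# "package.yaml" and the helper handle_file_record are made up. The first
# document is the metadata dict from process_control, the following documents
# are per-file hash records, and the trailing "commit" string marks a
# completely processed package.
#
#     import yaml
#     with open("package.yaml") as stream:
#         documents = yaml.safe_load_all(stream)
#         metadata = next(documents)
#         for document in documents:
#             if document == "commit":
#                 break  # the whole package was seen
#             handle_file_record(metadata, document)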

import hashlib
import sys
import tarfile
import zlib

from debian import deb822
import lzma
import yaml

from dedup.arreader import ArReader
from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
from dedup.compression import GzipDecompressor, DecompressedStream
from dedup.image import ImageHash

class MultiHash(object):
    """Feed the same data to multiple hash objects at once."""
    def __init__(self, *hashes):
        self.hashes = hashes

    def update(self, data):
        for hasher in self.hashes:
            hasher.update(data)

boring_sha512_hashes = set((
    # ""
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # "\n"
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))

def sha512_nontrivial():
    return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)

def gziphash():
    hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
    hashobj.name = "gzip_sha512"
    return HashBlacklist(hashobj, boring_sha512_hashes)

def imagehash():
    hashobj = ImageHash(hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError,))
    hashobj.name = "image_sha512"
    return hashobj

def get_hashes(tar):
    """Yield a (name, size, hashes) tuple for each regular file in the tar,
    where hashes maps hash names to non-empty hexdigests."""
    for elem in tar:
        if not elem.isreg(): # excludes hard links as well
            continue
        hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
        hasher = hash_file(hasher, tar.extractfile(elem))
        hashes = {}
        for hashobj in hasher.hashes:
            hashvalue = hashobj.hexdigest()
            if hashvalue:
                hashes[hashobj.name] = hashvalue
        yield (elem.name, elem.size, hashes)

def process_control(control_contents):
    """Parse the contents of a control file into a dict with the keys
    package, source, version, architecture and depends."""
    control = deb822.Packages(control_contents)
    package = control["package"].encode("ascii")
    try:
        source = control["source"].encode("ascii").split()[0]
    except KeyError:
        source = package
    version = control["version"].encode("ascii")
    architecture = control["architecture"].encode("ascii")

    depends = set(dep[0]["name"].encode("ascii")
                  for dep in control.relations.get("depends", ())
                  if len(dep) == 1)
    return dict(package=package, source=source, version=version,
                architecture=architecture, depends=depends)
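
# For illustration only (hypothetical field values): a control stanza such as
#
#     Package: python-yaml
#     Version: 3.10-4
#     Architecture: amd64
#     Depends: python (>= 2.6), python (<< 2.8), libyaml-0-2
#
# would result in roughly
#
#     dict(package="python-yaml", source="python-yaml", version="3.10-4",
#          architecture="amd64", depends=set(["python", "libyaml-0-2"]))
#
# Dependencies listing alternatives (e.g. "a | b") are dropped by the
# len(dep) == 1 check above.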

def process_package(filelike):
    """Parse a .deb file from the given file-like object. Yields the dict
    produced by process_control, then one dict per regular file found in
    data.tar and finally the string "commit"."""
    af = ArReader(filelike)
    af.read_magic()
    state = "start"
    while state not in ("finished", "skipped"):
        try:
            name = af.read_entry()
        except EOFError:
            if state != "finished":
                raise ValueError("data.tar not found")
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                yield process_control(tf.extractfile(elem).read())
                break
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        else:
            continue
        if state != "control_file":
            raise ValueError("missing control file")
        for name, size, hashes in get_hashes(tf):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                # warn on stderr so the yaml stream on stdout stays intact
                sys.stderr.write("warning: skipping filename with encoding error\n")
                continue # skip files with non-utf8 encoding for now
            yield dict(name=name, size=size, hashes=hashes)
        state = "finished"
        yield "commit"

def main():
    yaml.safe_dump_all(process_package(sys.stdin), sys.stdout)

if __name__ == "__main__":
    main()