2f38f5cfc63e46a3fdba3704e1634b0a9dd767dc
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2 """This tool reads a debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import hashlib
9 import optparse
10 import sys
11 import tarfile
12 import zlib
13
14 from debian import deb822
15 import lzma
16 import yaml
17
18 from dedup.arreader import ArReader
19 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
20     HashedStream, hash_file
21 from dedup.compression import GzipDecompressor, DecompressedStream
22 from dedup.image import ImageHash
23
class MultiHash(object):
    """Fan a single stream of data out to several hash-like objects.

    Each object passed to the constructor must provide an update(data)
    method; the originals remain accessible via the .hashes attribute.
    """

    def __init__(self, *hashes):
        # Keep the hashers in the order given so callers can match
        # results back to the objects they passed in.
        self.hashes = hashes

    def update(self, data):
        """Feed the given data chunk to every contained hash object."""
        for hashobj in self.hashes:
            hashobj.update(data)
31
# sha512 hex digests of contents considered uninteresting for
# deduplication: the empty file and a file containing just a newline.
# Computed at import time rather than spelled out as literals.
boring_sha512_hashes = set(
    hashlib.sha512(content).hexdigest()
    for content in (b"", b"\n"))
37
def sha512_nontrivial():
    """Return a sha512 hasher that reports nothing for boring contents
    (empty file, single newline)."""
    hasher = hashlib.sha512()
    return HashBlacklist(hasher, boring_sha512_hashes)
40
def gziphash():
    """Return a hasher computing the sha512 of gzip-decompressed content.

    Streams that fail to decompress yield no digest (errors are
    suppressed), and boring digests are blacklisted as for plain sha512.
    """
    inner = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    guarded = SuppressingHash(inner, (ValueError, zlib.error))
    guarded.name = "gzip_sha512"
    return HashBlacklist(guarded, boring_sha512_hashes)
46
def imagehash():
    """Return a hasher computing the sha512 of normalized image data.

    Inputs that are not valid images yield no digest (ValueError is
    suppressed).
    """
    guarded = SuppressingHash(ImageHash(hashlib.sha512()), (ValueError,))
    guarded.name = "image_sha512"
    return guarded
52
def get_hashes(tar):
    """Hash every regular file contained in the given tarfile object.

    :param tar: a tarfile.TarFile opened for reading
    :returns: an iterator of (name, size, hashes) tuples where hashes
        maps hash names to non-empty hexdigest strings
    """
    for member in tar:
        # Only regular files carry content; this also skips hard links.
        if not member.isreg():
            continue
        multihash = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
        multihash = hash_file(multihash, tar.extractfile(member))
        digests = {}
        for hashobj in multihash.hashes:
            # hexdigest() may be falsy (blacklisted/suppressed); call it
            # exactly once per hasher and keep only real values.
            value = hashobj.hexdigest()
            if value:
                digests[hashobj.name] = value
        yield (member.name, member.size, digests)
65
def process_control(control_contents):
    """Parse a debian control file into a package metadata dictionary.

    :param control_contents: raw contents of the ./control member
    :returns: dict with keys package, source, version, architecture
        (ascii byte strings) and depends (a set of dependency names)
    """
    control = deb822.Packages(control_contents)

    def ascii_field(key):
        # Control fields relevant here are plain ascii by policy.
        return control[key].encode("ascii")

    package = ascii_field("package")
    try:
        # The Source field may include a version ("src (1.0)");
        # keep only the source package name.
        source = ascii_field("source").split()[0]
    except KeyError:
        # No Source field: source package shares the binary name.
        source = package
    version = ascii_field("version")
    architecture = ascii_field("architecture")

    depends = set()
    for dep in control.relations.get("depends", ()):
        # Only simple dependencies; alternatives ("a | b") are skipped.
        if len(dep) == 1:
            depends.add(dep[0]["name"].encode("ascii"))

    return dict(package=package, source=source, version=version,
                architecture=architecture, depends=depends)
81
def process_package(filelike):
    """Parse a debian binary package (.deb ar archive) from a stream.

    Generator yielding, in order: one metadata dict (see
    process_control), one dict per regular file in data.tar (keys
    name, size, hashes), and finally the string "commit".

    :param filelike: forward-only stream containing the .deb contents
    :raises ValueError: on malformed input (data.tar missing, control
        archive out of order, or control file missing/duplicated)
    """
    af = ArReader(filelike)
    af.read_magic()
    # Simple state machine enforcing member order:
    # "start" -> "control" (control.tar.gz seen) -> "control_file"
    # (./control parsed); data.tar is only accepted in the last state.
    state = "start"
    while True:
        try:
            name = af.read_entry()
        except EOFError:
            # Ran off the end of the ar archive without seeing data.tar.
            raise ValueError("data.tar not found")
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            # Streaming mode ("r|gz"): af is not seekable.
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                yield process_control(tf.extractfile(elem).read())
                break
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            # tarfile cannot stream xz here; decompress manually and
            # hand tarfile the already-decompressed stream.
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        else:
            # Skip unrelated ar members (e.g. debian-binary).
            continue
        # Reaching here means we found a data.tar variant.
        if state != "control_file":
            raise ValueError("missing control file")
        for name, size, hashes in get_hashes(tf):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                print("warning: skipping filename with encoding error")
                continue # skip files with non-utf8 encoding for now
            yield dict(name=name, size=size, hashes=hashes)
        yield "commit"
        break
125
def process_package_with_hash(filelike, sha256hash):
    """Like process_package, but verify the input's sha256 digest.

    The trailing "commit" document is only emitted when the complete
    input stream matches the given hex digest.

    :raises ValueError: if the digest does not match sha256hash
    """
    hstream = HashedStream(filelike, hashlib.sha256())
    for document in process_package(hstream):
        if document != "commit":
            yield document
            continue
        # Drain any remaining bytes so the digest covers the whole input.
        while hstream.read(4096):
            pass
        if hstream.hexdigest() != sha256hash:
            raise ValueError("hash sum mismatch")
        yield document
        break
137
def main():
    """Entry point: read a .deb from stdin, dump yaml documents to stdout.

    With -H/--hash, additionally verify that stdin matches the given
    sha256 hex digest before the final "commit" document is emitted.
    """
    parser = optparse.OptionParser()
    # Fixed garbled help text ("stdin hash given sha256 hash").
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin has the given sha256 hash")
    options, args = parser.parse_args()
    if options.hash:
        gen = process_package_with_hash(sys.stdin, options.hash)
    else:
        gen = process_package(sys.stdin)
    yaml.safe_dump_all(gen, sys.stdout)
148
# Allow use both as a script and as an importable module.
if __name__ == "__main__":
    main()