importpkg.py: support uncompressed data.tar
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2 """This tool reads a debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import hashlib
9 import optparse
10 import sys
11 import tarfile
12 import zlib
13
14 from debian import deb822
15 import lzma
16 import yaml
17
18 from dedup.arreader import ArReader
19 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
20     HashedStream, hash_file
21 from dedup.compression import GzipDecompressor, DecompressedStream
22 from dedup.image import ImageHash
23
class MultiHash(object):
    """Fan a single data stream out to several hash-like objects.

    Each constructor argument must provide an update() method; the
    individual hashers remain accessible through the hashes attribute.
    """

    def __init__(self, *hashes):
        self.hashes = hashes

    def update(self, data):
        """Feed data to every contained hash object."""
        for hashobj in self.hashes:
            hashobj.update(data)
31
# sha512 digests of file contents that carry no information worth
# recording: the empty file and a file containing a single newline.
boring_sha512_hashes = {
    # ""
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # "\n"
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09",
}
37
def sha512_nontrivial():
    """Return a sha512 hasher whose hexdigest() hides boring contents.

    Digests listed in boring_sha512_hashes are blacklisted, so trivial
    files (empty or a lone newline) yield no digest.
    """
    hashobj = hashlib.sha512()
    return HashBlacklist(hashobj, boring_sha512_hashes)
40
def gziphash():
    """Return a hasher computing the sha512 of gzip-decompressed input.

    Inputs that fail to decompress (ValueError, zlib.error) are
    suppressed rather than raised, and boring digests are blacklisted,
    so hexdigest() may yield nothing. The hasher is named gzip_sha512.
    """
    inner = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    inner = SuppressingHash(inner, (ValueError, zlib.error))
    inner.name = "gzip_sha512"
    return HashBlacklist(inner, boring_sha512_hashes)
46
def imagehash():
    """Return a hasher computing the sha512 of normalized image data.

    Inputs that ImageHash rejects (ValueError) are suppressed rather
    than raised. The hasher is named image_sha512.
    """
    wrapped = SuppressingHash(ImageHash(hashlib.sha512()), (ValueError,))
    wrapped.name = "image_sha512"
    return wrapped
52
def get_hashes(tar):
    """Hash the contents of every regular member of a tar archive.

    :param tar: an open tarfile.TarFile being read sequentially
    :yields: (name, size, hashes) triples, where hashes maps hasher
        names to hex digests; hashers that produced no digest (boring
        or unparsable content) are left out
    """
    for member in tar:
        # isreg() also excludes hard links, which carry no content
        if not member.isreg():
            continue
        multihash = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
        multihash = hash_file(multihash, tar.extractfile(member))
        digests = {}
        for hashobj in multihash.hashes:
            value = hashobj.hexdigest()
            if value:
                digests[hashobj.name] = value
        yield (member.name, member.size, digests)
65
def process_control(control_contents):
    """Extract package metadata from a debian control file.

    :param control_contents: raw contents of the ./control member
    :returns: dict with keys package, source, version, architecture and
        depends; depends is the set of package names from Depends
        entries that list exactly one alternative
    """
    control = deb822.Packages(control_contents)
    package = control["package"].encode("ascii")
    try:
        # a Source field may carry "name (version)"; keep the name only
        source = control["source"].encode("ascii").split()[0]
    except KeyError:
        # no Source field: the source package shares the binary name
        source = package
    version = control["version"].encode("ascii")
    architecture = control["architecture"].encode("ascii")

    # dependencies with alternatives (len(dep) != 1) are ignored
    depends = set()
    for dep in control.relations.get("depends", ()):
        if len(dep) == 1:
            depends.add(dep[0]["name"].encode("ascii"))
    return dict(package=package, source=source, version=version,
                architecture=architecture, depends=depends)
81
def process_package(filelike):
    """Read a .deb (an ar archive) from filelike and yield documents.

    Yields the metadata dict from process_control first, then one dict
    (name, size, hashes) per regular file in data.tar, and finally the
    string "commit".

    :raises ValueError: when the archive lacks a data.tar member, the
        control file is missing or duplicated, or members appear out of
        order
    """
    af = ArReader(filelike)
    af.read_magic()
    # simple state machine enforcing member order:
    # "start" -> "control" (control.tar.gz seen)
    # -> "control_file" (./control parsed); data.tar requires the latter
    state = "start"
    while True:
        try:
            name = af.read_entry()
        except EOFError:
            # ran out of ar members without seeing any data.tar variant
            raise ValueError("data.tar not found")
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                yield process_control(tf.extractfile(elem).read())
                break
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            # stream-decompress the xz layer; tarfile reads the plain tar
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        elif name == "data.tar":
            tf = tarfile.open(fileobj=af, mode="r|")
        else:
            # other ar members (e.g. debian-binary) are skipped
            continue
        if state != "control_file":
            raise ValueError("missing control file")
        for name, size, hashes in get_hashes(tf):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                print("warning: skipping filename with encoding error")
                continue # skip files with non-utf8 encoding for now
            yield dict(name=name, size=size, hashes=hashes)
        yield "commit"
        break
127
def process_package_with_hash(filelike, sha256hash):
    """Like process_package, but verify the input's sha256 digest.

    The final "commit" document is only emitted after the entire input
    stream has been consumed and its hex digest matches sha256hash.

    :raises ValueError: when the digest does not match
    """
    hstream = HashedStream(filelike, hashlib.sha256())
    for document in process_package(hstream):
        if document != "commit":
            yield document
            continue
        # drain any bytes process_package left unread so the digest
        # covers the whole input
        while hstream.read(4096):
            pass
        if hstream.hexdigest() != sha256hash:
            raise ValueError("hash sum mismatch")
        yield document
        break
139
def main():
    """Parse options, read a .deb from stdin and dump yaml to stdout."""
    parser = optparse.OptionParser()
    # fix garbled help text: "stdin hash given" -> "stdin has the given"
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin has the given sha256 hash")
    options, args = parser.parse_args()
    if options.hash:
        gen = process_package_with_hash(sys.stdin, options.hash)
    else:
        gen = process_package(sys.stdin)
    # one yaml document per generated element, terminated by "commit"
    yaml.safe_dump_all(gen, sys.stdout)
150
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()