2 """This tool reads a debian package from stdin and emits a yaml stream on
3 stdout. It does not access a database. Therefore it can be run in parallel and
on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
13 from debian import deb822
17 from dedup.arreader import ArReader
18 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
19 from dedup.compression import GzipDecompressor, DecompressedStream
20 from dedup.image import ImageHash
class MultiHash(object):
    """Fan a single stream of data out to several hash objects at once.

    This lets one pass over a file feed every configured hash function;
    the individual hashers stay reachable through the ``hashes`` attribute
    so their digests can be collected afterwards.
    """
    def __init__(self, *hashes):
        # Wrapped hash objects, kept in the order the caller supplied them.
        self.hashes = hashes

    def update(self, data):
        """Feed ``data`` to every wrapped hash object."""
        for hasher in self.hashes:
            hasher.update(data)
# sha512 digests that occur so often they are useless for deduplication:
# the empty file and a file containing just a newline.
boring_sha512_hashes = {
    # sha512 of b""
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # sha512 of b"\n"
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09",
}
def sha512_nontrivial():
    """Return a fresh sha512 hasher that withholds the boring digests."""
    hasher = hashlib.sha512()
    return HashBlacklist(hasher, boring_sha512_hashes)
def gziphash():
    """Return a hash object computing the sha512 of gzip-decompressed input.

    Errors while decompressing (input that is not a valid gzip stream)
    surface as ValueError or zlib.error; SuppressingHash swallows those so
    such files simply yield no digest.  The boring empty/newline digests
    are blacklisted as for the plain sha512 hash.
    """
    hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    # NOTE(review): SuppressingHash presumably turns the listed exceptions
    # into "no digest" — confirm against dedup.hashing.
    hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
    hashobj.name = "gzip_sha512"
    return HashBlacklist(hashobj, boring_sha512_hashes)
def imagehash():
    """Return a hash object computing the sha512 of decoded image data.

    Files that cannot be decoded as an image raise ValueError inside
    ImageHash; SuppressingHash swallows that so such files yield no
    digest.  Boring digests are blacklisted as for the other hashers.
    """
    hashobj = ImageHash(hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError,))
    hashobj.name = "image_sha512"
    return HashBlacklist(hashobj, boring_sha512_hashes)
def get_hashes(tar):
    """Yield (name, size, {hashname: hexdigest}) for each regular file in tar.

    Non-regular members (directories, symlinks, hard links, devices) are
    skipped.  Hashers that produce no digest for a file (blacklisted or
    suppressed) are omitted from that file's hash dict.
    """
    for elem in tar:
        if not elem.isreg():  # excludes hard links as well
            continue
        hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
        hasher = hash_file(hasher, tar.extractfile(elem))
        hashes = {}
        for hashobj in hasher.hashes:
            hashvalue = hashobj.hexdigest()
            # A falsy digest means the hasher declined this file.
            if hashvalue:
                hashes[hashobj.name] = hashvalue
        yield (elem.name, elem.size, hashes)
def process_package(filelike):
    """Parse a Debian binary package (ar archive) read from ``filelike``.

    Yields, in order: one dict of package metadata taken from the control
    file, one dict per regular file found in data.tar (name, size and the
    dict of hashes from get_hashes), and nothing further here — the caller
    dumps the stream as yaml documents.

    Raises ValueError for malformed packages: control.tar.gz out of order,
    a duplicate control file, a data.tar with no preceding control file,
    or an archive that ends before data.tar was seen.
    """
    af = ArReader(filelike)
    af.read_magic()
    state = "start"
    while state not in ("finished", "skipped"):
        try:
            name = af.read_entry()
        except EOFError:
            # Reaching end-of-archive before data.tar was processed is an
            # error; otherwise the loop condition ends the iteration.
            if state != "finished":
                raise ValueError("data.tar not found")
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                control = tf.extractfile(elem).read()
                control = deb822.Packages(control)
                package = control["package"].encode("ascii")
                try:
                    # "Source: name (version)" — keep only the name part.
                    source = control["source"].encode("ascii").split()[0]
                except KeyError:
                    # No Source field: the source package shares the
                    # binary package's name.
                    source = package
                version = control["version"].encode("ascii")
                architecture = control["architecture"].encode("ascii")

                depends = control.relations.get("depends", [])
                # Only simple dependencies (no alternatives) are recorded.
                depends = set(dep[0]["name"].encode("ascii")
                              for dep in depends if len(dep) == 1)
                yield dict(package=package, source=source, version=version,
                           architecture=architecture, depends=depends)
                break
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            # tarfile cannot stream-decompress xz from a non-seekable
            # source, so decompress explicitly and read an uncompressed
            # tar stream.
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        else:
            # Other ar members (debian-binary, ...) are irrelevant here.
            continue
        if state != "control_file":
            raise ValueError("missing control file")
        for name, size, hashes in get_hashes(tf):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                print("warning: skipping filename with encoding error")
                continue  # skip files with non-utf8 encoding for now
            yield dict(name=name, size=size, hashes=hashes)
        state = "finished"
def main():
    """Read one .deb package from stdin, dump its yaml stream to stdout."""
    yaml.safe_dump_all(process_package(sys.stdin), sys.stdout)

if __name__ == "__main__":
    main()