9 from debian.debian_support import version_compare
10 from debian import deb822
13 from dedup.arreader import ArReader
14 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
15 from dedup.compression import GzipDecompressor, DecompressedStream
16 from dedup.image import ImageHash
# MultiHash groups several hash objects behind one update() method.
# NOTE(review): this excerpt omits original lines (the embedded numbering jumps
# 19 -> 22 and 23 -> 26), so the body of __init__ and the body of the loop in
# update() are not visible here.
18 class MultiHash(object):
# Accepts any number of hash objects as positional arguments. Presumably they
# are stored as self.hashes, because update() iterates that attribute and
# get_hashes reads hasher.hashes -- TODO confirm against the full file.
19 def __init__(self, *hashes):
# Receives one chunk of data and walks over every hasher in self.hashes.
22 def update(self, data):
23 for hasher in self.hashes:
# NOTE(review): the loop body is not shown; presumably it forwards `data` to
# each hasher -- confirm.
# SHA-512 hex digests regarded as "boring". The set is handed to HashBlacklist
# as its second argument wherever a blacklisting hasher is built in this file.
boring_sha512_hashes = {
    # SHA-512 of the empty input
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # presumably SHA-512 of a lone newline -- verify
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09",
}
def sha512_nontrivial():
    """Build a SHA-512 hasher wrapped in HashBlacklist.

    Returns:
        A HashBlacklist constructed from a fresh ``hashlib.sha512()`` object
        and the module-level ``boring_sha512_hashes`` set.
    """
    plain_sha512 = hashlib.sha512()
    return HashBlacklist(plain_sha512, boring_sha512_hashes)
# NOTE(review): the "def" line for this body is not part of the excerpt (the
# embedded numbering jumps 33 -> 36). get_hashes calls gziphash() without
# arguments, so this is presumably the body of that function -- confirm.
# A SHA-512 object is combined with a GzipDecompressor inside DecompressedHash;
# presumably the digest covers the gunzipped data -- verify in dedup.hashing.
36 hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
# Wrap once more, passing ValueError and zlib.error to SuppressingHash;
# presumably those errors are swallowed for input that is not valid gzip -- verify.
37 hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
# Label set on the SuppressingHash wrapper. get_hashes keys its result dict by
# hashobj.name; whether the HashBlacklist returned below exposes this same
# name is not visible here -- TODO confirm.
38 hashobj.name = "gzip_sha512"
# The caller receives a HashBlacklist built from the wrapper and the boring digests.
39 return HashBlacklist(hashobj, boring_sha512_hashes)
# NOTE(review): the "def" line and the final return of this body are not part
# of the excerpt (the embedded numbering jumps 39 -> 42 and 44 -> 49).
# get_hashes calls imagehash() without arguments, so this is presumably the
# body of that function -- confirm.
# A SHA-512 object is handed to ImageHash (imported from dedup.image); what
# ImageHash feeds into it is not visible in this file.
42 hashobj = ImageHash(hashlib.sha512())
# Wrap it, passing ValueError to SuppressingHash; presumably that error is
# swallowed for input ImageHash cannot handle -- verify in dedup.hashing.
43 hashobj = SuppressingHash(hashobj, (ValueError,))
# Label set on the wrapper; get_hashes keys its result dict by hashobj.name.
44 hashobj.name = "image_sha512"
# NOTE(review): the "def" line and the loop header that bind `tar` and `elem`
# are not visible in this excerpt (process_package calls get_hashes(tf)), and
# several interior lines are omitted (numbering jumps 49 -> 51, 52 -> 54,
# 55 -> 57). In particular the statement guarded by the isreg() test, the
# creation of `hashes`, and any condition around the dict assignment are missing.
# Test whether the tar member is a regular file.
49 if not elem.isreg(): # excludes hard links as well
# Combine the three hashers built by this module's factory functions into one MultiHash.
51 hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
# hash_file receives the combined hasher and the member's file object; its
# return value replaces `hasher` (presumably the same object after it has been
# fed the file contents -- verify in dedup.hashing).
52 hasher = hash_file(hasher, tar.extractfile(elem))
# Read the hex digest of every individual hasher and store it under that
# hasher's name.
54 for hashobj in hasher.hashes:
55 hashvalue = hashobj.hexdigest()
57 hashes[hashobj.name] = hashvalue
# Emit one tuple per member: (member name, member size, {hash name: hex digest}).
58 yield (elem.name, elem.size, hashes)
# Read one package archive from `filelike` and record its metadata, simple
# dependencies, file list and per-file hashes through a database cursor.
#
# Parameters:
#   db       -- database handle; main() passes a sqlite3 connection. The line
#               that creates `cur` from it is not visible in this excerpt.
#   filelike -- object handed to ArReader; main() passes sys.stdin.
#
# Raises ValueError with the messages "unexpected control.tar.gz",
# "duplicate control file", "missing control file" or "data.tar not found".
#
# NOTE(review): many original lines are omitted from this excerpt (the embedded
# numbering has gaps such as 63 -> 68, 71 -> 73, 84 -> 86, 92 -> 95, 122 -> 124).
# Loop headers, the initialisation and some updates of `state`, the assignments
# of `row` and `cid`, a `try:` line and several SQL parameter tuples are not
# shown. The comments below describe only the visible statements.
60 def process_package(db, filelike):
# SQLite does not enforce foreign keys unless this pragma is switched on for
# the connection.
62 cur.execute("PRAGMA foreign_keys = ON;")
# Wrap the input in the project's ar-archive reader.
63 af = ArReader(filelike)
# Name of the next archive member (presumably fetched inside a loop over all
# members -- the loop header is not visible).
68 name = af.read_entry()
# --- control member ---------------------------------------------------------
71 if name == "control.tar.gz":
# Raised under a condition that is not visible here (presumably when the
# control member appears at an unexpected point -- confirm).
73 raise ValueError("unexpected control.tar.gz")
# "r|gz" opens the member as a gzip-compressed tar *stream*, so the underlying
# reader is only read sequentially.
75 tf = tarfile.open(fileobj=af, mode="r|gz")
# Only the member named "./control" is of interest (the statement guarded by
# this test is not visible; presumably it skips other members).
77 if elem.name != "./control":
# `state` works as a small state machine: reaching "./control" while the state
# is not "control" is reported as a duplicate control file.
79 if state != "control":
80 raise ValueError("duplicate control file")
81 state = "control_file"
# Read the raw control file and parse it with python-debian's deb822.Packages.
82 control = tf.extractfile(elem).read()
83 control = deb822.Packages(control)
# Control fields are converted with .encode("ascii").
84 package = control["package"].encode("ascii")
# Keep only the first whitespace-separated token of the "source" field
# (presumably dropping a trailing version in parentheses -- confirm). Lines
# around this statement are omitted, so handling of a missing field is not visible.
86 source = control["source"].encode("ascii").split()[0]
89 version = control["version"].encode("ascii")
90 architecture = control["architecture"].encode("ascii")
# Look up the version already stored for this package name (the parameter
# tuple and the fetch that assigns `row` are on omitted lines).
92 cur.execute("SELECT version FROM package WHERE package = ?;",
# Stop without touching the database when the stored version compares greater
# than the one being imported.
95 if row and version_compare(row[0], version) > 0:
96 return # already seen a newer package
# Drop the previously recorded file list for this package before re-importing.
98 cur.execute("DELETE FROM content WHERE package = ?;",
# Insert the package row, replacing an existing row for the same key.
100 cur.execute("INSERT OR REPLACE INTO package (package, version, architecture, source) VALUES (?, ?, ?, ?);",
101 (package, version, architecture, source))
# Parsed "depends" relations; an absent field yields an empty list.
102 depends = control.relations.get("depends", [])
# Keep only entries with exactly one element and take that element's "name";
# entries with several elements (presumably "a | b" alternatives) are ignored.
103 depends = set(dep[0]["name"].encode("ascii")
104 for dep in depends if len(dep) == 1)
# Replace the stored dependency rows for this package with the new set.
105 cur.execute("DELETE FROM dependency WHERE package = ?;",
107 cur.executemany("INSERT INTO dependency (package, required) VALUES (?, ?);",
108 ((package, dep) for dep in depends))
# --- data member ------------------------------------------------------------
# gzip and bzip2 are opened through tarfile's own stream modes.
111 elif name == "data.tar.gz":
112 tf = tarfile.open(fileobj=af, mode="r|gz")
113 elif name == "data.tar.bz2":
114 tf = tarfile.open(fileobj=af, mode="r|bz2")
# xz is decompressed by the project's DecompressedStream with an
# lzma.LZMADecompressor, and the result is opened as an uncompressed tar stream.
115 elif name == "data.tar.xz":
116 zf = DecompressedStream(af, lzma.LZMADecompressor())
117 tf = tarfile.open(fileobj=zf, mode="r|")
# The data member is processed only after a control file has been seen.
120 if state != "control_file":
121 raise ValueError("missing control file")
# NOTE: this loop rebinds `name`, which earlier held the archive member name.
122 for name, size, hashes in get_hashes(tf):
# Decode the filename as UTF-8 (the matching "try:" line is omitted here).
124 name = name.decode("utf8")
125 except UnicodeDecodeError:
126 print("warning: skipping filename with encoding error")
127 continue # skip files with non-utf8 encoding for now
# One content row per file.
128 cur.execute("INSERT INTO content (package, filename, size) VALUES (?, ?, ?);",
129 (package, name, size))
# One hash row per (function name, hex digest) pair. `cid` is assigned on an
# omitted line; presumably it is the id of the content row just inserted -- confirm.
131 cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);",
132 ((cid, func, hexhash) for func, hexhash in hashes.items()))
# Raised at a point whose surrounding control flow is not visible here.
135 raise ValueError("data.tar not found")
# NOTE(review): the "def" line for the two statements below and the body of the
# __main__ guard are not visible in this excerpt (the embedded numbering jumps
# 135 -> 138 and ends at 141).
# Connect to the SQLite database file "test.sqlite3" (relative path).
138 db = sqlite3.connect("test.sqlite3")
# Import a single package read from standard input.
# NOTE(review): whether the changes are committed afterwards is not visible here.
139 process_package(db, sys.stdin)
# Script entry-point guard.
141 if __name__ == "__main__":