import hashlib
import lzma
import sqlite3
import struct
import sys
import tarfile
import zlib

from debian.debian_support import version_compare
from debian import deb822

from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
from dedup.compression import GzipDecompressor, DecompressedStream
from dedup.image import ImageHash
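
# This tool reads a Debian binary package (an ar archive) from a file
# object, hashes every regular file in its data member in several ways
# and records the package metadata and the hashes in a sqlite database.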

class ArReader(object):
    """Minimal streaming reader for the ar(1) archive format used by .deb files."""
    global_magic = b"!<arch>\n"
    file_magic = b"`\n"

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.remaining = None
        self.padding = 0

    def skip(self, length):
        while length:
            data = self.fileobj.read(min(4096, length))
            if not data:
                raise ValueError("archive truncated")
            length -= len(data)

    def read_magic(self):
        data = self.fileobj.read(len(self.global_magic))
        if data != self.global_magic:
            raise ValueError("ar global header not found")
        self.remaining = 0

    def read_entry(self):
        self.skip_current_entry()
        if self.padding:
            if self.fileobj.read(1) != b"\n":
                raise ValueError("missing ar padding")
            self.padding = 0
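        # Each member is preceded by a fixed 60-byte header: name (16),
        # mtime (12), uid (6), gid (6), mode (8), size (10) and the
        # two-byte terminator "`\n". Member data is padded to even length.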
        file_header = self.fileobj.read(60)
        if not file_header:
            raise EOFError("end of archive found")
        parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header)
        parts = [p.rstrip(b" ") for p in parts]
        if parts.pop() != self.file_magic:
            raise ValueError("ar file header not found")
        self.remaining = int(parts[5])
        self.padding = self.remaining % 2
        return parts[0] # name

    def skip_current_entry(self):
        self.skip(self.remaining)
        self.remaining = 0

    def read(self, length=None):
        if length is None:
            length = self.remaining
        else:
            length = min(self.remaining, length)
        data = self.fileobj.read(length)
        self.remaining -= len(data)
        return data
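
# MultiHash fans a single stream of update() calls out to several hash
# objects, so each archive member only needs to be read once even though
# it is hashed in multiple ways.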

class MultiHash(object):
    def __init__(self, *hashes):
        self.hashes = hashes

    def update(self, data):
        for hasher in self.hashes:
            hasher.update(data)

boring_sha512_hashes = set((
    # sha512 of the empty string ""
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # sha512 of a single newline "\n"
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))

def sha512_nontrivial():
    return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)
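
# gziphash hashes the gzip-decompressed content, so a compressed file and
# its uncompressed counterpart end up with the same digest; decompression
# errors are suppressed and simply produce no hash value.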
def gziphash():
    hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
    hashobj.name = "gzip_sha512"
    return HashBlacklist(hashobj, boring_sha512_hashes)

def imagehash():
    hashobj = ImageHash(hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError,))
    hashobj.name = "image_sha512"
    return HashBlacklist(hashobj, boring_sha512_hashes)
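
# get_hashes yields a (filename, size, hash function name, hex digest)
# tuple per hash function for every regular file in the given tar;
# blacklisted or suppressed digests are dropped.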
def get_hashes(tar):
    for elem in tar:
        if not elem.isreg(): # excludes hard links as well
            continue
        hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
        hasher = hash_file(hasher, tar.extractfile(elem))
        for hashobj in hasher.hashes:
            hashvalue = hashobj.hexdigest()
            if hashvalue:
                yield (elem.name, elem.size, hashobj.name, hashvalue)
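
# process_package expects the tables package, content, dependency and
# source to already exist in the given sqlite database. If the database
# already holds a newer version of the package, the input is ignored;
# otherwise all previously recorded rows for the package are replaced.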
def process_package(db, filelike):
    cur = db.cursor()
    af = ArReader(filelike)
    af.read_magic()
    state = "start"
    while True:
        try:
            name = af.read_entry()
        except EOFError:
            break
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                control = tf.extractfile(elem).read()
                control = deb822.Packages(control)
                package = control["package"].encode("ascii")
                try:
                    source = control["source"].encode("ascii").split()[0]
                except KeyError:
                    source = package # Source defaults to the package name
                version = control["version"].encode("ascii")
                architecture = control["architecture"].encode("ascii")

                cur.execute("SELECT version FROM package WHERE package = ?;",
                            (package,))
                row = cur.fetchone()
                if row and version_compare(row[0], version) > 0:
                    return # already seen a newer package

                cur.execute("DELETE FROM package WHERE package = ?;",
                            (package,))
                cur.execute("DELETE FROM content WHERE package = ?;",
                            (package,))
                cur.execute("INSERT INTO package (package, version, architecture) VALUES (?, ?, ?);",
                            (package, version, architecture))
                depends = control.relations.get("depends", [])
                depends = set(dep[0]["name"].encode("ascii")
                              for dep in depends if len(dep) == 1)
                cur.execute("DELETE FROM dependency WHERE package = ?;",
                            (package,))
                cur.executemany("INSERT INTO dependency (package, required) VALUES (?, ?);",
                                ((package, dep) for dep in depends))
                cur.execute("DELETE FROM source WHERE package = ?;",
                            (package,))
                cur.execute("INSERT INTO source (source, package) VALUES (?, ?);",
                            (source, package))
            continue # move on to the next ar member
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        else:
            continue # e.g. the debian-binary member
        if state != "control_file":
            raise ValueError("missing control file")
        for name, size, function, hexhash in get_hashes(tf):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                print("warning: skipping filename with encoding error")
                continue # skip files with non-utf8 encoding for now
            cur.execute("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);",
                        (package, name, size, function, hexhash))
        state = "data"
    if state != "data":
        raise ValueError("data.tar not found")

def main():
    db = sqlite3.connect("test.sqlite3")
    process_package(db, sys.stdin)
    db.commit() # persist the inserted rows

if __name__ == "__main__":
    main()
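
# Example invocation (reads a single .deb from standard input and records
# it in test.sqlite3; the schema must already exist; script name illustrative):
#   python importpkg.py < some-package.deb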