#!/usr/bin/python
"""
CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT);
CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT, hash TEXT, FOREIGN KEY (package) REFERENCES package(package));
CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package), FOREIGN KEY (required) REFERENCES package(package));
CREATE INDEX content_package_index ON content (package);
CREATE INDEX content_hash_index ON content (hash);
"""

import hashlib
import sqlite3
import struct
import sys
import tarfile
import zlib

from debian.debian_support import version_compare
from debian import deb822
import lzma

from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
from dedup.compression import GzipDecompressor, DecompressedStream
from dedup.image import ImageHash

class ArReader(object):
    """Minimal sequential reader for the Unix ar archive format used as the
    container of .deb packages."""
    global_magic = b"!<arch>\n"
    file_magic = b"`\n"

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.remaining = None
        self.padding = 0

    def skip(self, length):
        """Discard the next length bytes of the underlying file object."""
        while length:
            data = self.fileobj.read(min(4096, length))
            if not data:
                raise ValueError("archive truncated")
            length -= len(data)

    def read_magic(self):
        """Consume and verify the global ar header."""
        data = self.fileobj.read(len(self.global_magic))
        if data != self.global_magic:
            raise ValueError("ar global header not found")
        self.remaining = 0

    def read_entry(self):
        """Skip whatever is left of the current member, parse the next member
        header and return the member name."""
        self.skip_current_entry()
        if self.padding:
            if self.fileobj.read(1) != '\n':
                raise ValueError("missing ar padding")
            self.padding = 0
        file_header = self.fileobj.read(60)
        if not file_header:
            raise EOFError("end of archive found")
        parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header)
        parts = [p.rstrip(" ") for p in parts]
        if parts.pop() != self.file_magic:
            raise ValueError("ar file header not found")
        self.remaining = int(parts[5])
        self.padding = self.remaining % 2 # members are padded to even offsets
        return parts[0] # name

    def skip_current_entry(self):
        self.skip(self.remaining)
        self.remaining = 0

    def read(self, length=None):
        """Read up to length bytes of the current member (all of it if length
        is None)."""
        if length is None:
            length = self.remaining
        else:
            length = min(self.remaining, length)
        data = self.fileobj.read(length)
        self.remaining -= len(data)
        return data

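# A minimal usage sketch for ArReader (hypothetical file name, assuming the
# archive is opened in binary mode):
#
#     af = ArReader(open("package.deb", "rb"))
#     af.read_magic()
#     while True:
#         try:
#             name = af.read_entry()
#         except EOFError:
#             break
#         payload = af.read()
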
class MultiHash(object):
    """Forward update() calls to several hash objects at once."""
    def __init__(self, *hashes):
        self.hashes = hashes

    def update(self, data):
        for hasher in self.hashes:
            hasher.update(data)

boring_sha512_hashes = set((
    # ""
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # "\n"
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))

def sha512_nontrivial():
    """Plain SHA-512, blacklisting the hashes of trivial contents (empty
    file, lone newline)."""
    return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)

def gziphash():
    """SHA-512 of the gzip-decompressed content; decompression errors on
    non-gzip input are suppressed."""
    hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
    hashobj.name = "gzip_sha512"
    return HashBlacklist(hashobj, boring_sha512_hashes)

def imagehash():
    """SHA-512 over image contents as interpreted by dedup.image.ImageHash;
    errors on non-image input are suppressed."""
    hashobj = ImageHash(hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError,))
    hashobj.name = "image_sha512"
    return hashobj

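# get_hashes() below stores each hash object's .name attribute in the
# content.function column, so a single file can yield several hash rows
# (one per hash function).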
def get_hashes(tar):
    """Yield (filename, size, hash function name, hexdigest) for every
    regular file in the given tarfile object."""
    for elem in tar:
        if not elem.isreg(): # excludes hard links as well
            continue
        hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
        hasher = hash_file(hasher, tar.extractfile(elem))
        for hashobj in hasher.hashes:
            hashvalue = hashobj.hexdigest()
            if hashvalue:
                yield (elem.name, elem.size, hashobj.name, hashvalue)

def process_package(db, filelike):
    """Import a single Debian binary package read from filelike, unless the
    database already contains a newer version of the same package."""
    cur = db.cursor()
    af = ArReader(filelike)
    af.read_magic()
    # States: "start" -> "control" (control.tar.gz seen) -> "control_file"
    # ("./control" parsed); the data.tar member must come after the control file.
    state = "start"
    while True:
        try:
            name = af.read_entry()
        except EOFError:
            break
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                control = tf.extractfile(elem).read()
                control = deb822.Packages(control)
                package = control["package"].encode("ascii")
                version = control["version"].encode("ascii")
                architecture = control["architecture"].encode("ascii")

                cur.execute("SELECT version FROM package WHERE package = ?;",
                            (package,))
                row = cur.fetchone()
                if row and version_compare(row[0], version) > 0:
                    return # already seen a newer package

                cur.execute("DELETE FROM package WHERE package = ?;",
                            (package,))
                cur.execute("DELETE FROM content WHERE package = ?;",
                            (package,))
                cur.execute("INSERT INTO package (package, version, architecture) VALUES (?, ?, ?);",
                            (package, version, architecture))
                depends = control.relations.get("depends", [])
                # only record simple dependencies; alternatives (a | b) are ignored
                depends = set(dep[0]["name"].encode("ascii")
                              for dep in depends if len(dep) == 1)
                cur.execute("DELETE FROM dependency WHERE package = ?;",
                            (package,))
                cur.executemany("INSERT INTO dependency (package, required) VALUES (?, ?);",
                                ((package, dep) for dep in depends))
                break
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        else:
            continue
        if state != "control_file":
            raise ValueError("missing control file")
        for name, size, function, hexhash in get_hashes(tf):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                print("warning: skipping filename with encoding error")
                continue # skip files with non-utf8 encoding for now
            cur.execute("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);",
                        (package, name, size, function, hexhash))
        db.commit()
        return
    raise ValueError("data.tar not found")

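# Typical invocation feeds a single .deb on standard input, e.g.
#     python importpkg.py < some_package.deb
# ("some_package.deb" is a placeholder; the output database name
# "test.sqlite3" is hard-coded below).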
def main():
    db = sqlite3.connect("test.sqlite3")
    process_package(db, sys.stdin)

if __name__ == "__main__":
    main()