Merge branch schemachange
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2
3 import hashlib
4 import sqlite3
5 import sys
6 import tarfile
7 import zlib
8
9 from debian.debian_support import version_compare
10 from debian import deb822
11 import lzma
12
13 from dedup.arreader import ArReader
14 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
15 from dedup.compression import GzipDecompressor, DecompressedStream
16 from dedup.image import ImageHash
17
class MultiHash(object):
    """Fan a single stream of data out to several hash objects.

    The wrapped objects are kept, in the order given, in the ``hashes``
    attribute so callers can query each one individually afterwards.
    """

    def __init__(self, *hashes):
        """Remember the hash objects passed as positional arguments."""
        # Stored as received, i.e. as the tuple of positional arguments.
        self.hashes = hashes

    def update(self, data):
        """Pass ``data`` to the ``update`` method of every wrapped hash."""
        for sink in self.hashes:
            sink.update(data)
25
# SHA-512 hex digests of contents considered too trivial to be interesting.
boring_sha512_hashes = set([
    # digest of the empty string ""
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # digest of a lone newline "\n"
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09",
])
31
def sha512_nontrivial():
    """Return a fresh SHA-512 hasher wrapped in a HashBlacklist.

    The blacklist passed along is boring_sha512_hashes; presumably the
    wrapper withholds digests listed there (verify against dedup.hashing).
    """
    plain_sha512 = hashlib.sha512()
    return HashBlacklist(plain_sha512, boring_sha512_hashes)
34
def gziphash():
    """Return a hasher named "gzip_sha512" built on a gzip decompressor.

    Layers, innermost first: a SHA-512 object fed through DecompressedHash
    with a GzipDecompressor, a SuppressingHash given ValueError and
    zlib.error, and finally a HashBlacklist with boring_sha512_hashes.
    """
    decompressor = GzipDecompressor()
    unpacked = DecompressedHash(decompressor, hashlib.sha512())
    tolerant = SuppressingHash(unpacked, (ValueError, zlib.error))
    # The name is set on the suppressing layer, before the blacklist wrap.
    tolerant.name = "gzip_sha512"
    return HashBlacklist(tolerant, boring_sha512_hashes)
40
def imagehash():
    """Return a hasher named "image_sha512" built on ImageHash.

    An ImageHash around a SHA-512 object is wrapped in a SuppressingHash
    that is given ValueError; no blacklist is applied here.
    """
    image_digest = ImageHash(hashlib.sha512())
    tolerant = SuppressingHash(image_digest, (ValueError,))
    tolerant.name = "image_sha512"
    return tolerant
46
def get_hashes(tar):
    """Yield ``(name, size, hashes)`` for every regular file in ``tar``.

    ``tar`` is iterated for its members and must offer ``extractfile``.
    ``hashes`` maps each hash object's ``name`` to its hex digest; hash
    objects whose ``hexdigest()`` result is falsy are left out.
    """
    for member in tar:
        # isreg() is false for hard links too, so only plain files pass.
        if member.isreg():
            combined = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
            combined = hash_file(combined, tar.extractfile(member))
            digests = {}
            for hashobj in combined.hashes:
                digest = hashobj.hexdigest()
                if digest:
                    digests[hashobj.name] = digest
            yield (member.name, member.size, digests)
59
def process_package(db, filelike):
    """Import one .deb (an ar archive) read from ``filelike`` into ``db``.

    The ar members are walked in order.  ``control.tar.gz`` supplies the
    package metadata (package, source, version, architecture, simple
    dependencies), which replaces any rows already stored for that package.
    The first ``data.tar.gz``/``data.tar.bz2``/``data.tar.xz`` member then
    supplies the file list: one ``content`` row per file plus one ``hash``
    row per digest produced by get_hashes.  The transaction is committed
    once the data member has been processed.

    Parameters:
        db: database connection; ``cursor()`` and ``commit()`` are used.
        filelike: file-like object handed to ArReader.

    Returns None.  Returns early, without committing, when the database
    already holds a newer version of the package.

    Raises:
        ValueError: "unexpected control.tar.gz", "duplicate control file",
            "missing control file" or "data.tar not found", depending on
            which expectation about the archive layout is violated.
    """
    cur = db.cursor()
    cur.execute("PRAGMA foreign_keys = ON;")
    af = ArReader(filelike)
    af.read_magic()
    # state: "start" -> "control" (control.tar.gz seen) -> "control_file"
    # (./control parsed).  Data members are only accepted in the last state.
    state = "start"
    while True:
        try:
            name = af.read_entry()
        except EOFError:
            # No more ar members; falls through to the final ValueError.
            break
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            # Stream mode ("r|"): the tar is read sequentially from af.
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                # NOTE(review): the break at the end of this loop body means
                # a second ./control is never reached, so this check appears
                # unable to fire -- confirm before relying on it.
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                control = tf.extractfile(elem).read()
                control = deb822.Packages(control)
                package = control["package"].encode("ascii")
                try:
                    # Only the first whitespace-separated token of the
                    # Source field is kept.
                    source = control["source"].encode("ascii").split()[0]
                except KeyError:
                    # No Source field: use the binary package name.
                    source = package
                version = control["version"].encode("ascii")
                architecture = control["architecture"].encode("ascii")

                cur.execute("SELECT version FROM package WHERE package = ?;",
                            (package,))
                row = cur.fetchone()
                if row and version_compare(row[0], version) > 0:
                    return # already seen a newer package

                # Drop the old file list, then (re)write the package row.
                cur.execute("DELETE FROM content WHERE package = ?;",
                            (package,))
                cur.execute("INSERT OR REPLACE INTO package (package, version, architecture, source) VALUES (?, ?, ?, ?);",
                            (package, version, architecture, source))
                depends = control.relations.get("depends", [])
                # Keep only entries of length one and collect their names
                # (presumably these are dependencies without alternatives;
                # verify against python-debian's relations format).
                depends = set(dep[0]["name"].encode("ascii")
                              for dep in depends if len(dep) == 1)
                cur.execute("DELETE FROM dependency WHERE package = ?;",
                            (package,))
                cur.executemany("INSERT INTO dependency (package, required) VALUES (?, ?);",
                                ((package, dep) for dep in depends))
                # ./control handled; stop scanning this tar.
                break
            # Move on to the next ar member.
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            # xz is decompressed through DecompressedStream and the result
            # is read as an uncompressed tar stream.
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        else:
            # Any other ar member is ignored.
            continue
        if state != "control_file":
            raise ValueError("missing control file")
        # Note: this loop rebinds ``name`` (previously the ar member name);
        # the function returns right after the loop, so it is not reused.
        for name, size, hashes in get_hashes(tf):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                print("warning: skipping filename with encoding error")
                continue # skip files with non-utf8 encoding for now
            cur.execute("INSERT INTO content (package, filename, size) VALUES (?, ?, ?);",
                        (package, name, size))
            # Row id of the content row just inserted; links its hash rows.
            cid = cur.lastrowid
            cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);",
                            ((cid, func, hexhash) for func, hexhash in hashes.items()))
        # Only the first data member is processed: commit and finish.
        db.commit()
        return
    raise ValueError("data.tar not found")
136
def main():
    """Import the package read from standard input into ``test.sqlite3``.

    Opens (creating if necessary) the SQLite database ``test.sqlite3`` in
    the current directory and hands it, together with ``sys.stdin``, to
    process_package.  Exceptions raised by process_package propagate to the
    caller.
    """
    db = sqlite3.connect("test.sqlite3")
    try:
        process_package(db, sys.stdin)
    finally:
        # Always release the database handle, including when the import
        # raises; process_package commits on success by itself, so nothing
        # committed is lost by closing here.
        db.close()
140
# Run the importer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()