move ArReader from importpkg to dedup.arreader
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2
3 import hashlib
4 import sqlite3
5 import sys
6 import tarfile
7 import zlib
8
9 from debian.debian_support import version_compare
10 from debian import deb822
11 import lzma
12
13 from dedup.arreader import ArReader
14 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
15 from dedup.compression import GzipDecompressor, DecompressedStream
16 from dedup.image import ImageHash
17
class MultiHash(object):
    """Feed the same data to several hash objects at once.

    The wrapped objects are kept, in the order given, in the ``hashes``
    tuple so that callers can inspect each of them individually.
    """

    def __init__(self, *hashes):
        self.hashes = hashes

    def update(self, data):
        """Pass ``data`` on to every wrapped hash object, in order."""
        for sink in self.hashes:
            sink.update(data)
25
# SHA-512 hex digests of trivial contents; the hash factories in this file
# hand this set to HashBlacklist.
boring_sha512_hashes = set([
    # digest of the empty string ""
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # digest of a single newline "\n"
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09",
])
31
def sha512_nontrivial():
    """Return a new SHA-512 hasher wrapped in a HashBlacklist.

    The blacklist is given ``boring_sha512_hashes``; presumably it hides
    those digests from the result (confirm in dedup.hashing).
    """
    plain_sha512 = hashlib.sha512()
    return HashBlacklist(plain_sha512, boring_sha512_hashes)
34
def gziphash():
    """Return a hasher labelled "gzip_sha512" for gzip-decompressed content.

    A SHA-512 object is combined with a GzipDecompressor through
    DecompressedHash, wrapped in a SuppressingHash for ValueError and
    zlib.error, and finally in a HashBlacklist using
    ``boring_sha512_hashes``.
    """
    decompressing = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    # NOTE(review): presumably makes non-gzip input yield no digest instead
    # of raising -- confirm against dedup.hashing.SuppressingHash.
    tolerant = SuppressingHash(decompressing, (ValueError, zlib.error))
    tolerant.name = "gzip_sha512"
    return HashBlacklist(tolerant, boring_sha512_hashes)
40
def imagehash():
    """Return a hasher labelled "image_sha512" built on ImageHash.

    The ImageHash wraps a SHA-512 object and is itself wrapped in a
    SuppressingHash for ValueError. No blacklist is applied here.
    """
    image_hasher = ImageHash(hashlib.sha512())
    # NOTE(review): presumably ValueError signals "not an image" and is
    # swallowed here -- confirm against dedup.image / dedup.hashing.
    tolerant = SuppressingHash(image_hasher, (ValueError,))
    tolerant.name = "image_sha512"
    return tolerant
46
def get_hashes(tar):
    """Yield hash records for the regular files of an open tar archive.

    Every regular member is run through the three hashers created by
    sha512_nontrivial(), gziphash() and imagehash(). For each hasher whose
    hexdigest() is truthy, a tuple
    (member name, member size, hasher name, hex digest) is produced.
    """
    # isreg() excludes hard links as well. The filter is lazy, so members
    # are still consumed one at a time in archive order.
    regular_members = (member for member in tar if member.isreg())
    for member in regular_members:
        combined = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
        combined = hash_file(combined, tar.extractfile(member))
        for single in combined.hashes:
            digest = single.hexdigest()
            if not digest:
                continue
            yield (member.name, member.size, single.name, digest)
57
def process_package(db, filelike):
    """Import one .deb archive read from filelike into the database db.

    The ar members are read in order through ArReader. The "./control"
    file inside control.tar.gz supplies package, source, version,
    architecture and dependencies, which replace any existing rows for the
    package in the package, content, dependency and source tables. The
    first data.tar.gz, data.tar.bz2 or data.tar.xz member is then hashed
    with get_hashes() and one content row is inserted per result, after
    which the transaction is committed.

    Returns None. The function returns early, without writing anything,
    when the database already holds a version of the package that compares
    greater than the one in the archive.

    Raises ValueError when control.tar.gz appears at an unexpected point,
    when a data.tar member is reached before a control file was read, or
    when the archive ends without a data.tar member. Nothing is committed
    on those paths.
    """
    cur = db.cursor()
    af = ArReader(filelike)
    af.read_magic()
    # Tracks progress: "start" -> "control" (control.tar.gz seen) ->
    # "control_file" (./control parsed).
    state = "start"
    while True:
        try:
            name = af.read_entry()
        except EOFError:
            # End of the ar archive; falls through to the final raise.
            break
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            # "r|gz" is tarfile's stream mode; af is read sequentially.
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                # NOTE(review): the loop breaks after the first ./control
                # below, so this check cannot trigger as written.
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                control = tf.extractfile(elem).read()
                control = deb822.Packages(control)
                package = control["package"].encode("ascii")
                try:
                    # Only the first whitespace-separated word is kept;
                    # presumably this drops a trailing "(version)".
                    source = control["source"].encode("ascii").split()[0]
                except KeyError:
                    # No Source field: the package name is used instead.
                    source = package
                version = control["version"].encode("ascii")
                architecture = control["architecture"].encode("ascii")

                cur.execute("SELECT version FROM package WHERE package = ?;",
                            (package,))
                row = cur.fetchone()
                if row and version_compare(row[0], version) > 0:
                    return # already seen a newer package

                # Replace everything previously stored for this package.
                cur.execute("DELETE FROM package WHERE package = ?;",
                            (package,))
                cur.execute("DELETE FROM content WHERE package = ?;",
                            (package,))
                cur.execute("INSERT INTO package (package, version, architecture) VALUES (?, ?, ?);",
                            (package, version, architecture))
                depends = control.relations.get("depends", [])
                # Only entries of length one are recorded; presumably these
                # are dependencies without alternatives -- TODO confirm.
                depends = set(dep[0]["name"].encode("ascii")
                              for dep in depends if len(dep) == 1)
                cur.execute("DELETE FROM dependency WHERE package = ?;",
                            (package,))
                cur.executemany("INSERT INTO dependency (package, required) VALUES (?, ?);",
                                ((package, dep) for dep in depends))
                cur.execute("DELETE FROM source WHERE package = ?;",
                            (package,))
                cur.execute("INSERT INTO source (source, package) VALUES (?, ?);",
                            (source, package))
                break
            # Move on to the next ar member; data.tar is handled below.
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            # xz is decompressed through DecompressedStream and the result
            # is read as an uncompressed tar stream.
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        else:
            # Any other ar member is ignored.
            continue
        # package is only bound once ./control has been parsed.
        if state != "control_file":
            raise ValueError("missing control file")
        for name, size, function, hexhash in get_hashes(tf):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                print("warning: skipping filename with encoding error")
                continue # skip files with non-utf8 encoding for now
            cur.execute("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);",
                        (package, name, size, function, hexhash))
        # Only the first data.tar member is processed; commit and stop.
        db.commit()
        return
    raise ValueError("data.tar not found")
136
def main():
    """Import the package read from standard input into test.sqlite3.

    Opens (or creates) the SQLite database "test.sqlite3" in the current
    directory and passes it, together with sys.stdin, to process_package().
    Exceptions raised by process_package() propagate to the caller.
    """
    db = sqlite3.connect("test.sqlite3")
    try:
        process_package(db, sys.stdin)
    finally:
        # Always release the database handle. process_package() commits on
        # success itself; closing without a commit discards pending changes,
        # so a failed import leaves the database untouched.
        db.close()


if __name__ == "__main__":
    main()