[~helmut/debian-dedup.git] / importpkg.py
#!/usr/bin/python
"""This tool reads a debian package from stdin and emits a yaml stream on
stdout.  It does not access a database. Therefore it can be run in parallel and
on multiple machines. The generated yaml contains multiple documents. The first
document contains package metadata. Then a document is emitted for each file.
And finally a document consisting of the string "commit" is emitted."""
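
# Example invocation (illustrative file names; the package is read from stdin):
#     python importpkg.py < some_package.deb > some_package.yaml
# The emitted yaml is intended to be consumed by a separate import step that
# performs the actual database access.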

import hashlib
import optparse
import sys
import tarfile
import zlib

import lzma
import yaml

from dedup.arreader import ArReader
from dedup.debpkg import process_control, get_tar_hashes
from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
    HashedStream
from dedup.compression import GzipDecompressor, DecompressedStream
from dedup.image import GIFHash, PNGHash

boring_sha512_hashes = set((
    # ""
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # "\n"
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))

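# Hash factories: each call returns a fresh hash-like object (update() /
# hexdigest()) that is handed to get_tar_hashes() via the hash_functions list
# below; the .name attribute (set explicitly, or taken from the underlying
# hashlib object) labels the resulting digest in the per-file output.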
def sha512_nontrivial():
    return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)

def gziphash():
    hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
    hashobj.name = "gzip_sha512"
    # don't blacklist boring hashes for gzip to get gzip issues right
    return hashobj

def pnghash():
    hashobj = PNGHash(hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError,))
    hashobj.name = "png_sha512"
    return hashobj

def gifhash():
    hashobj = GIFHash(hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError,))
    hashobj.name = "gif_sha512"
    return hashobj

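# Walk the ar archive that makes up a .deb: control.tar.gz has to show up
# before the data.tar member (other members such as debian-binary are simply
# skipped).  Yields the parsed control data first, then one dict per file in
# data.tar, and finally the string "commit".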
def process_package(filelike, hash_functions):
    af = ArReader(filelike)
    af.read_magic()
    state = "start"
    while True:
        try:
            name = af.read_entry()
        except EOFError:
            raise ValueError("data.tar not found")
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                yield process_control(tf.extractfile(elem).read())
                break
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        elif name == "data.tar":
            tf = tarfile.open(fileobj=af, mode="r|")
        else:
            continue
        if state != "control_file":
            raise ValueError("missing control file")
        for name, size, hashes in get_tar_hashes(tf, hash_functions):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                # write the warning to stderr so it cannot end up in the yaml
                # stream emitted on stdout
                sys.stderr.write("warning: skipping filename with encoding error\n")
                continue # skip files with non-utf8 encoding for now
            yield dict(name=name, size=size, hashes=hashes)
        yield "commit"
        break

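# Same as process_package(), but wraps the input in a HashedStream so the
# sha256 of the complete .deb can be verified; the read() loop drains whatever
# process_package() left unconsumed so the digest covers the whole stream.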
def process_package_with_hash(filelike, hash_functions, sha256hash):
    hstream = HashedStream(filelike, hashlib.sha256())
    for elem in process_package(hstream, hash_functions):
        if elem == "commit":
            while hstream.read(4096):
                pass
            if hstream.hexdigest() != sha256hash:
                raise ValueError("hash sum mismatch")
            yield elem
            break
        yield elem

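# With -H/--hash the whole of stdin is checked against a sha256 digest before
# "commit" is emitted, e.g. (illustrative shell usage):
#     python importpkg.py -H "$(sha256sum pkg.deb | cut -d' ' -f1)" \
#         < pkg.deb > pkg.yaml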
def main():
    parser = optparse.OptionParser()
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin matches the given sha256 hash")
    options, args = parser.parse_args()
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
    if options.hash:
        gen = process_package_with_hash(sys.stdin, hash_functions, options.hash)
    else:
        gen = process_package(sys.stdin, hash_functions)
    yaml.safe_dump_all(gen, sys.stdout)

if __name__ == "__main__":
    main()