importpkg: add support for control.tar and control.tar.xz
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2 """This tool reads a Debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import hashlib
9 import optparse
10 import sys
11 import tarfile
12 import zlib
13
14 import lzma
15 import yaml
16
17 from dedup.arreader import ArReader
18 from dedup.debpkg import process_control, get_tar_hashes
19 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
20         HashBlacklistContent
21 from dedup.compression import GzipDecompressor, DecompressedStream
22 from dedup.image import GIFHash, PNGHash
23
# Content values handed to HashBlacklistContent by sha512_nontrivial() and
# gziphash() below: the empty string and a lone newline.
# NOTE(review): presumably hashes of exactly this content are not reported --
# confirm against dedup.hashing.HashBlacklistContent.
boring_content = set(("", "\n"))
25
def sha512_nontrivial():
    """Create a fresh sha512 hash object wrapped in HashBlacklistContent.

    The wrapper is given boring_content as its blacklist.
    """
    digest = hashlib.sha512()
    return HashBlacklistContent(digest, boring_content)
28
def gziphash():
    """Create the hash object named "gzip_sha512".

    The result is a sha512 fed through a GzipDecompressor (DecompressedHash),
    wrapped in SuppressingHash for ValueError and zlib.error, and finally in
    HashBlacklistContent with boring_content.
    """
    decompressor = GzipDecompressor()
    digest = hashlib.sha512()
    guarded = SuppressingHash(DecompressedHash(decompressor, digest),
                              (ValueError, zlib.error))
    # The name is set on the SuppressingHash layer, before blacklist wrapping.
    guarded.name = "gzip_sha512"
    return HashBlacklistContent(guarded, boring_content)
34
def pnghash():
    """Create the hash object named "png_sha512".

    The result is a PNGHash over sha512, wrapped in SuppressingHash for
    ValueError.
    """
    png_digest = SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,))
    png_digest.name = "png_sha512"
    return png_digest
40
def gifhash():
    """Create the hash object named "gif_sha512".

    The result is a GIFHash over sha512, wrapped in SuppressingHash for
    ValueError.
    """
    gif_digest = SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,))
    gif_digest.name = "gif_sha512"
    return gif_digest
46
def _open_tar_stream(af, name):
    """Open the current ar member as a streaming tarfile.

    The decompression is selected from the suffix of the member name: gzip
    and bzip2 are handled by tarfile itself, xz is decompressed through
    DecompressedStream, anything else is read as an uncompressed tar.
    """
    if name.endswith(".tar.gz"):
        return tarfile.open(fileobj=af, mode="r|gz")
    if name.endswith(".tar.bz2"):
        return tarfile.open(fileobj=af, mode="r|bz2")
    if name.endswith(".tar.xz"):
        zf = DecompressedStream(af, lzma.LZMADecompressor())
        return tarfile.open(fileobj=zf, mode="r|")
    return tarfile.open(fileobj=af, mode="r|")

def process_package(filelike, hash_functions):
    """Read a Debian package from filelike and generate its yaml documents.

    The ar archive is scanned member by member. A control tarball
    (control.tar, control.tar.gz or control.tar.xz) has to precede the data
    tarball (data.tar, data.tar.gz, data.tar.bz2 or data.tar.xz). All other
    ar members are skipped.

    Yields, in this order: the result of process_control() for the control
    file, one dict(name=..., size=..., hashes=...) for each entry reported by
    get_tar_hashes() on the data tarball, and finally the string "commit".
    Entries whose name is not valid utf8 are skipped with a warning on stderr.

    Raises ValueError if the archive ends before a data tarball was found, if
    a second control tarball is encountered, or if the data tarball is reached
    before a control file has been read.
    """
    control_names = ("control.tar", "control.tar.gz", "control.tar.xz")
    data_names = ("data.tar", "data.tar.gz", "data.tar.bz2", "data.tar.xz")
    af = ArReader(filelike)
    af.read_magic()
    state = "start"
    while True:
        try:
            name = af.read_entry()
        except EOFError:
            raise ValueError("data.tar not found")
        if name in control_names:
            new_state = "control"
        elif name in data_names:
            new_state = "data"
        else:
            continue
        # The tarball is opened before the state is validated, so a broken
        # archive is reported in preference to a misplaced one.
        tf = _open_tar_stream(af, name)
        if new_state == "control":
            if state != "start":
                raise ValueError("unexpected control.tar")
            state = "control"
            for elem in tf:
                # Members may be stored with or without the "./" prefix.
                if elem.name not in ("./control", "control"):
                    continue
                state = "control_file"
                yield process_control(tf.extractfile(elem).read())
                # Only the first control file is used; the remainder of the
                # control tarball is not inspected.
                break
            continue
        if state != "control_file":
            raise ValueError("missing control file")
        state = "data"
        for name, size, hashes in get_tar_hashes(tf, hash_functions):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                # stdout carries the yaml stream, so the warning must not be
                # written there.
                sys.stderr.write(
                    "warning: skipping filename with encoding error\n")
                continue # skip files with non-utf8 encoding for now
            yield dict(name=name, size=size, hashes=hashes)
        yield "commit"
        break
107
def process_package_with_hash(filelike, hash_functions, sha256hash):
    """Like process_package(), but verify the sha256 of the whole input.

    filelike is read through a HashedStream. Before the final "commit"
    document is passed on, the rest of the input is consumed and the hex
    digest is compared with sha256hash.

    Raises ValueError("hash sum mismatch") if the digests differ, in which
    case "commit" is never yielded.
    """
    hashed_input = HashedStream(filelike, hashlib.sha256())
    for document in process_package(hashed_input, hash_functions):
        if document != "commit":
            yield document
            continue
        # Consume whatever follows the data tarball so that the digest covers
        # the complete input.
        chunk = hashed_input.read(4096)
        while chunk:
            chunk = hashed_input.read(4096)
        if hashed_input.hexdigest() != sha256hash:
            raise ValueError("hash sum mismatch")
        yield document
        return
119
def main():
    """Command line entry point: package on stdin, yaml stream on stdout.

    With -H/--hash the sha256 hex digest of the input is verified through
    process_package_with_hash(); otherwise process_package() is used
    directly. Positional arguments are ignored.
    """
    parser = optparse.OptionParser()
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin has the given sha256 hash")
    options, args = parser.parse_args()
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
    if options.hash:
        gen = process_package_with_hash(sys.stdin, hash_functions, options.hash)
    else:
        gen = process_package(sys.stdin, hash_functions)
    # The generator is consumed lazily, so documents are dumped as they are
    # produced.
    yaml.safe_dump_all(gen, sys.stdout)
131
# Run the command line interface only when executed as a script.
if __name__ == "__main__":
    main()