e8cc2fa20f8a0535a1763af42cc41296e8fd0e27
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2 """This tool reads a Debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import hashlib
9 import optparse
10 import sys
11 import tarfile
12 import zlib
13
14 import yaml
15
16 from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
17 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
18         HashBlacklistContent
19 from dedup.compression import GzipDecompressor, decompress
20 from dedup.image import GIFHash, PNGHash
21
# Contents that are too trivial to be worth recording a hash for: the empty
# file and a file consisting of a single newline.  These are bytes literals
# because hashlib objects only accept bytes on Python 3 (on Python 2 bytes is
# str, so nothing changes there).  Presumably HashBlacklistContent compares the
# raw hashed data against this set -- verify against dedup.hashing.
boring_content = set((b"", b"\n"))
23
def sha512_nontrivial():
    """Create a new sha512 hash object wrapped in HashBlacklistContent with
    the module-level boring_content blacklist.
    """
    digest = hashlib.sha512()
    return HashBlacklistContent(digest, boring_content)
26
def gziphash():
    """Create a new hash object named "gzip_sha512".

    A sha512 object is combined with a GzipDecompressor via DecompressedHash,
    wrapped in SuppressingHash for ValueError and zlib.error, and finally
    wrapped in HashBlacklistContent with the boring_content blacklist.
    """
    decompressed = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    suppressing = SuppressingHash(decompressed, (ValueError, zlib.error))
    suppressing.name = "gzip_sha512"
    return HashBlacklistContent(suppressing, boring_content)
32
def pnghash():
    """Create a new hash object named "png_sha512": a PNGHash over sha512,
    wrapped in SuppressingHash for ValueError.
    """
    wrapped = SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,))
    wrapped.name = "png_sha512"
    return wrapped
38
def gifhash():
    """Create a new hash object named "gif_sha512": a GIFHash over sha512,
    wrapped in SuppressingHash for ValueError.
    """
    wrapped = SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,))
    wrapped.name = "gif_sha512"
    return wrapped
44
if sys.version_info.major >= 3:
    def decompress_tar(filelike, extension):
        """Open a (possibly compressed) tar archive for sequential reading.

        @param filelike: file-like object handed to decompress()
        @param extension: bytes remainder of the ar member name after
            "control.tar" / "data.tar"; decoded as ASCII and handed to
            decompress()
        @returns: tarfile.TarFile opened in stream mode ("r|")
        @raises UnicodeDecodeError: if extension is not ASCII
        """
        filelike = decompress(filelike, extension.decode("ascii"))
        # Pin the member name encoding.  The tarfile default follows the
        # locale, which would make the decoded names depend on the
        # environment the importer happens to run in.  Undecodable bytes are
        # kept as surrogate escapes and dealt with in decodetarname.
        return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
                            errors="surrogateescape")

    def decodetarname(name):
        """Decoded name of a tarinfo.

        @param name: member name as produced by tarfile, possibly containing
            surrogate escapes for bytes that could not be decoded
        @returns: the name as a str without surrogate escapes
        @raises UnicodeDecodeError: if the underlying bytes are not valid
            UTF-8
        """
        try:
            name.encode("utf8", "strict")
        except UnicodeEncodeError as e:
            if e.reason == "surrogates not allowed":
                # Recover the original bytes and decode them strictly.  The
                # result must be returned: the input still contains lone
                # surrogates and cannot be emitted as text.
                raw = name.encode("utf8", "surrogateescape")
                return raw.decode("utf8", "strict")
        return name
else:
    def decompress_tar(filelike, extension):
        """Open a (possibly compressed) tar archive for sequential reading.

        @param filelike: file-like object handed to decompress()
        @param extension: remainder of the ar member name after
            "control.tar" / "data.tar"; decoded as ASCII and handed to
            decompress()
        @returns: tarfile.TarFile opened in stream mode ("r|")
        """
        filelike = decompress(filelike, extension.decode("ascii"))
        return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
                            errors="surrogateescape")

    def decodetarname(name):
        """Decoded name of a tarinfo.
        @raises UnicodeDecodeError:
        """
        return name.decode("utf8")
71
class ProcessingFinished(Exception):
    """Raised by ImportpkgExtractor.handle_ar_member once the data.tar member
    has been handled; main() expects it as the regular end of extraction."""
74
class ImportpkgExtractor(DebExtractor):
    """Extractor that reports package metadata and per-file hashes.

    Every result is handed to the callback given to the constructor: first the
    value of process_control() for the control file, then one dict with the
    keys "name", "size" and "hashes" per entry yielded by get_tar_hashes().

    The state attribute tracks progress through the archive:
    "start" -> "control" (control.tar seen) -> "control_file" (control file
    processed) -> "data" (data.tar seen).
    """
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]

    def __init__(self, callback):
        """
        @param callback: callable invoked with each extracted result
        """
        self.state = "start"
        self.callback = callback

    def handle_ar_member(self, name, filelike):
        """Handle one member of the ar archive.

        Members other than control.tar* and data.tar* are ignored.

        @param name: member name as bytes
        @param filelike: file-like object for the member contents
        @raises ValueError: if control.tar shows up in an unexpected state or
            data.tar shows up before a control file was processed
        @raises ProcessingFinished: after data.tar has been handled
        """
        if name.startswith(b"control.tar"):
            if self.state != "start":
                raise ValueError("unexpected control.tar")
            self.state = "control"
            # Whatever follows "control.tar" selects the decompressor.
            tf = decompress_tar(filelike, name[11:])
            for elem in tf:
                if elem.name not in ("./control", "control"):
                    continue
                if self.state != "control":
                    raise ValueError("duplicate control file")
                self.state = "control_file"
                self.callback(process_control(tf.extractfile(elem).read()))
                break
        elif name.startswith(b"data.tar"):
            if self.state != "control_file":
                raise ValueError("missing control file")
            self.state = "data"
            # Whatever follows "data.tar" selects the decompressor.
            tf = decompress_tar(filelike, name[8:])
            entries = get_tar_hashes(tf, self.hash_functions)
            for filename, size, hashes in entries:
                try:
                    filename = decodetarname(filename)
                except UnicodeDecodeError:
                    # Warnings must not go to stdout: main() writes the yaml
                    # stream there and stray text would corrupt it.
                    sys.stderr.write(
                        "warning: skipping filename with encoding error\n")
                    continue # skip files with non-utf8 encoding for now
                self.callback(dict(name=filename, size=size, hashes=hashes))
            raise ProcessingFinished()

    def handle_ar_end(self):
        """Called at the end of the ar archive.

        @raises ValueError: if no data.tar member was seen
        """
        if self.state != "data":
            raise ValueError("data.tar not found")
113
def main():
    """Read a Debian package from stdin and write a yaml stream to stdout.

    The results produced by ImportpkgExtractor are emitted as yaml documents,
    followed by a final "commit" document.  With -H/--hash, stdin is wrapped
    in a HashedStream over sha256 and validated against the given value
    before "commit" is emitted.

    @raises RuntimeError: if the extractor returns without raising
        ProcessingFinished
    """
    parser = optparse.OptionParser()
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin has the given sha256 hash")
    options, _ = parser.parse_args()
    # The package is binary data: use the underlying byte stream on Python 3
    # and sys.stdin itself on Python 2, where it has no buffer attribute.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    dumper = yaml.SafeDumper(sys.stdout)
    dumper.open()
    if options.hash:
        stdin = HashedStream(stdin, hashlib.sha256())
    try:
        ImportpkgExtractor(dumper.represent).process(stdin)
    except ProcessingFinished:
        pass
    else:
        raise RuntimeError("unexpected termination of extractor")
    if options.hash:
        # Validate before "commit" is emitted, so that "commit" is only
        # written once validate() has returned.
        stdin.validate(options.hash)
    dumper.represent("commit")
    dumper.close()
137
# Only run the importer when executed as a script, not when imported.
if __name__ == "__main__":
    main()