add a class DebExtractor for guiding feature extraction
[~helmut/debian-dedup.git] / importpkg.py
1 #!/usr/bin/python
2 """This tool reads a Debian package from stdin and emits a yaml stream on
3 stdout.  It does not access a database. Therefore it can be run in parallel and
4 on multiple machines. The generated yaml contains multiple documents. The first
5 document contains package metadata. Then a document is emitted for each file.
6 And finally a document consisting of the string "commit" is emitted."""
7
8 import hashlib
9 import optparse
10 import sys
11 import tarfile
12 import zlib
13
14 import lzma
15 import yaml
16
17 from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
18 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
19         HashBlacklistContent
20 from dedup.compression import GzipDecompressor, DecompressedStream
21 from dedup.image import GIFHash, PNGHash
22
# Contents regarded as too trivial to record a hash for: the empty file and a
# file consisting of a single newline. These are bytes literals because the
# hashers are fed data read from the binary package stream; on Python 3 a str
# entry could never compare equal to such data (on Python 2, b"" is "").
# NOTE(review): assumes HashBlacklistContent compares the raw accumulated
# bytes against this set -- confirm against dedup.hashing.
boring_content = set((b"", b"\n"))
24
def sha512_nontrivial():
    """Build a fresh sha512 hasher wrapped in HashBlacklistContent.

    The wrapper receives boring_content as its second argument
    (presumably the set of contents for which no hash is reported --
    confirm against dedup.hashing).
    """
    plain_sha512 = hashlib.sha512()
    return HashBlacklistContent(plain_sha512, boring_content)
27
def gziphash():
    """Build a hasher named "gzip_sha512" for gzip-decompressed content.

    A sha512 object is combined with a GzipDecompressor via DecompressedHash,
    wrapped in SuppressingHash for ValueError and zlib.error, and finally
    wrapped in HashBlacklistContent with boring_content.
    """
    decompressing = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    # Presumably makes invalid gzip input yield no hash rather than an error
    # -- verify against dedup.hashing.SuppressingHash.
    tolerant = SuppressingHash(decompressing, (ValueError, zlib.error))
    tolerant.name = "gzip_sha512"
    return HashBlacklistContent(tolerant, boring_content)
33
def pnghash():
    """Build a hasher named "png_sha512".

    A PNGHash around sha512 is wrapped in SuppressingHash for ValueError
    (presumably so non-PNG input produces no hash -- confirm against
    dedup.image / dedup.hashing).
    """
    png_hasher = SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,))
    png_hasher.name = "png_sha512"
    return png_hasher
39
def gifhash():
    """Build a hasher named "gif_sha512".

    A GIFHash around sha512 is wrapped in SuppressingHash for ValueError
    (presumably so non-GIF input produces no hash -- confirm against
    dedup.image / dedup.hashing).
    """
    gif_hasher = SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,))
    gif_hasher.name = "gif_sha512"
    return gif_hasher
45
def decompress_tar(filelike, extension):
    """Open filelike as a streamed tar archive, chosen by its extension.

    extension is the bytes suffix following "….tar" in the member name:
    b"" (uncompressed), b".gz" and b".bz2" are handed to tarfile directly,
    while b".lzma" and b".xz" are first unpacked through a
    DecompressedStream using lzma.LZMADecompressor.

    Returns the tarfile object opened in stream ("r|…") mode.
    Raises ValueError for any other extension.
    """
    if extension in (b".lzma", b".xz"):
        # The stream is decompressed before tarfile sees it, so tarfile
        # reads it as a plain, uncompressed archive.
        filelike = DecompressedStream(filelike, lzma.LZMADecompressor())
        tar_mode = "r|"
    elif extension in (b"", b".gz", b".bz2"):
        # Drop the leading dot: b".gz" -> "gz", b"" -> "".
        tar_mode = "r|" + extension[1:].decode("ascii")
    else:
        raise ValueError("unknown compression format with extension %r" %
                         extension)
    return tarfile.open(fileobj=filelike, mode=tar_mode)
55
class ProcessingFinished(Exception):
    """Raised by ImportpkgExtractor once data.tar has been fully handled.

    main() catches it and treats it as the regular end of extraction.
    """
58
class ImportpkgExtractor(DebExtractor):
    """Extractor feeding package metadata and per-file hashes to a callback.

    The instance tracks its progress in self.state, which advances
    "start" -> "control" -> "control_file" -> "data" as the ar members of
    the package are handled. Members arriving out of that order raise
    ValueError. The handle_* methods are presumably invoked by
    DebExtractor.process -- confirm against dedup.debpkg.
    """

    def __init__(self, hash_functions, callback):
        """Store the hash factories and the output callback.

        hash_functions is handed to get_tar_hashes for the data.tar member.
        callback is called once with the result of process_control and once
        per data file with a dict holding "name", "size" and "hashes".
        """
        self.state = "start"
        self.hash_functions = hash_functions
        self.callback = callback

    def handle_ar_member(self, name, filelike):
        """Process one ar member; only control.tar* and data.tar* are used.

        name is the member name as bytes, filelike its content stream.
        Raises ValueError on unexpected member order and ProcessingFinished
        after data.tar has been consumed. Any other member is ignored.
        """
        if name.startswith(b"control.tar"):
            if self.state != "start":
                raise ValueError("unexpected control.tar")
            self.state = "control"
            # Everything after b"control.tar" is the compression extension.
            tf = decompress_tar(filelike, name[11:])
            for elem in tf:
                if elem.name not in ("./control", "control"):
                    continue
                # NOTE(review): the loop breaks after the first control file,
                # so this check cannot currently trigger.
                if self.state != "control":
                    raise ValueError("duplicate control file")
                self.state = "control_file"
                self.callback(process_control(tf.extractfile(elem).read()))
                break
        elif name.startswith(b"data.tar"):
            if self.state != "control_file":
                raise ValueError("missing control file")
            self.state = "data"
            # Everything after b"data.tar" is the compression extension.
            tf = decompress_tar(filelike, name[8:])
            for raw_name, size, hashes in get_tar_hashes(tf,
                                                         self.hash_functions):
                try:
                    filename = raw_name.decode("utf8")
                except UnicodeDecodeError:
                    # stdout carries the yaml stream, so the warning must go
                    # to stderr to avoid corrupting the output.
                    sys.stderr.write(
                        "warning: skipping filename with encoding error\n")
                    continue # skip files with non-utf8 encoding for now
                self.callback(dict(name=filename, size=size, hashes=hashes))
            raise ProcessingFinished()

    def handle_ar_end(self):
        """Raise ValueError if the archive ended before data.tar was seen."""
        if self.state != "data":
            raise ValueError("data.tar not found")
96
def main():
    """Read a Debian package from stdin and emit its yaml stream on stdout.

    With -H/--hash, stdin is wrapped in a HashedStream using sha256 and
    validate() is called with the given value after extraction (presumably
    raising on mismatch -- confirm against dedup.hashing). The stream ends
    with a document holding the string "commit".

    Raises RuntimeError if the extractor returns without signalling
    ProcessingFinished.
    """
    parser = optparse.OptionParser()
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin has given sha256 hash")
    # Positional arguments are not used by this tool.
    options, _ = parser.parse_args()
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
    # Python 3 exposes the binary stream as sys.stdin.buffer; on Python 2
    # sys.stdin has no such attribute and is used directly.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    dumper = yaml.SafeDumper(sys.stdout)
    dumper.open()
    if options.hash:
        stdin = HashedStream(stdin, hashlib.sha256())
    try:
        ImportpkgExtractor(hash_functions, dumper.represent).process(stdin)
    except ProcessingFinished:
        # Expected: raised once data.tar has been processed.
        pass
    else:
        raise RuntimeError("unexpected termination of extractor")
    if options.hash:
        stdin.validate(options.hash)
    dumper.represent("commit")
    dumper.close()
121
# Run the importer only when this file is executed as a script.
if __name__ == "__main__":
    main()