GzipDecompressor: don't treat checksum as garbage trailer
[~helmut/debian-dedup.git] / dedup / compression.py
1 import struct
2 import zlib
3
class GzipDecompressor(object):
    """An interface to gzip which is similar to bz2.BZ2Decompressor and
    lzma.LZMADecompressor.

    Data is fed incrementally through decompress(). The gzip header is
    parsed by hand and the deflate payload is handed to a raw zlib
    decompressor. A running CRC32 and length of the decompressed output are
    kept so that unused_data can recognize the gzip trailer.

    Note: bytes following the deflate stream are collected in inbuffer and,
    once at least 10 of them have accumulated, are run through the header
    check again. Two or more bytes after the 8 byte trailer therefore cause
    decompress() to raise ValueError unless they happen to start with the
    gzip magic.
    """
    def __init__(self):
        # Set once a complete gzip header has been parsed.
        self.sawheader = False
        # Before the header is complete: header bytes received so far.
        # After the deflate stream ended: the bytes that followed it.
        self.inbuffer = b""
        # Raw deflate decompressor; None while no deflate stream is active.
        self.decompressor = None
        # Running CRC32 and byte count of all decompressed output.
        self.crc = 0
        self.size = 0

    def decompress(self, data):
        """Feed compressed data and return the decompressed bytes it yields.

        An empty byte string is returned while the header is still
        incomplete.

        @param data: next chunk of the gzip stream
        @returns: decompressed data (possibly empty)
        @raises ValueError: if no gzip magic is found
        @raises zlib.error: from zlib invocations
        """
        while True:
            if self.decompressor:
                data = self.decompressor.decompress(data)
                self.crc = zlib.crc32(data, self.crc)
                self.size += len(data)
                unused_data = self.decompressor.unused_data
                if not unused_data:
                    return data
                # The deflate stream ended inside this chunk. Drop the
                # decompressor and route the remainder through the buffering
                # path below.
                self.decompressor = None
                return data + self.decompress(unused_data)
            self.inbuffer += data
            skip = 10  # fixed part of the header
            if len(self.inbuffer) < skip:
                return b""
            if not self.inbuffer.startswith(b"\037\213\010"):
                raise ValueError("gzip magic not found")
            # Slice instead of index: indexing bytes yields an int on
            # Python 3 and ord() would reject it.
            flag = ord(self.inbuffer[3:4])
            if flag & 4:  # FEXTRA: 2 byte little endian length + payload
                if len(self.inbuffer) < skip + 2:
                    return b""
                length, = struct.unpack("<H", self.inbuffer[skip:skip+2])
                skip += 2 + length
            for field in (8, 16):  # FNAME, FCOMMENT: NUL terminated strings
                if flag & field:
                    length = self.inbuffer.find(b"\0", skip)
                    if length < 0:
                        return b""
                    skip = length + 1
            if flag & 2:  # FHCRC: 2 byte header checksum (not verified)
                skip += 2
            if len(self.inbuffer) < skip:
                return b""
            # Header complete: everything behind it is deflate data, which is
            # processed by the next loop iteration.
            data = self.inbuffer[skip:]
            self.inbuffer = b""
            self.sawheader = True
            self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)

    @property
    def unused_data(self):
        """Bytes that were fed but are not part of the gzip stream.

        While a deflate stream is active this is zlib's own unused_data.
        Before a header was seen it is the buffered input. After the deflate
        stream ended, a buffer consisting of the matching CRC32/size trailer
        optionally followed by NUL bytes counts as fully consumed and yields
        an empty byte string; anything else is returned unchanged.
        """
        if self.decompressor:
            return self.decompressor.unused_data
        if not self.sawheader:
            return self.inbuffer
        # Both trailer fields are unsigned 32 bit values and the size is
        # stored modulo 2**32. Masking also copes with signed CRC values.
        expect = struct.pack("<LL", self.crc & 0xffffffff,
                             self.size & 0xffffffff)
        if self.inbuffer.startswith(expect) and \
                not self.inbuffer[len(expect):].strip(b"\0"):
            return b""
        return self.inbuffer

    def flush(self):
        """Return whatever the active zlib decompressor still holds back.

        @returns: remaining decompressed data, empty if no deflate stream is
            active
        @raises zlib.error: from zlib invocations
        """
        if not self.decompressor:
            return b""
        return self.decompressor.flush()

    def copy(self):
        """Return an independent GzipDecompressor in the same state."""
        new = GzipDecompressor()
        new.inbuffer = self.inbuffer
        if self.decompressor:
            new.decompressor = self.decompressor.copy()
        new.sawheader = self.sawheader
        new.crc = self.crc
        new.size = self.size
        return new
86
class DecompressedStream(object):
    """Turn a readable file-like into a decompressed file-like. The only part
    of being file-like consists of the read(size) method in both cases."""
    # Number of compressed bytes requested from fileobj per read call.
    blocksize = 65536

    def __init__(self, fileobj, decompressor):
        """
        @param fileobj: a file-like object providing read(size)
        @param decompressor: a bz2.BZ2Decompressor or lzma.LZMADecompressor
            like object providing a decompress method. A flush method is
            used at end of input if the object has one.
        """
        self.fileobj = fileobj
        self.decompressor = decompressor
        # Decompressed data that has not been returned to the caller yet.
        self.buff = b""

    def read(self, length=None):
        """Return up to length decompressed bytes.

        @param length: maximum number of bytes to return. None or a negative
            value reads until the underlying file-like reports end of input.
        @returns: decompressed data; shorter than length (possibly empty)
            only when the end of input was reached
        """
        if length is not None and length < 0:
            # File-like convention: a negative size means "read everything".
            # Used as a slice bound it would silently drop trailing bytes.
            length = None
        data = True
        while True:
            if length is not None and len(self.buff) >= length:
                ret = self.buff[:length]
                self.buff = self.buff[length:]
                return ret
            elif not data: # read EOF in last iteration
                ret = self.buff
                self.buff = b""
                return ret
            data = self.fileobj.read(self.blocksize)
            if data:
                self.buff += self.decompressor.decompress(data)
            elif hasattr(self.decompressor, "flush"):
                # bz2.BZ2Decompressor and lzma.LZMADecompressor have no
                # flush method, so only call it when it is available.
                self.buff += self.decompressor.flush()