move compression functions to module dedup.compression
[~helmut/debian-dedup.git] / dedup / compression.py
1 import struct
2 import zlib
3
class GzipDecompressor(object):
    """An interface to gzip which is similar to bz2.BZ2Decompressor and
    lzma.LZMADecompressor.

    Input may be fed in arbitrarily sized pieces. The gzip member header is
    parsed by hand and the deflate payload is handed to a raw zlib
    decompressor. Bytes following the end of the deflate payload (which
    includes the gzip trailer) are not interpreted as a trailer: they are fed
    back into header parsing and therefore end up in inbuffer until at least
    10 of them have accumulated.
    """
    def __init__(self):
        # Bytes collected while no deflate stream is active: either an
        # incomplete member header or whatever followed the last stream.
        self.inbuffer = b""
        # Raw-deflate zlib decompression object, or None while a header is
        # still being collected.
        self.decompressor = None

    def decompress(self, data):
        """Feed compressed data and return whatever could be decompressed.

        @param data: the next piece of the gzip stream as a byte string
        @returns: decompressed bytes; b"" while the header is incomplete
        @raises ValueError: if no gzip magic is found
        @raises zlib.error: from zlib invocations
        """
        while True:
            if self.decompressor:
                data = self.decompressor.decompress(data)
                unused_data = self.decompressor.unused_data
                if not unused_data:
                    return data
                # The deflate stream ended inside this input. Drop the
                # decompressor and treat the remainder as header material.
                self.decompressor = None
                return data + self.decompress(unused_data)
            self.inbuffer += data
            skip = 10  # size of the fixed part of a gzip member header
            if len(self.inbuffer) < skip:
                return b""
            # Two magic bytes followed by compression method 8 (deflate).
            if not self.inbuffer.startswith(b"\037\213\010"):
                raise ValueError("gzip magic not found")
            # Take a length-1 slice instead of indexing: indexing a bytes
            # object yields an int on Python 3, which ord() rejects, whereas
            # the slice is a length-1 string on both Python 2 and 3.
            flag = ord(self.inbuffer[3:4])
            if flag & 4:  # FEXTRA: 2-byte little-endian length plus payload
                if len(self.inbuffer) < skip + 2:
                    return b""
                length, = struct.unpack("<H", self.inbuffer[skip:skip+2])
                skip += 2 + length
            for field in (8, 16):  # FNAME, FCOMMENT: NUL-terminated strings
                if flag & field:
                    length = self.inbuffer.find(b"\0", skip)
                    if length < 0:
                        return b""  # terminator not received yet
                    skip = length + 1
            if flag & 2:  # FHCRC: 2-byte header checksum (not verified)
                skip += 2
            if len(self.inbuffer) < skip:
                return b""
            # Header complete: everything after it is deflate payload, which
            # the next loop iteration pushes through the new decompressor.
            data = self.inbuffer[skip:]
            self.inbuffer = b""
            self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)

    @property
    def unused_data(self):
        """Bytes that were supplied but not consumed as compressed data: the
        active decompressor's unused_data, or inbuffer when no decompressor
        is active."""
        if self.decompressor:
            return self.decompressor.unused_data
        else:
            return self.inbuffer

    def flush(self):
        """Return any output still held by the active decompressor, or b""
        when none is active.

        @raises zlib.error: from zlib invocations
        """
        if not self.decompressor:
            return b""
        return self.decompressor.flush()

    def copy(self):
        """Return an independent GzipDecompressor in the same state, so that
        both objects can continue to be fed separately."""
        new = GzipDecompressor()
        new.inbuffer = self.inbuffer
        if self.decompressor:
            new.decompressor = self.decompressor.copy()
        return new
71
class DecompressedStream(object):
    """Turn a readable file-like into a decompressed file-like. The only part
    of being file-like consists of the read(size) method in both cases."""
    # Number of compressed bytes requested from fileobj per read call.
    blocksize = 65536

    def __init__(self, fileobj, decompressor):
        """
        @param fileobj: a file-like object providing read(size)
        @param decompressor: a bz2.BZ2Decompressor or lzma.LZMADecompressor
            like object providing methods decompress and flush and an
            attribute unused_data
        """
        self.fileobj = fileobj
        self.decompressor = decompressor
        # Decompressed bytes that have not been handed to a caller yet.
        self.buff = b""

    def read(self, length=None):
        """Read decompressed data.

        @param length: maximum number of bytes to return. None or a negative
            value means to read until fileobj is exhausted.
        @returns: up to length bytes; fewer (possibly b"") only when fileobj
            reported end of file
        """
        if length is not None and length < 0:
            # Follow the file-like convention that a negative size means
            # "everything". Slicing with a negative length would otherwise
            # return the wrong bytes and leave a bogus remainder in buff.
            length = None
        data = True
        while True:
            if length is not None and len(self.buff) >= length:
                ret = self.buff[:length]
                self.buff = self.buff[length:]
                return ret
            elif not data: # read EOF in last iteration
                ret = self.buff
                self.buff = b""
                return ret
            data = self.fileobj.read(self.blocksize)
            if data:
                self.buff += self.decompressor.decompress(data)
            else:
                # End of input: collect whatever the decompressor still holds.
                self.buff += self.decompressor.flush()