4cc93577652c2265f3d54ecb74848887a73af438
[~helmut/debian-dedup.git] / dedup / filemagic.py
1 """A very strange "hash" that uses the magic module (python3-magic) to guess
2 the file type."""
3
4 import magic
5
class FileDigester(object):
    """A hashlib-like class to guess a filetype using the magic module.

    Data passed to update() is accumulated until at least FILE_BYTES_MAX
    bytes have been collected. At that point the identification is computed
    once, cached, and the buffer is released; later update() calls are
    ignored.

    Attributes:
        buff: The bytes collected so far, or None once the identification
            has been finalized. A None buffer is the finalization marker.
        identification: The cached identification, None until finalized.
    """
    FILE_BYTES_MAX = 1024 * 1024 # copied from file source

    def __init__(self):
        self.buff = b""
        self.identification = None

    def _compute_identification(self):
        """Run the magic lookup on the currently buffered bytes.

        Returns whatever magic.none_magic.buffer yields for the buffer, or
        a fixed placeholder string when that call raises
        UnicodeDecodeError.
        """
        try:
            return magic.none_magic.buffer(self.buff)
        except UnicodeDecodeError:
            return "magic identification is not valid UTF-8"

    def update(self, buff):
        """Feed more data (bytes) into the digester.

        Once the identification has been finalized, further data is
        silently ignored.
        """
        # The released buffer marks finalization. Testing the cached
        # identification for truthiness instead would treat an empty or
        # None result as "not finalized" and then fail on `None += buff`.
        if self.buff is None:
            return
        self.buff += buff
        # NOTE(review): the buffer can exceed FILE_BYTES_MAX by up to one
        # chunk here, so the bytes handed to magic depend on how the caller
        # chunks its input -- confirm whether truncation is wanted.
        if len(self.buff) >= self.FILE_BYTES_MAX:
            self.identification = self._compute_identification()
            self.buff = None

    def identify(self):
        """Return the guessed file magic identification.

        Returns the cached value when the digester has been finalized;
        otherwise computes an identification from the bytes buffered so
        far without caching it.
        """
        if self.buff is None:
            return self.identification
        return self._compute_identification()

    def hexdigest(self):
        """Compatibility with hashlib. An alias of identify. Doesn't return
        hex."""
        return self.identify()

    def copy(self):
        """Return a new FileDigester with the same state.

        The buffer is an immutable bytes object (or None), so handing the
        same object to the copy is safe: update() rebinds the attribute
        rather than mutating the object in place.
        """
        new = FileDigester()
        new.buff = self.buff
        new.identification = self.identification
        return new