hash image contents
author Helmut Grohne <helmut@subdivi.de>
Sun, 24 Feb 2013 00:03:30 +0000 (01:03 +0100)
committer Helmut Grohne <helmut@subdivi.de>
Sun, 24 Feb 2013 00:03:30 +0000 (01:03 +0100)
README
dedup/image.py [new file with mode: 0644]
importpkg.py

diff --git a/README b/README
index 7bc4517..aff9868 100644 (file)
--- a/README
+++ b/README
@@ -1,7 +1,7 @@
 Required packages
 -----------------
 
-aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3
+aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 python-imaging
 
 Create a database
 -----------------
diff --git a/dedup/image.py b/dedup/image.py
new file mode 100644 (file)
index 0000000..e05e7da
--- /dev/null
@@ -0,0 +1,67 @@
+import io
+import struct
+
+import PIL.Image
+
+class ImageHash(object):
+    """A hash on the contents of a PNG image. This disregards mode, depth and
+    meta information. Note that due to limitations in PIL and the image format
+    (interlacing) the full contents are stored and decoded in hexdigest."""
+    maxsize = 1024 * 1024 * 32
+    # max memory usage is about 5 * maxpixels in bytes
+    maxpixels = 1024 * 1024 * 32
+
+    def __init__(self, hashobj):
+        """@param hashobj: a hashlib-like object"""
+        self.hashobj = hashobj
+        self.imagedetected = False
+        self.content = io.BytesIO()
+
+    def update(self, data):
+        """Buffer data; raise ValueError if it is too large or not a PNG."""
+        self.content.write(data)
+        if self.content.tell() > self.maxsize:
+            raise ValueError("maximum image size exceeded")
+        if self.imagedetected or self.content.tell() < 33: # header + IHDR
+            return
+        curvalue = self.content.getvalue()
+        if not curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
+            raise ValueError("not a png image")
+        width, height = struct.unpack(">II", curvalue[16:24])
+        if width * height > self.maxpixels:
+            raise ValueError("maximum image pixels exceeded")
+        self.imagedetected = True
+
+    def copy(self):
+        """Return an independent copy that can be updated separately."""
+        # __init__ requires hashobj; a bare ImageHash() raises TypeError
+        new = ImageHash(self.hashobj.copy())
+        new.imagedetected = self.imagedetected
+        # write() leaves the position at the end, so later updates append
+        new.content.write(self.content.getvalue())
+        return new
+
+    def hexdigest(self):
+        """Return the hex digest over the RGBA pixels with width and height
+        appended as 8 hex digits each. Raises ValueError if no PNG was seen."""
+        if not self.imagedetected:
+            raise ValueError("not a png image")
+        hashobj = self.hashobj.copy()
+        pos = self.content.tell()
+        try:
+            self.content.seek(0)
+            img = PIL.Image.open(self.content)
+            width, height = img.size
+            pack = lambda elem: struct.pack("BBBB", *elem)
+            # special casing easy modes reduces memory usage
+            if img.mode == "L":
+                pack = lambda elem: struct.pack("BBBB", elem, elem, elem, 255)
+            elif img.mode == "RGB":
+                pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
+            elif img.mode != "RGBA":
+                img = img.convert("RGBA")
+            for elem in img.getdata():
+                hashobj.update(pack(elem))
+        finally:
+            self.content.seek(pos)
+        return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
index 5901b57..d626fba 100755 (executable)
@@ -20,6 +20,7 @@ import lzma
 
 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
 from dedup.compression import GzipDecompressor, DecompressedStream
+from dedup.image import ImageHash
 
 class ArReader(object):
     global_magic = b"!<arch>\n"
@@ -96,11 +97,17 @@ def gziphash():
     hashobj.name = "gzip_sha512"
     return HashBlacklist(hashobj, boring_sha512_hashes)
 
+def imagehash():
+    """Return an ImageHash over sha512 wrapped in SuppressingHash."""
+    wrapped = SuppressingHash(ImageHash(hashlib.sha512()), (ValueError,))
+    wrapped.name = "image_sha512"
+    return wrapped
+
 def get_hashes(tar):
     for elem in tar:
         if not elem.isreg(): # excludes hard links as well
             continue
-        hasher = MultiHash(sha512_nontrivial(), gziphash())
+        hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
         hasher = hash_file(hasher, tar.extractfile(elem))
         for hashobj in hasher.hashes:
             hashvalue = hashobj.hexdigest()