* Rename "image_sha512" to "png_sha512".
* dedup.image.ImageHash is now a base class for image hashes such as
PNGHash and GIFHash.
* Enable both hashes in importpkg.
* Fix README.
* Add new hash combinations to webapp.
* Add "gif file not named *.gif" to issues in update_sharing.
* Add redirect for "image_sha512" to webapp for backwards
compatibility.
Finding PNG images that do not carry a .png file extension.
- SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "image_sha512" AND lower(filename) NOT LIKE "%.png";
+ SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "png_sha512" AND lower(filename) NOT LIKE "%.png";
Finding .gz files which either are not gziped or contain errors.
import PIL.Image
class ImageHash(object):
- """A hash on the contents of an image. This disregards mode, depth and meta
- information. Note that due to limitations in PIL and the image format
- (interlacing) the full contents are stored and decoded in hexdigest."""
+ """A hash on the contents of an image data type supported by PIL. This
+ disregards mode, depth and meta information. Note that due to limitations
+ in PIL and the image format (interlacing) the full contents are stored and
+ decoded in hexdigest."""
maxsize = 1024 * 1024 * 32
# max memory usage is about 5 * maxpixels in bytes
maxpixels = 1024 * 1024 * 32
self.imagedetected = False
self.content = io.BytesIO()
+ def detect(self):
+ raise NotImplementedError
+
def update(self, data):
self.content.write(data)
if self.content.tell() > self.maxsize:
raise ValueError("maximum image size exceeded")
- if self.imagedetected:
- return
- if self.content.tell() < 33: # header + IHDR
- return
- curvalue = self.content.getvalue()
- if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
- width, height = struct.unpack(">II", curvalue[16:24])
- if width * height > self.maxpixels:
- raise ValueError("maximum image pixels exceeded")
- self.imagedetected = True
- return
- raise ValueError("not a png image")
+ if not self.imagedetected:
+ self.imagedetected = self.detect()
def copy(self):
- new = ImageHash()
- new.hashobj = self.hashobj.copy()
+ new = self.__class__(self.hashobj.copy())
new.imagedetected = self.imagedetected
new.content = io.BytesIO(self.content.getvalue())
return new
def hexdigest(self):
if not self.imagedetected:
- raise ValueError("not a png image")
+ raise ValueError("not a image")
hashobj = self.hashobj.copy()
pos = self.content.tell()
try:
try:
img = PIL.Image.open(self.content)
except IOError:
- raise ValueError("broken png header")
+ raise ValueError("broken header")
width, height = img.size
pack = lambda elem: struct.pack("BBBB", *elem)
# special casing easy modes reduces memory usage
elif img.mode != "RGBA":
try:
img = img.convert("RGBA")
- except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
- raise ValueError("error reading png image")
+ except (SyntaxError, IndexError, IOError):
+ # crazy stuff from PIL
+ raise ValueError("error reading image")
try:
for elem in img.getdata():
hashobj.update(pack(elem))
except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
- raise ValueError("error reading png image")
+ raise ValueError("error reading image")
finally:
self.content.seek(pos)
return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
+
+
+class PNGHash(ImageHash):
+ """A hash on the contents of a PNG image.
+
+ The image is recognized by the PNG signature immediately followed by an
+ IHDR chunk, from which the width and height are read.
+ """
+
+ def detect(self):
+ """Check whether the content buffered so far starts like a PNG file.
+
+ Returns False while fewer than 33 bytes are buffered and True once the
+ signature and IHDR chunk header have been seen with acceptable
+ dimensions. Raises ValueError if the content does not start with the
+ expected bytes or if width * height exceeds maxpixels.
+ """
+ if self.content.tell() < 33: # header + IHDR
+ return False
+ curvalue = self.content.getvalue()
+ # 8 byte signature, then the IHDR chunk length (13) and chunk type.
+ if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
+ # Width and height are two big-endian 32 bit unsigned integers.
+ width, height = struct.unpack(">II", curvalue[16:24])
+ if width * height > self.maxpixels:
+ raise ValueError("maximum image pixels exceeded")
+ return True
+ raise ValueError("not a png image")
+
+class GIFHash(ImageHash):
+    """A hash on the contents of the first frame of a GIF image.
+
+    The image is recognized by one of the two GIF signatures, which is
+    followed by the logical width and height used for the size check.
+    """
+
+    def detect(self):
+        """Check whether the content buffered so far starts like a GIF file.
+
+        Returns False while fewer than 10 bytes are buffered and True once a
+        GIF signature with acceptable dimensions has been seen. Raises
+        ValueError if the content does not start with a GIF signature or if
+        width * height exceeds maxpixels.
+        """
+        if self.content.tell() < 10: # magic + logical dimension
+            return False
+        curvalue = self.content.getvalue()
+        # Both prefixes must be bytes literals: mixing a text string into the
+        # tuple makes bytes.startswith raise TypeError on Python 3.
+        if curvalue.startswith((b"GIF87a", b"GIF89a")):
+            # Width and height are two little-endian 16 bit unsigned integers.
+            width, height = struct.unpack("<HH", curvalue[6:10])
+            if width * height > self.maxpixels:
+                raise ValueError("maximum image pixels exceeded")
+            return True
+        raise ValueError("not a gif image")
from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
HashedStream, hash_file
from dedup.compression import GzipDecompressor, DecompressedStream
-from dedup.image import ImageHash
+from dedup.image import GIFHash, PNGHash
class MultiHash(object):
def __init__(self, *hashes):
hashobj.name = "gzip_sha512"
return HashBlacklist(hashobj, boring_sha512_hashes)
-def imagehash():
- hashobj = ImageHash(hashlib.sha512())
+def pnghash():
+ hashobj = PNGHash(hashlib.sha512())
hashobj = SuppressingHash(hashobj, (ValueError,))
- hashobj.name = "image_sha512"
+ hashobj.name = "png_sha512"
+ return hashobj
+
+def gifhash():
+ hashobj = GIFHash(hashlib.sha512())
+ hashobj = SuppressingHash(hashobj, (ValueError,))
+ hashobj.name = "gif_sha512"
return hashobj
def get_hashes(tar):
for elem in tar:
if not elem.isreg(): # excludes hard links as well
continue
- hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
+ hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(),
+ gifhash())
hasher = hash_file(hasher, tar.extractfile(elem))
hashes = {}
for hashobj in hasher.hashes:
CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT);
CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL);
-INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("image_sha512");
+INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("png_sha512"), ("gif_sha512");
CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id));
CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
CREATE INDEX content_package_size_index ON content (pid, size);
[(row[1],) for row in rows])
process_pkgdict(cur, pkgdict)
cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
- cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'image_sha512' AND lower(filename) NOT LIKE '%.png';")
+ cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
+ cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
db.commit()
if __name__ == "__main__":
hash_functions = [
("sha512", "sha512"),
- ("image_sha512", "image_sha512"),
+ ("png_sha512", "png_sha512"),
+ ("png_sha512", "gif_sha512"),
+ ("gif_sha512", "png_sha512"),
+ ("gif_sha512", "gif_sha512"),
("gzip_sha512", "gzip_sha512"),
("sha512", "gzip_sha512"),
("gzip_sha512", "sha512")]
elif endpoint == "detail":
return self.show_detail(args["package1"], args["package2"])
elif endpoint == "hash":
+ if args["function"] == "image_sha512":
+ # backwards compatibility
+ raise RequestRedirect("%s/hash/png_sha512/%s" %
+ (request.environ["SCRIPT_NAME"],
+ args["hashvalue"]))
return self.show_hash(args["function"], args["hashvalue"])
elif endpoint == "index":
if not request.environ["PATH_INFO"]: