many improvements
author    Helmut Grohne <helmut@subdivi.de>
Wed, 20 Feb 2013 14:28:04 +0000 (15:28 +0100)
committer Helmut Grohne <helmut@subdivi.de>
Wed, 20 Feb 2013 14:28:04 +0000 (15:28 +0100)
 * multiple hash functions per file
 * jinja2 template engine
 * new package table (schema sketched below)
 * comparison view
 * hashvalue view
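
A minimal, self-contained sketch (not part of the commit itself) of the new
two-table schema and of the JOIN behind the new comparison view; the CREATE
statements and the SELECT are taken from the diff below, while the package
names, filenames, sizes and hash values are invented for illustration:

    import sqlite3

    db = sqlite3.connect(":memory:")
    # schema from the test.py docstring below
    db.executescript("""
    CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT);
    CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT, hash TEXT,
                          FOREIGN KEY (package) REFERENCES package(package));
    CREATE INDEX content_package_index ON content (package);
    CREATE INDEX content_hash_index ON content (hash);
    """)
    # invented example rows: two packages shipping an identical changelog
    db.executemany("INSERT INTO package (package, version, architecture) VALUES (?, ?, ?);",
                   [("foo", "1.0-1", "amd64"), ("bar", "1.0-1", "amd64")])
    db.executemany("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);",
                   [("foo", "./usr/share/doc/foo/changelog.gz", 1200, "gzip_sha512", "cafe"),
                    ("bar", "./usr/share/doc/bar/changelog.gz", 1100, "gzip_sha512", "cafe")])
    # the query run by show_detail() for /compare/<package1>/<package2>
    cur = db.execute("SELECT a.filename, b.filename, a.size, a.function, a.hash "
                     "FROM content AS a JOIN content AS b "
                     "ON a.function = b.function AND a.hash = b.hash "
                     "WHERE a.package = ? AND b.package = ? AND a.filename != b.filename;",
                     ("foo", "bar"))
    print cur.fetchall()

The "multiple hash functions" entry refers to MultiHash(sha512_nontrivial(), gziphash())
in get_hashes() below: every regular file is hashed both with plain sha512 and with
sha512 over gzip-decompressed content (function name gzip_sha512), so two packages
shipping the same file compressed with different gzip settings still produce a
matching row, as in the example above.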

test.py
webapp.py

diff --git a/test.py b/test.py
index ca9ed0b..733a52b 100755
--- a/test.py
+++ b/test.py
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 """
-CREATE TABLE content (package TEXT, version TEXT, architecture TEXT, filename TEXT, size INTEGER, hash TEXT);
+CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT);
+CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT, hash TEXT, FOREIGN KEY (package) REFERENCES package(package));
 CREATE INDEX content_package_index ON content (package);
 CREATE INDEX content_hash_index ON content (hash);
 """
@@ -12,6 +13,7 @@ import sqlite3
 import struct
 import sys
 import tarfile
+import zlib
 
 import apt_pkg
 import lzma
@@ -91,6 +93,123 @@ class XzStream(object):
             else:
                 self.buff += self.decomp.flush()
 
+class MultiHash(object):
+    def __init__(self, *hashes):
+        self.hashes = hashes
+
+    def update(self, data):
+        for hasher in self.hashes:
+            hasher.update(data)
+
+class HashBlacklist(object):
+    def __init__(self, hasher, blacklist=set()):
+        self.hasher = hasher
+        self.blacklist = blacklist
+        self.update = self.hasher.update
+        self.name = hasher.name
+
+    def hexdigest(self):
+        digest = self.hasher.hexdigest()
+        if digest in self.blacklist:
+            return None
+        return digest
+
+class GzipDecompressor(object):
+    def __init__(self):
+        self.inbuffer = b""
+        self.decompressor = None # zlib.decompressobj(-zlib.MAX_WBITS)
+
+    def decompress(self, data):
+        if self.decompressor:
+            data = self.decompressor.decompress(data)
+            if not self.decompressor.unused_data:
+                return data
+            unused_data = self.decompressor.unused_data
+            self.decompressor = None
+            return data + self.decompress(unused_data)
+        self.inbuffer += data
+        skip = 10
+        if len(self.inbuffer) < skip:
+            return b""
+        if not self.inbuffer.startswith(b"\037\213\010"):
+            raise ValueError("gzip magic not found")
+        flag = ord(self.inbuffer[3])
+        if flag & 4:
+            if len(self.inbuffer) < skip + 2:
+                return b""
+            length, = struct.unpack("<H", self.inbuffer[skip:skip+2])
+            skip += 2 + length
+        for field in (8, 16):
+            if flag & field:
+                length = self.inbuffer.find("\0", skip)
+                if length < 0:
+                    return b""
+                skip = length + 1
+        if flag & 2:
+            skip += 2
+        if len(self.inbuffer) < skip:
+            return b""
+        data = self.inbuffer[skip:]
+        self.inbuffer = b""
+        self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
+        return self.decompress(data)
+
+    @property
+    def unused_data(self):
+        if self.decompressor:
+            return self.decompressor.unused_data
+        else:
+            return self.inbuffer
+
+    def flush(self):
+        if not self.decompressor:
+            return b""
+        return self.decompressor.flush()
+
+    def copy(self):
+        new = GzipDecompressor()
+        new.inbuffer = self.inbuffer
+        if self.decompressor:
+            new.decompressor = self.decompressor.copy()
+        return new
+
+class DecompressedHash(object):
+    def __init__(self, decompressor, hashobj):
+        self.decompressor = decompressor
+        self.hashobj = hashobj
+
+    def update(self, data):
+        self.hashobj.update(self.decompressor.decompress(data))
+
+    def hexdigest(self):
+        if not hasattr(self.decompressor, "flush"):
+            return self.hashobj.hexdigest()
+        tmpdecomp = self.decompressor.copy()
+        data = tmpdecomp.flush()
+        tmphash = self.hashobj.copy()
+        tmphash.update(data)
+        return tmphash.hexdigest()
+
+class SuppressingHash(object):
+    def __init__(self, hashobj, exceptions=()):
+        self.hashobj = hashobj
+        self.exceptions = exceptions
+
+    def update(self, data):
+        if self.hashobj:
+            try:
+                self.hashobj.update(data)
+            except self.exceptions:
+                self.hashobj = None
+
+    def hexdigest(self):
+        if self.hashobj:
+            try:
+                return self.hashobj.hexdigest()
+            except self.exceptions:
+                self.hashobj = None
+        return None
+
 def hash_file(hashobj, filelike, blocksize=65536):
     data = filelike.read(blocksize)
     while data:
@@ -98,6 +217,21 @@ def hash_file(hashobj, filelike, blocksize=65536):
         data = filelike.read(blocksize)
     return hashobj
 
+boring_sha512_hashes = set((
+    # ""
+    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
+    # "\n"
+    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))
+
+def sha512_nontrivial():
+    return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)
+
+def gziphash():
+    hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
+    hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
+    hashobj.name = "gzip_sha512"
+    return HashBlacklist(hashobj, boring_sha512_hashes)
+
 def get_hashes(filelike):
     af = ArReader(filelike, lambda name: name.startswith("data.tar"))
     name, membersize = af.skiptillmember()
@@ -115,8 +249,12 @@ def get_hashes(filelike):
             continue
         if not elem.isreg(): # excludes hard links as well
             continue
-        hasher = hash_file(hashlib.sha512(), tf.extractfile(elem))
-        yield (elem.name, elem.size, hasher.hexdigest())
+        hasher = MultiHash(sha512_nontrivial(), gziphash())
+        hasher = hash_file(hasher, tf.extractfile(elem))
+        for hashobj in hasher.hashes:
+            hashvalue = hashobj.hexdigest()
+            if hashvalue:
+                yield (elem.name, elem.size, hashobj.name, hashvalue)
 
 def main():
     filename = sys.argv[1]
@@ -125,21 +263,22 @@ def main():
     db = sqlite3.connect("test.sqlite3")
     cur = db.cursor()
 
-    cur.execute("SELECT version FROM content WHERE package = ?;", (package,))
+    cur.execute("SELECT version FROM package WHERE package = ?;", (package,))
     versions = [tpl[0] for tpl in cur.fetchall()]
     versions.append(version)
     versions.sort(cmp=apt_pkg.version_compare)
     if versions[-1] != version:
         return # not the newest version
 
+    cur.execute("DELETE FROM package WHERE package = ?;", (package,))
     cur.execute("DELETE FROM content WHERE package = ?;", (package,))
-    #cur.execute("DELETE FROM content WHERE package = ? AND version = ? AND architecture = ?;",
-    #        (package, version, architecture))
+    cur.execute("INSERT INTO package (package, version, architecture) VALUES (?, ?, ?);",
+                (package, version, architecture))
     with open(filename) as pkg:
-        for name, size, hexhash in get_hashes(pkg):
+        for name, size, function, hexhash in get_hashes(pkg):
             name = name.decode("utf8")
-            cur.execute("INSERT INTO content (package, version, architecture, filename, size, hash) VALUES (?, ?, ?, ?, ?, ?);",
-                    (package, version, architecture, name, size, hexhash))
+            cur.execute("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);",
+                    (package, name, size, function, hexhash))
     db.commit()
 
 if __name__ == "__main__":
diff --git a/webapp.py b/webapp.py
index ec9c079..b745db2 100755
--- a/webapp.py
+++ b/webapp.py
@@ -3,12 +3,16 @@
 import sqlite3
 from wsgiref.simple_server import make_server
 
+import jinja2
 from werkzeug.debug import DebuggedApplication
 from werkzeug.exceptions import HTTPException, NotFound
 from werkzeug.routing import Map, Rule
 from werkzeug.wrappers import Request, Response
 
+jinjaenv = jinja2.Environment()
+
 def format_size(size):
+    assert isinstance(size, int)
     size = float(size)
     fmt = "%d B"
     if size >= 1024:
@@ -22,13 +26,63 @@ def format_size(size):
         fmt = "%.1f GB"
     return fmt % size
 
+jinjaenv.filters["format_size"] = format_size
+
+package_template = jinjaenv.from_string(
+"""<html><head><title>duplication of {{ package|e }}</title></head>
+<body><h1>{{ package|e }}</h1>
+<p>Version: {{ version|e }}</p>
+<p>Architecture: {{ architecture|e }}</p>
+<p>Number of files: {{ num_files }}</p>
+<p>Total size: {{ total_size|format_size }}</p>
+{%- if shared -%}
+    {%- for function, sharing in shared.items() -%}
+        <h3>sharing with respect to {{ function }}</h3>
+        <table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>
+        {%- for entry in sharing|sort(attribute="savable", reverse=true) -%}
+            <tr><td>{% if entry.package %}<a href="{{ entry.package|e }}">{{ entry.package|e }}</a>{% else %}self{% endif %}
+                <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td>
+            <td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td>
+            <td>{{ entry.savable|format_size }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr>
+        {%- endfor -%}
+        </table>
+    {%- endfor -%}
+{%- endif -%}
+{{ content }}
+</body></html>""")
+
+detail_template = jinjaenv.from_string(
+"""<html><head><title>sharing between {{ details1.package|e }} and {{ details2.package|e }}</title></head>
+<body><h1>{{ details1.package|e }} &lt;-&gt; {{ details2.package|e }}</h1>
+{%- if shared -%}
+<table border='1'><tr><th>size</th><th>filename in {{ details1.package|e }}</th><th>filename in {{ details2.package|e }}</th><th>hash functions</th></tr>
+    {%- for entry in shared|sort(attribute="size", reverse=true) -%}
+        <tr><td>{{ entry.size|format_size }}</td><td>{{ entry.filename1 }}</td><td>{{ entry.filename2 }}</td><td>
+        {%- for function, hashvalue in entry.functions.items() %}<a href="../../hash/{{ function|e }}/{{ hashvalue|e }}">{{ function|e }}</a> {% endfor %}</td></tr>
+    {%- endfor -%}
+</table>
+{%- endif -%}
+</body></html>""")
+
+hash_template = jinjaenv.from_string(
+"""<html><head><title>information on {{ function|e }} hash {{ hashvalue|e }}</title></head>
+<body><h1>{{ function|e }} {{ hashvalue|e }}</h1>
+<table border='1'><tr><th>package</th><th>filename</th><th>size</th></tr>
+{%- for entry in entries -%}
+    <tr><td><a href="../../binary/{{ entry.package|e }}">{{ entry.package|e }}</a></td>
+    <td>{{ entry.filename|e }}</td><td>{{ entry.size|format_size }}</td></tr>
+{%- endfor -%}
+</table>
+</body></html>""")
+
 class Application(object):
     def __init__(self):
         self.db = sqlite3.connect("test.sqlite3")
         self.cur = self.db.cursor()
         self.routingmap = Map([
-            Rule("/<package>", methods=("GET",),
-                 endpoint="package"),
+            Rule("/binary/<package>", methods=("GET",), endpoint="package"),
+            Rule("/compare/<package1>/<package2>", methods=("GET",), endpoint="detail"),
+            Rule("/hash/<function>/<hashvalue>", methods=("GET",), endpoint="hash"),
         ])
 
     @Request.application
@@ -36,55 +90,95 @@ class Application(object):
         mapadapter = self.routingmap.bind_to_environ(request.environ)
         try:
             endpoint, args = mapadapter.match()
-            assert endpoint == "package"
-            return self.show_package(args["package"])
+            if endpoint == "package":
+                return self.show_package(args["package"])
+            elif endpoint == "detail":
+                return self.show_detail(args["package1"], args["package2"])
+            elif endpoint == "hash":
+                return self.show_hash(args["function"], args["hashvalue"])
+            raise NotFound()
         except HTTPException as e:
             return e
 
-    def show_package(self, package):
-        self.cur.execute("SELECT version, architecture FROM content WHERE package = ? LIMIT 1;", (package,))
+    def get_details(self, package):
+        self.cur.execute("SELECT version, architecture FROM package WHERE package = ?;",
+                         (package,))
         row = self.cur.fetchone()
         if not row:
             raise NotFound()
         version, architecture = row
-        self.cur.execute("SELECT count(filename) FROM content WHERE package = ?;", (package,))
-        num_files = self.cur.fetchone()[0]
-        self.cur.execute("SELECT sum(size) FROM content WHERE package = ?;", (package,))
-        total_size = self.cur.fetchone()[0]
-        content = "<p>Version: %s</p><p>Architecture: %s</p>" % (version, architecture)
-        content += "<p>Number of files: %d</p>" % num_files
-        content += "<p>Total size: %s</p>" % format_size(total_size)
+        details = dict(package=package,
+                       version=version,
+                       architecture=architecture)
+        self.cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ?;",
+                         (package,))
+        num_files, total_size = self.cur.fetchone()
+        details.update(dict(num_files=num_files, total_size=total_size))
+        return details
+
+    def show_package(self, package):
+        params = self.get_details(package)
 
         shared = dict()
-        self.cur.execute("SELECT a.filename, a.hash, a.size, b.package FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND (a.filename != b.filename OR b.package != ?);", (package, package))
-        for afile, hashval, size, bpkg in self.cur.fetchall():
-            shared.setdefault(bpkg, dict()).setdefault(hashval, (size, set()))[1].add(afile)
+        self.cur.execute("SELECT a.filename, a.function, a.hash, a.size, b.package FROM content AS a JOIN content AS b ON a.function = b.function AND a.hash = b.hash WHERE a.package = ? AND (a.filename != b.filename OR b.package != ?);",
+                         (package, package))
+        for afile, function, hashval, size, bpkg in self.cur.fetchall():
+            pkgdict = shared.setdefault(function, dict())
+            hashdict = pkgdict.setdefault(bpkg, dict())
+            fileset = hashdict.setdefault(hashval, (size, set()))[1]
+            fileset.add(afile)
+        sharedstats = {}
         if shared:
-            sharedstats = []
-            mapping = shared.pop(package, dict())
-            if mapping:
-                duplicate = sum(len(files) for _, files in mapping.values())
-                savable = sum(size * (len(files) - 1) for size, files in mapping.values())
-                sharedstats.append(("self", duplicate, savable))
-            for pkg, mapping in shared.items():
-                pkglink = '<a href="%s">%s</a>' % (pkg, pkg)
-                duplicate = sum(len(files) for _, files in mapping.values())
-                savable = sum(size * len(files) for size, files in mapping.values())
-                sharedstats.append((pkglink, duplicate, savable))
-            sharedstats.sort(key=lambda row: row[2], reverse=True)
-            content += "<table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>"
-            for pkg, duplicate, savable in sharedstats:
-                content += "<tr><td>%s</td><td>%d (%d%%)</td><td>%s (%d%%)</td></tr>" % (pkg, duplicate, 100. * duplicate / num_files, format_size(savable), 100. * savable / total_size)
-            content += "</table>"
-
-        r = Response(content_type="text/html")
-        r.data = "<html><head><title>duplication of %(package)s</title></head><body><h1>%(package)s</h1>%(content)s</body></html>" % dict(package=package, content=content)
-        return r
+            for function, sharing in shared.items():
+                sharedstats[function] = list()
+                mapping = sharing.pop(package, dict())
+                if mapping:
+                    duplicate = sum(len(files) for _, files in mapping.values())
+                    savable = sum(size * (len(files) - 1) for size, files in mapping.values())
+                    sharedstats[function].append(dict(package=None, duplicate=duplicate, savable=savable))
+                for pkg, mapping in sharing.items():
+                    duplicate = sum(len(files) for _, files in mapping.values())
+                    savable = sum(size * len(files) for size, files in mapping.values())
+                    sharedstats[function].append(dict(package=pkg, duplicate=duplicate, savable=savable))
+
+        params["shared"] = sharedstats
+        return Response(package_template.render(**params).encode("utf8"),
+                        content_type="text/html")
+
+    def show_detail(self, package1, package2):
+        details1 = self.get_details(package1)
+        details2 = self.get_details(package2)
+
+        self.cur.execute("SELECT a.filename, b.filename, a.size, a.function, a.hash FROM content AS a JOIN content AS b ON a.function = b.function AND a.hash = b.hash WHERE a.package = ? AND b.package = ? AND a.filename != b.filename;",
+                        (package1, package2))
+        shared = dict()
+        for filename1, filename2, size, function, hashvalue in self.cur.fetchall():
+            shared.setdefault((filename1, filename2, size), dict())[function] = hashvalue
+        shared = [dict(filename1=filename1, filename2=filename2, size=size,
+                       functions=functions)
+                  for (filename1, filename2, size), functions in shared.items()]
+        params = dict(
+            details1=details1,
+            details2=details2,
+            shared=shared)
+        return Response(detail_template.render(**params).encode("utf8"),
+                        content_type="text/html")
+
+    def show_hash(self, function, hashvalue):
+        self.cur.execute("SELECT package, filename, size FROM content WHERE function = ? AND hash = ?;",
+                         (function, hashvalue))
+        entries = [dict(package=package, filename=filename, size=size)
+                   for package, filename, size in self.cur.fetchall()]
+        if not entries:
+            raise NotFound()
+        params = dict(function=function, hashvalue=hashvalue, entries=entries)
+        return Response(hash_template.render(**params).encode("utf8"),
+                        content_type="text/html")
 
 def main():
     app = Application()
-    app = DebuggedApplication(app, evalex=True)
-    make_server("localhost", 8800, app).serve_forever()
+    #app = DebuggedApplication(app, evalex=True)
+    make_server("0.0.0.0", 8800, app).serve_forever()
 
 if __name__ == "__main__":
     main()