webapp: reduce size of comparison output
[~helmut/debian-dedup.git] / webapp.py
1 #!/usr/bin/python
2
3 import datetime
4 import os.path
5 import sqlite3
6 from wsgiref.simple_server import make_server
7
8 import jinja2
9 from werkzeug.exceptions import HTTPException, NotFound
10 from werkzeug.routing import Map, Rule, RequestRedirect
11 from werkzeug.wrappers import Request, Response
12 from werkzeug.wsgi import SharedDataMiddleware
13
14 from dedup.utils import fetchiter
15
# Pairs (function of the first file, function of the second file) of hash
# function names. Code in this module skips any combination of functions that
# is not listed here when looking at matching hash values.
hash_functions = [
    ("sha512", "sha512"),
    ("image_sha512", "image_sha512"),
    ("gzip_sha512", "gzip_sha512"),
    ("sha512", "gzip_sha512"),
    ("gzip_sha512", "sha512"),
]
22
# Shared template environment. Named templates (e.g. "base.html") are loaded
# from "." -- a relative path, so presumably the process's working directory.
jinjaenv = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
24
def format_size(size):
    """Render a byte count as a short human readable string.

    The value is converted to float and divided by 1024 for as long as it is
    at least 1024, at most three times. Plain bytes are printed as an integer
    ("%d B"); KB, MB and GB are printed with one decimal place. Values of
    1024 GB and more are still expressed in GB.

    :param size: number of bytes; anything accepted by float().
    :returns: formatted string such as "1.5 KB".
    """
    value = float(size)
    template = "%d B"
    for bigger_unit in ("%.1f KB", "%.1f MB", "%.1f GB"):
        # Written as a negated ">=" so that NaN behaves like the unit-by-unit
        # comparison chain would (it never advances to a bigger unit).
        if not value >= 1024:
            break
        value /= 1024
        template = bigger_unit
    return template % value
38
def function_combination(function1, function2):
    """Return a display label for a pair of hash function names.

    :returns: the bare name when both functions are equal, otherwise
        "<function1> -> <function2>".
    """
    if function1 != function2:
        return "%s -> %s" % (function1, function2)
    return function1
43
# Expose format_size to templates as the "format_size" filter.
jinjaenv.filters["format_size"] = format_size

# Loads "base.html" at import time. The name base_template is not referenced
# by the other code visible in this file; the page templates pull in
# "base.html" themselves through {% extends %}.
base_template = jinjaenv.get_template("base.html")
47
# Page for a single binary package (rendered by Application.show_package).
# Expected variables: package, version, architecture, num_files, total_size,
# dependencies (container of package names), shared (mapping from a hash
# function label to a list of entries with the keys package, duplicate and
# savable; package is None for sharing within the package itself) and urlroot.
package_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}duplication of {{ package|e }}{% endblock %}
{% block content %}<h1>{{ package|e }}</h1>
<p>Version: {{ version|e }}</p>
<p>Architecture: {{ architecture|e }}</p>
<p>Number of files: {{ num_files }}</p>
<p>Total size: {{ total_size|format_size }}</p>
{%- if shared -%}
    {%- for function, sharing in shared.items() -%}
        <h3>sharing with respect to {{ function|e }}</h3>
        <table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>
        {%- for entry in sharing|sort(attribute="savable", reverse=true) -%}
            <tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}>
                {%- if entry.package %}<a href="{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a>{% else %}self{% endif %}
                <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td>
            <td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td>
            <td>{{ entry.savable|format_size }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr>
        {%- endfor -%}
        </table>
    {%- endfor -%}
<p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
{%- endif -%}
{% endblock %}""")
72
# Comparison page for two binary packages (streamed by
# Application.show_detail). Expected variables: details1 and details2 (dicts
# with at least a "package" key), urlroot and shared, an iterable of entries
# with the keys size, filenames (files of the first package) and matches
# (mapping from a filename of the second package to a mapping from
# (function1, function2) pairs to hash values). An entry with several matches
# spans several table rows, hence the rowspan handling.
detail_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%}
{% block content %}
<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1>
<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr>
<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr>
{%- for entry in shared -%}
    <tr><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|format_size }}</td><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>
    {%- for filename in entry.filenames %}<span class="filename">{{ filename|e }}</span>{% endfor -%}</td><td>
    {% for filename, match in entry.matches.items() -%}
        {% if not loop.first %}<tr><td>{% endif -%}
        {%- for funccomb, hashvalue in match.items() -%}
            <a href="../../hash/{{ funccomb[0]|e }}/{{ hashvalue|e }}">{{ funccomb[0]|e }}</a>
            {%- if funccomb[0] != funccomb[1] %} -&gt; <a href="../../hash/{{ funccomb[1]|e }}/{{ hashvalue|e }}">{{ funccomb[1]|e }}</a>{% endif %}
            {%- if not loop.last %}, {% endif %}
        {%- endfor -%}
        </td><td><span class="filename">{{ filename|e }}</span></td></tr>
    {%- endfor -%}
{%- endfor -%}
</table>
{% endblock %}""")
95
# Page listing the files that carry a given hash value (rendered by
# Application.show_hash). Expected variables: function, hashvalue, urlroot
# and entries, a list of dicts with the keys package, filename, size and
# function. The last column is only filled in when the entry was hashed with
# a function other than the requested one.
hash_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}information on {{ function|e }} hash {{ hashvalue|e }}{% endblock %}
{% block content %}
<h1>{{ function|e }} {{ hashvalue|e }}</h1>
<table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr>
{%- for entry in entries -%}
    <tr><td><a href="../../binary/{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a></td>
    <td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|format_size }}</td>
    <td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr>
{%- endfor -%}
</table>
{% endblock %}""")
109
# Landing page (rendered for the "index" endpoint with urlroot="").
# The embedded script turns the package name typed into the form into a link
# to binary/<name>. The link is built relative to the index page -- like the
# static example links below -- so that it keeps working when the application
# is mounted below a URL prefix, and the name is percent-encoded so that
# unusual characters cannot alter the URL structure. The displayed permanent
# link is the browser-resolved absolute form of that same link.
index_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}Debian duplication detector{% endblock %}
{% block header %}
    <script type="text/javascript">
        function getLinkTarget() {
            var pkg = document.getElementById("pkg_name").value;
            if(pkg) {
                return "binary/"+encodeURIComponent(pkg);
            }
            return '#';
        }
        function processData() {
            var link = document.getElementById("perma_link");
            link.href = getLinkTarget();
            link.text = link.href;
        }
        window.onload = function() {
            document.getElementById('pkg_name').onkeyup = processData;
            document.getElementById("pkg_form").onsubmit = function () {
                location.href = getLinkTarget();
                return false;
            }
            processData();
            document.getElementById("form_div").style.display = '';
        }
    </script>
{% endblock %}
{% block content %}
<h1>Debian duplication detector</h1>
<ul>
<li>To inspect a particular binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
    <div style="display:none" id="form_div"><fieldset>
            <legend>Inspect package</legend>
            <noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>
            Enter binary package to inspect - Note: Non-existing packages will result in <b>404</b>-Errors
            <form id="pkg_form">
                <label for="pkg_name">Name: <input type="text" size="30" name="pkg_name" id="pkg_name"></label>
                <input type="submit" value="Go"> Permanent Link: <a id="perma_link" href="#"></a>
            </form>
    </fieldset></div></li>
<li>To inspect a combination of binary packages go to <pre>compare/&lt;firstpackage&gt;/&lt;secondpackage&gt;</pre> Example: <a href="compare/git/git">compare/git/git</a></li>
<li>To discover packages shipping a particular file go to <pre>hash/sha512/&lt;hashvalue&gt;</pre> Example: <a href="hash/sha512/ed94df7781793f06f9426a600c1bde86397afc7b35cb3aa11b60214bd31e35ad893b53a04a2cf4676154982d7c204c4aa165d6ccdaac0170031364a05dbab3bc">hash/sha512/ed94df7781793f06f9426a600c1bde86397afc7b35cb3aa11b60214bd31e35ad893b53a04a2cf4676154982d7c204c4aa165d6ccdaac0170031364a05dbab3bc</a></li>
</ul>
{% endblock %}""")
155
# Overview page for a source package (rendered by Application.show_source).
# Expected variables: source, urlroot and packages, a mapping from each
# binary package name to either None (nothing to report) or a dict with at
# least the keys package (the other binary package) and savable.
# Every opened <a> is closed inside its table cell; an unclosed link in the
# first column would leave the markup unbalanced.
source_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}overview of {{ source|e }}{% endblock %}
{% block content %}
<h1>overview of {{ source|e }}</h1>
<table border='1'><tr><th>binary from {{ source|e }}</th><th>savable</th><th>other package</th></tr>
{% for package, sharing in packages.items() %}
    <tr><td><a href="../binary/{{ package|e }}"><span class="binary-package">{{ package|e }}</span></a></td><td>
    {%- if sharing -%}
        {{ sharing.savable|format_size }}</td><td><a href="../binary/{{ sharing.package|e }}"><span class="binary-package">{{ sharing.package|e }}</span></a> <a href="../compare/{{ package|e }}/{{ sharing.package|e }}">compare</a>
    {%- else -%}</td><td>{%- endif -%}
    </td></tr>
{% endfor %}
</table>
<p>Note: Not all sharing listed here. Click on binary packages with non-zero savable to see more.</p>
{% endblock %}""")
172
def encode_and_buffer(iterator):
    """Encode text pieces as UTF-8 and regroup them into larger chunks.

    Pieces are accumulated until at least 2048 encoded bytes are pending;
    the accumulated bytes are then yielded as one chunk. Whatever is left
    once the input is exhausted is yielded as a final, possibly shorter,
    chunk. Nothing is yielded for empty input.

    :param iterator: iterable of text strings.
    :returns: generator of byte strings.
    """
    pending = []
    pending_size = 0
    for text in iterator:
        encoded = text.encode("utf8")
        pending.append(encoded)
        pending_size += len(encoded)
        if pending_size >= 2048:
            yield b"".join(pending)
            pending = []
            pending_size = 0
    if pending_size:
        yield b"".join(pending)
182
def html_response(unicode_iterator, max_age=24 * 60 * 60):
    """Build a cacheable text/html response from an iterable of text.

    :param unicode_iterator: iterable of text pieces; they are UTF-8 encoded
        and regrouped by encode_and_buffer.
    :param max_age: cache lifetime in seconds, used for both the
        Cache-Control max-age directive and the Expires header.
    :returns: a werkzeug Response.
    """
    resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
    resp.cache_control.max_age = max_age
    # The Expires header is expressed in GMT and a naive datetime is written
    # out as-is, so the timestamp has to be taken in UTC, not in local time.
    resp.expires = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds=max_age)
    return resp
188
def generate_shared(rows):
    """Group consecutive rows describing the same pair of files.

    Documented in the original as an internal helper of show_detail.

    :param rows: iterable of 7-tuples (filename1, size1, func1, filename2,
        size2, func2, hashvalue). Rows whose (func1, func2) pair is not
        listed in hash_functions are ignored. Only adjacent rows are merged,
        so rows for the same file pair have to arrive next to each other.
    :returns: generator of dicts with the keys filename1, filename2, size1,
        size2 and functions, the latter mapping (func1, func2) to the hash
        value.
    """
    current = None
    for filename1, size1, func1, filename2, size2, func2, hashvalue in rows:
        pair = (func1, func2)
        if pair not in hash_functions:
            continue
        if current is not None:
            previous_files = (current["filename1"], current["filename2"])
            if previous_files != (filename1, filename2):
                # A different pair of files starts: emit the finished group.
                yield current
                current = None
        if current is None:
            current = dict(filename1=filename1, filename2=filename2,
                           size1=size1, size2=size2, functions=dict())
        current["functions"][pair] = hashvalue
    if current is not None:
        yield current
209
class Application(object):
    """WSGI application rendering duplication reports from a database.

    The URL layout is "/" (index), "/binary/<package>",
    "/compare/<package1>/<package2>", "/hash/<function>/<hashvalue>" and
    "/source/<package>". Only GET requests are routed.
    """

    def __init__(self, db):
        """Remember the database connection and set up the URL routing.

        :param db: database connection providing cursor(); main() passes a
            sqlite3 connection.
        """
        self.db = db
        self.routingmap = Map([
            Rule("/", methods=("GET",), endpoint="index"),
            Rule("/binary/<package>", methods=("GET",), endpoint="package"),
            Rule("/compare/<package1>/<package2>", methods=("GET",), endpoint="detail"),
            Rule("/hash/<function>/<hashvalue>", methods=("GET",), endpoint="hash"),
            Rule("/source/<package>", methods=("GET",), endpoint="source"),
        ])

    @Request.application
    def __call__(self, request):
        """Dispatch a request to the handler of the matching endpoint.

        HTTP exceptions raised while routing or handling (NotFound,
        redirects, ...) are returned as the response.
        """
        mapadapter = self.routingmap.bind_to_environ(request.environ)
        try:
            endpoint, args = mapadapter.match()
            if endpoint == "package":
                return self.show_package(args["package"])
            elif endpoint == "detail":
                return self.show_detail(args["package1"], args["package2"])
            elif endpoint == "hash":
                return self.show_hash(args["function"], args["hashvalue"])
            elif endpoint == "index":
                # The index page uses relative links and therefore needs a
                # trailing slash. WSGI servers are allowed to leave out
                # PATH_INFO and SCRIPT_NAME when they are empty, so neither
                # key may be assumed to exist.
                if not request.environ.get("PATH_INFO"):
                    raise RequestRedirect(
                            request.environ.get("SCRIPT_NAME", "") + "/")
                return html_response(index_template.render(dict(urlroot="")))
            elif endpoint == "source":
                return self.show_source(args["package"])
            raise NotFound()
        except HTTPException as e:
            return e

    def get_details(self, package):
        """Look up basic facts about a binary package.

        :returns: dict with the keys package, version, architecture,
            num_files and total_size (0 when the package has no content
            rows).
        :raises NotFound: if the package is not in the package table.
        """
        cur = self.db.cursor()
        cur.execute("SELECT version, architecture FROM package WHERE package = ?;",
                    (package,))
        row = cur.fetchone()
        if not row:
            raise NotFound()
        version, architecture = row
        details = dict(package=package,
                       version=version,
                       architecture=architecture)
        cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ?;",
                    (package,))
        num_files, total_size = cur.fetchone()
        # sum() over zero rows yields NULL.
        if total_size is None:
            total_size = 0
        details.update(dict(num_files=num_files, total_size=total_size))
        return details

    def get_dependencies(self, package):
        """Return the set of package names recorded as required by package."""
        cur = self.db.cursor()
        cur.execute("SELECT required FROM dependency WHERE package = ?;",
                    (package,))
        return set(row[0] for row in fetchiter(cur))

    def cached_sharedstats(self, package):
        """Read the sharing statistics of a package from the sharing table.

        :returns: dict mapping a hash function label (see
            function_combination) to a list of dicts with the keys package,
            duplicate and savable. package is None for rows describing
            sharing within the package itself. Function pairs not listed in
            hash_functions are left out.
        """
        cur = self.db.cursor()
        sharedstats = {}
        cur.execute("SELECT package2, func1, func2, files, size FROM sharing WHERE package1 = ?;",
                    (package,))
        for package2, func1, func2, files, size in fetchiter(cur):
            if (func1, func2) not in hash_functions:
                continue
            curstats = sharedstats.setdefault(
                    function_combination(func1, func2), list())
            if package2 == package:
                package2 = None
            curstats.append(dict(package=package2, duplicate=files, savable=size))
        return sharedstats

    def show_package(self, package):
        """Render the page for one binary package.

        :raises NotFound: if the package is unknown (via get_details).
        """
        params = self.get_details(package)
        params["dependencies"] = self.get_dependencies(package)
        params["shared"] = self.cached_sharedstats(package)
        params["urlroot"] = ".."
        return html_response(package_template.render(params))

    def compute_comparison(self, package1, package2):
        """Compute a sequence of comparison objects ordered by the size of the
        object in the first package. Each element of the sequence is a dict
        defining the following keys:
         * filenames: A set of filenames in package1 all referring to the
           same object.
         * size: Size of the object in bytes.
         * matches: A mapping from filenames in package2 to a mapping from
           hash function pairs to hash values.

        This is a generator. The database cursors it uses are closed even
        when the consumer stops iterating before the end.
        """
        cur = self.db.cursor()
        try:
            cur.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE package = ? AND function = 'sha512' ORDER BY size DESC;",
                        (package1,))
            cursize = -1
            # Objects of the current size, keyed by their sha512 value.
            files = dict()
            # When a package is compared with itself every file matches
            # itself, so a second matching filename is required.
            minmatch = 2 if package1 == package2 else 1
            for cid, filename, size, hashvalue in fetchiter(cur):
                if cursize != size:
                    # Rows arrive ordered by size, so the objects collected
                    # for the previous size are complete and can be emitted.
                    for entry in files.values():
                        if len(entry["matches"]) >= minmatch:
                            yield entry
                    files.clear()
                    cursize = size

                if hashvalue in files:
                    # Another filename for an object that was already looked
                    # up; its matches do not need to be queried again.
                    files[hashvalue]["filenames"].add(filename)
                    continue

                entry = dict(filenames=set((filename,)), size=size, matches={})
                files[hashvalue] = entry

                # NOTE(review): unlike cached_sharedstats and show_hash this
                # lookup is not restricted to the pairs in hash_functions --
                # confirm that this is intended.
                cur2 = self.db.cursor()
                try:
                    cur2.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = ? AND package = ?;",
                                 (cid, package2))
                    for func1, matchhash, func2, filename2 in fetchiter(cur2):
                        entry["matches"].setdefault(filename2, {})[func1, func2] = \
                                matchhash
                finally:
                    cur2.close()
        finally:
            cur.close()

        for entry in files.values():
            if len(entry["matches"]) >= minmatch:
                yield entry

    def show_detail(self, package1, package2):
        """Render the comparison page for two binary packages.

        The response body is streamed, so the comparison is computed while
        the response is being sent.

        :raises NotFound: if either package is unknown (via get_details).
        """
        details1 = details2 = self.get_details(package1)
        if package1 != package2:
            details2 = self.get_details(package2)

        shared = self.compute_comparison(package1, package2)
        params = dict(
            details1=details1,
            details2=details2,
            urlroot="../..",
            shared=shared)
        return html_response(detail_template.stream(params))

    def show_hash(self, function, hashvalue):
        """Render the page listing all files carrying a given hash value.

        Only files hashed with a function that forms a pair listed in
        hash_functions together with the requested function are shown.

        :raises NotFound: if no such file exists.
        """
        cur = self.db.cursor()
        cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM content JOIN hash ON content.id = hash.cid WHERE hash = ?;",
                    (hashvalue,))
        entries = [dict(package=package, filename=filename, size=size,
                        function=otherfunc)
                   for package, filename, size, otherfunc in fetchiter(cur)
                   if (function, otherfunc) in hash_functions]
        if not entries:
            raise NotFound()
        params = dict(function=function, hashvalue=hashvalue, entries=entries,
                      urlroot="../..")
        return html_response(hash_template.render(params))

    def show_source(self, package):
        """Render the overview page of a source package.

        For every binary package built from the source, the sharing row with
        the largest size is shown; binaries without any sharing row are
        listed with empty columns.

        :param package: name of the source package.
        :raises NotFound: if no binary package has this source.
        """
        cur = self.db.cursor()
        cur.execute("SELECT package FROM package WHERE source = ?;",
                    (package,))
        # Every binary starts out as None, meaning "no sharing known".
        binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
        if not binpkgs:
            raise NotFound()
        cur.execute("SELECT package.package, sharing.package2, sharing.func1, sharing.func2, sharing.files, sharing.size FROM package JOIN sharing ON package.package = sharing.package1 WHERE package.source = ?;",
                    (package,))
        for binary, otherbin, func1, func2, files, size in fetchiter(cur):
            entry = dict(package=otherbin,
                         funccomb=function_combination(func1, func2),
                         duplicate=files, savable=size)
            oldentry = binpkgs.get(binary)
            # Keep the entry that saves the most; on ties the first one wins.
            if not (oldentry and oldentry["savable"] >= size):
                binpkgs[binary] = entry
        params = dict(source=package, packages=binpkgs, urlroot="..")
        return html_response(source_template.render(params))
378
def main():
    """Serve the application with the wsgiref development server.

    Opens the database "test.sqlite3", additionally serves the files of the
    "static" directory next to this module below "/" and listens on all
    interfaces on port 8800 until interrupted.
    """
    database = sqlite3.connect("test.sqlite3")
    static_root = os.path.join(os.path.dirname(__file__), "static")
    wsgi_app = SharedDataMiddleware(Application(database), {"/": static_root})
    server = make_server("0.0.0.0", 8800, wsgi_app)
    server.serve_forever()

if __name__ == "__main__":
    main()