Merge branch functionid
[~helmut/debian-dedup.git] / webapp.py
1 #!/usr/bin/python
2
3 import datetime
4 import os.path
5 import sqlite3
6 from wsgiref.simple_server import make_server
7
8 import jinja2
9 from werkzeug.exceptions import HTTPException, NotFound
10 from werkzeug.routing import Map, Rule, RequestRedirect
11 from werkzeug.wrappers import Request, Response
12 from werkzeug.wsgi import SharedDataMiddleware
13
14 from dedup.utils import fetchiter
15
# Pairs of hash function names (first, second) that are considered when
# presenting sharing.  cached_sharedstats skips sharing rows whose function
# pair is not listed here, and show_hash only lists entries whose
# (requested function, stored function) pair is listed here.
hash_functions = [
        ("sha512", "sha512"),
        ("image_sha512", "image_sha512"),
        ("gzip_sha512", "gzip_sha512"),
        ("sha512", "gzip_sha512"),
        ("gzip_sha512", "sha512")]

# Template environment; named templates such as "base.html" are looked up
# relative to the current working directory.
jinjaenv = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
24
def format_size(size):
    """Render a byte count as a short human readable string.

    The value is converted to float and divided by 1024 for each unit step
    (KB, MB, GB) while it is at least 1024.  Plain bytes are shown as an
    integer, larger units with one decimal place.  GB is the largest unit.
    """
    value = float(size)
    template = "%d B"
    for larger_unit in ("%.1f KB", "%.1f MB", "%.1f GB"):
        if not value >= 1024:
            break
        value /= 1024
        template = larger_unit
    return template % value
38
def function_combination(function1, function2):
    """Return a display name for a pair of hash function names.

    A pair of two different functions is rendered as "first -> second";
    if both names are equal, that single name is returned unchanged.
    """
    if function1 != function2:
        return "%s -> %s" % (function1, function2)
    return function1
43
# Workaround for jinja bug #59 (broken filesizeformat): replace the builtin
# filter with our own format_size so templates can keep using
# "|filesizeformat".
jinjaenv.filters["filesizeformat"] = format_size

# Load base.html, which all templates below extend by name.
# NOTE(review): base_template itself is not referenced in the visible code;
# presumably loading it here makes a missing base.html fail at import time
# -- confirm before removing.
base_template = jinjaenv.get_template("base.html")
48
# Page for a single binary package.  Context variables read by the template:
#  * package, version, architecture, num_files, total_size
#  * shared: mapping from function combination name to a list of entries
#    providing package (falsy for the package itself), duplicate and savable
#  * dependencies: container of package names, tested with "in"
#  * issues: mapping from filename to issue text
package_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}duplication of {{ package|e }}{% endblock %}
{% block content %}<h1>{{ package|e }}</h1>
<p>Version: {{ version|e }}</p>
<p>Architecture: {{ architecture|e }}</p>
<p>Number of files: {{ num_files }}</p>
<p>Total size: {{ total_size|filesizeformat }}</p>
{%- if shared -%}
    {%- for function, sharing in shared.items() -%}
        <h3>sharing with respect to {{ function|e }}</h3>
        <table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>
        {%- for entry in sharing|sort(attribute="savable", reverse=true) -%}
            <tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}>
                {%- if entry.package %}<a href="{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a>{% else %}self{% endif %}
                <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td>
            <td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td>
            <td>{{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr>
        {%- endfor -%}
        </table>
    {%- endfor -%}
<p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
{%- endif -%}
{%- if issues -%}
    <h3>issues with particular files</h3>
    <table border='1'><tr><th>filename</th><th>issue</th></tr>
    {%- for filename, issue in issues|dictsort(true) -%}
        <tr><td><span class="filename">{{ filename|e }}</span></td><td>{{ issue|e }}</td></tr>
    {%- endfor -%}
    </table>
{%- endif -%}
{% endblock %}""")
81
# Page comparing two binary packages.  Context variables read by the
# template:
#  * details1, details2: objects providing package, version and architecture
#  * shared: iterable of entries providing size, filenames (filenames in the
#    first package) and matches, a mapping from filename in the second
#    package to a mapping from (function1, function2) pairs to hash values
detail_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%}
{% block content %}
<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1>
<p>Version of {{ details1.package|e }}: {{ details1.version|e }}</p>
<p>Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}</p>
{%- if details1.package != details2.package -%}
<p>Version of {{ details2.package|e }}: {{ details2.version|e }}</p>
<p>Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}</p>
{%- endif -%}
<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr>
<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr>
{%- for entry in shared -%}
    <tr><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }}</td><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>
    {%- for filename in entry.filenames %}<span class="filename">{{ filename|e }}</span>{% endfor -%}</td><td>
    {% for filename, match in entry.matches.items() -%}
        {% if not loop.first %}<tr><td>{% endif -%}
        {%- for funccomb, hashvalue in match.items() -%}
            <a href="../../hash/{{ funccomb[0]|e }}/{{ hashvalue|e }}">{{ funccomb[0]|e }}</a>
            {%- if funccomb[0] != funccomb[1] %} -&gt; <a href="../../hash/{{ funccomb[1]|e }}/{{ hashvalue|e }}">{{ funccomb[1]|e }}</a>{% endif %}
            {%- if not loop.last %}, {% endif %}
        {%- endfor -%}
        </td><td><span class="filename">{{ filename|e }}</span></td></tr>
    {%- endfor -%}
{%- endfor -%}
</table>
{% endblock %}""")
110
# Page listing the files matching a hash value.  Context variables read by
# the template:
#  * function, hashvalue: the requested hash function name and hash value
#  * entries: iterable of entries providing package, filename, size and
#    function; the function is only printed when it differs from the
#    requested one
hash_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}information on {{ function|e }} hash {{ hashvalue|e }}{% endblock %}
{% block content %}
<h1>{{ function|e }} {{ hashvalue|e }}</h1>
<table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr>
{%- for entry in entries -%}
    <tr><td><a href="../../binary/{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a></td>
    <td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|filesizeformat }}</td>
    <td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr>
{%- endfor -%}
</table>
{% endblock %}""")
124
# Start page.  It reads no context variables itself.  The embedded script
# turns the package name entered into the form into a link to the package
# page.  The link target is relative ("binary/<name>") like every other link
# on this page, so that it keeps working when the application is mounted
# below a path prefix, and the entered name is URL-encoded.
index_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}Debian duplication detector{% endblock %}
{% block header %}
    <script type="text/javascript">
        function getLinkTarget() {
            var pkg = document.getElementById("pkg_name").value;
            if(pkg) {
                return "binary/"+encodeURIComponent(pkg);
            }
            return '#';
        }
        function processData() {
            var link = document.getElementById("perma_link");
            link.href = getLinkTarget();
            link.text = link.href;
        }
        window.onload = function() {
            document.getElementById('pkg_name').onkeyup = processData;
            document.getElementById("pkg_form").onsubmit = function () {
                location.href = getLinkTarget();
                return false;
            }
            processData();
            document.getElementById("form_div").style.display = '';
        }
    </script>
{% endblock %}
{% block content %}
<h1>Debian duplication detector</h1>
<ul>
<li>To inspect a particular binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
    <div style="display:none" id="form_div"><fieldset>
            <legend>Inspect package</legend>
            <noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>
            Enter binary package to inspect - Note: Non-existing packages will result in <b>404</b>-Errors
            <form id="pkg_form">
                <label for="pkg_name">Name: </label><input type="text" size="30" name="pkg_name" id="pkg_name">
                <input type="submit" value="Go"> Permanent Link: <a id="perma_link" href="#"></a>
            </form>
    </fieldset></div></li>
<li>To inspect a combination of binary packages go to <pre>compare/&lt;firstpackage&gt;/&lt;secondpackage&gt;</pre> Example: <a href="compare/git/git">compare/git/git</a></li>
<li>To discover package shipping a particular file go to <pre>hash/sha512/&lt;hashvalue&gt;</pre> Example: <a href="hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c">hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c</a></li>
</ul>
{% endblock %}""")
170
# Overview page for a source package.  Context variables read by the
# template:
#  * source: name of the source package
#  * packages: mapping from binary package name to either a falsy value (no
#    sharing shown) or an entry providing savable and package
source_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}overview of {{ source|e }}{% endblock %}
{% block content %}
<h1>overview of {{ source|e }}</h1>
<table border='1'><tr><th>binary from {{ source|e }}</th><th>savable</th><th>other package</th></tr>
{% for package, sharing in packages.items() %}
    <tr><td><a href="../binary/{{ package|e }}"><span class="binary-package">{{ package|e }}</span></a></td><td>
    {%- if sharing -%}
        {{ sharing.savable|filesizeformat }}</td><td><a href="../binary/{{ sharing.package|e }}"><span class="binary-package">{{ sharing.package|e }}</span></a> <a href="../compare/{{ package|e }}/{{ sharing.package|e }}">compare</a>
    {%- else -%}</td><td>{%- endif -%}
    </td></tr>
{% endfor %}
</table>
<p>Note: Not all sharing listed here. Click on binary packages with non-zero savable to see more.</p>
{% endblock %}""")
187
def encode_and_buffer(iterator):
    """Encode text pieces as UTF-8 and regroup them into larger chunks.

    Each element of iterator is encoded and collected.  As soon as at least
    2048 bytes have been collected they are yielded as one bytes object.
    Whatever remains at the end is yielded as a final chunk; nothing is
    yielded for empty output.
    """
    pending = []
    pending_size = 0
    for text in iterator:
        encoded = text.encode("utf8")
        pending.append(encoded)
        pending_size += len(encoded)
        if pending_size >= 2048:
            yield b"".join(pending)
            pending = []
            pending_size = 0
    if pending_size:
        yield b"".join(pending)
197
def html_response(unicode_iterator, max_age=24 * 60 * 60):
    """Build a cacheable text/html response from an iterable of text pieces.

    :param unicode_iterator: iterable of text; it is UTF-8 encoded and
        chunked by encode_and_buffer.
    :param max_age: cache lifetime in seconds, used for both the
        Cache-Control max-age and the Expires header.
    :returns: the werkzeug Response.
    """
    resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
    resp.cache_control.max_age = max_age
    # werkzeug interprets naive datetimes as UTC when formatting the Expires
    # header, so the expiry must be computed from UTC rather than from local
    # time.
    resp.expires = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds=max_age)
    return resp
203
class Application(object):
    """WSGI application rendering duplication reports from a database.

    Requests are matched against a werkzeug routing map and dispatched to the
    show_* methods, each of which queries self.db and renders one of the
    module level templates.
    """

    def __init__(self, db):
        """Store the database connection and build the URL routing map.

        :param db: database connection providing cursor(); main() passes a
            sqlite3 connection.
        """
        self.db = db
        self.routingmap = Map([
            Rule("/", methods=("GET",), endpoint="index"),
            Rule("/binary/<package>", methods=("GET",), endpoint="package"),
            Rule("/compare/<package1>/<package2>", methods=("GET",), endpoint="detail"),
            Rule("/hash/<function>/<hashvalue>", methods=("GET",), endpoint="hash"),
            Rule("/source/<package>", methods=("GET",), endpoint="source"),
        ])

    @Request.application
    def __call__(self, request):
        """Dispatch a request to the handler of the matched endpoint.

        Any HTTPException raised while matching or handling the request
        (such as NotFound) is returned as the response.
        """
        mapadapter = self.routingmap.bind_to_environ(request.environ)
        try:
            endpoint, args = mapadapter.match()
            if endpoint == "package":
                return self.show_package(args["package"])
            elif endpoint == "detail":
                return self.show_detail(args["package1"], args["package2"])
            elif endpoint == "hash":
                return self.show_hash(args["function"], args["hashvalue"])
            elif endpoint == "index":
                # An empty PATH_INFO means the root was requested without a
                # trailing slash.  The index page uses relative links, so
                # redirect to the slash-terminated URL first.
                if not request.environ["PATH_INFO"]:
                    raise RequestRedirect(request.environ["SCRIPT_NAME"] + "/")
                return html_response(index_template.render(dict(urlroot="")))
            elif endpoint == "source":
                return self.show_source(args["package"])
            raise NotFound()
        except HTTPException as e:
            return e

    def get_details(self, package):
        """Look up basic facts about a binary package.

        :param package: name of the binary package.
        :returns: dict with the keys pid, package, version, architecture,
            num_files and total_size (0 if the package has no content rows).
        :raises NotFound: if no package of that name exists.
        """
        cur = self.db.cursor()
        try:
            cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
                        (package,))
            row = cur.fetchone()
            if not row:
                raise NotFound()
            pid, version, architecture = row
            cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
                        (pid,))
            num_files, total_size = cur.fetchone()
        finally:
            cur.close()
        # sum() over zero rows yields NULL.
        if total_size is None:
            total_size = 0
        return dict(pid=pid,
                    package=package,
                    version=version,
                    architecture=architecture,
                    num_files=num_files,
                    total_size=total_size)

    def get_dependencies(self, pid):
        """Return the set of values in dependency.required for package pid."""
        cur = self.db.cursor()
        try:
            cur.execute("SELECT required FROM dependency WHERE pid = ?;",
                        (pid,))
            return set(row[0] for row in fetchiter(cur))
        finally:
            cur.close()

    def cached_sharedstats(self, pid):
        """Read the precomputed sharing statistics of a package.

        :param pid: id of the package.
        :returns: dict mapping a function combination name to a list of
            dicts with the keys package (None for sharing with the package
            itself), duplicate (number of files) and savable (size).  Rows
            whose function pair is not in hash_functions are skipped.
        """
        sharedstats = {}
        cur = self.db.cursor()
        try:
            cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ?;",
                        (pid,))
            for pid2, package2, func1, func2, files, size in fetchiter(cur):
                if (func1, func2) not in hash_functions:
                    continue
                curstats = sharedstats.setdefault(
                        function_combination(func1, func2), list())
                if pid2 == pid:
                    package2 = None
                curstats.append(dict(package=package2, duplicate=files, savable=size))
        finally:
            cur.close()
        return sharedstats

    def show_package(self, package):
        """Render the page for a single binary package.

        :raises NotFound: if the package does not exist (via get_details).
        """
        params = self.get_details(package)
        params["dependencies"] = self.get_dependencies(params["pid"])
        params["shared"] = self.cached_sharedstats(params["pid"])
        params["urlroot"] = ".."
        cur = self.db.cursor()
        try:
            cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;",
                        (params["pid"],))
            params["issues"] = dict(cur.fetchall())
        finally:
            cur.close()
        return html_response(package_template.render(params))

    def compute_comparison(self, pid1, pid2):
        """Compute a sequence of comparison objects ordered by the size of the
        object in the first package. Each element of the sequence is a dict
        defining the following keys:
         * filenames: A set of filenames in package 1 (pid1) all referring to
           the same object.
         * size: Size of the object in bytes.
         * matches: A mapping from filenames in package 2 (pid2) to a mapping
           from hash function pairs to hash values.

        This is a generator. The cursors it uses are closed when it finishes,
        fails or is closed early.
        """
        cursize = -1
        files = dict()
        # When comparing a package with itself every file matches itself, so
        # only entries with at least one further match are of interest.
        minmatch = 2 if pid1 == pid2 else 1
        cur = self.db.cursor()
        try:
            cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;",
                        (pid1,))
            for cid, filename, size, hashvalue in fetchiter(cur):
                # Rows arrive ordered by size, so a change of size means that
                # all entries collected so far are complete.
                if cursize != size:
                    for entry in files.values():
                        if len(entry["matches"]) >= minmatch:
                            yield entry
                    files.clear()
                    cursize = size

                # Another filename for an object that was already looked up.
                if hashvalue in files:
                    files[hashvalue]["filenames"].add(filename)
                    continue

                entry = dict(filenames=set((filename,)), size=size, matches={})
                files[hashvalue] = entry

                cur2 = self.db.cursor()
                try:
                    cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;",
                                 (cid, pid2))
                    for func1, matchhash, func2, matchname in fetchiter(cur2):
                        entry["matches"].setdefault(matchname, {})[func1, func2] = \
                                matchhash
                finally:
                    cur2.close()
        finally:
            cur.close()

        # Flush the entries of the last (smallest) size.
        for entry in files.values():
            if len(entry["matches"]) >= minmatch:
                yield entry

    def show_detail(self, package1, package2):
        """Render the comparison page for two binary packages.

        The comparison is passed to the template as a generator and the
        template is streamed.

        :raises NotFound: if either package does not exist (via get_details).
        """
        details1 = details2 = self.get_details(package1)
        if package1 != package2:
            details2 = self.get_details(package2)

        shared = self.compute_comparison(details1["pid"], details2["pid"])
        params = dict(
            details1=details1,
            details2=details2,
            urlroot="../..",
            shared=shared)
        return html_response(detail_template.stream(params))

    def show_hash(self, function, hashvalue):
        """Render the page listing all files that have a given hash value.

        Only entries whose (function, stored function) pair is listed in
        hash_functions are shown.

        :raises NotFound: if no such entry exists.
        """
        cur = self.db.cursor()
        try:
            cur.execute("SELECT package.name, content.filename, content.size, function.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE hash = ?;",
                        (hashvalue,))
            entries = [dict(package=package, filename=filename, size=size,
                            function=otherfunc)
                       for package, filename, size, otherfunc in fetchiter(cur)
                       if (function, otherfunc) in hash_functions]
        finally:
            cur.close()
        if not entries:
            raise NotFound()
        params = dict(function=function, hashvalue=hashvalue, entries=entries,
                      urlroot="../..")
        return html_response(hash_template.render(params))

    def show_source(self, package):
        """Render the overview page for a source package.

        For every binary package built from the source, the sharing row with
        the largest size is shown; binaries without any sharing row map to
        None.

        :param package: name of the source package.
        :raises NotFound: if no binary package has this source.
        """
        cur = self.db.cursor()
        try:
            cur.execute("SELECT name FROM package WHERE source = ?;",
                        (package,))
            binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
            if not binpkgs:
                raise NotFound()
            cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;",
                        (package,))
            for binary, otherbin, func1, func2, files, size in fetchiter(cur):
                entry = dict(package=otherbin,
                             funccomb=function_combination(func1, func2),
                             duplicate=files, savable=size)
                oldentry = binpkgs.get(binary)
                # Keep the existing entry only if it saves at least as much.
                if not (oldentry and oldentry["savable"] >= size):
                    binpkgs[binary] = entry
        finally:
            cur.close()
        params = dict(source=package, packages=binpkgs, urlroot="..")
        return html_response(source_template.render(params))
378
def main():
    """Serve the application with the wsgiref development server.

    The database is read from test.sqlite3, files from the "static"
    directory next to this file are exposed at "/", and the server listens
    on 0.0.0.0 port 8800 until interrupted.
    """
    application = Application(sqlite3.connect("test.sqlite3"))
    static_path = os.path.join(os.path.dirname(__file__), "static")
    wrapped = SharedDataMiddleware(application, {"/": static_path})
    server = make_server("0.0.0.0", 8800, wrapped)
    server.serve_forever()
384
# Start the development server only when run as a script, not on import.
if __name__ == "__main__":
    main()