webapp: turn the <br> after filename into a style
[~helmut/debian-dedup.git] / webapp.py
1 #!/usr/bin/python
2
3 import datetime
4 import os.path
5 import sqlite3
6 from wsgiref.simple_server import make_server
7
8 import jinja2
9 from werkzeug.exceptions import HTTPException, NotFound
10 from werkzeug.routing import Map, Rule, RequestRedirect
11 from werkzeug.wrappers import Request, Response
12 from werkzeug.wsgi import SharedDataMiddleware
13
14 from dedup.utils import fetchiter
15
# Accepted (function of first file, function of second file) pairs.  Rows
# whose function pair is not listed here are ignored when sharing between
# packages is evaluated in this module.
hash_functions = [
    ("sha512", "sha512"),
    ("image_sha512", "image_sha512"),
    ("gzip_sha512", "gzip_sha512"),
    ("sha512", "gzip_sha512"),
    ("gzip_sha512", "sha512"),
]
22
# Shared Jinja2 environment.  Templates referenced by name (such as
# "base.html") are looked up relative to the current working directory.
jinjaenv = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
24
def format_size(size):
    """Render a byte count as a short human readable string.

    Values below 1024 are shown as whole bytes ("%d B").  Larger values are
    scaled by 1024 per step and shown with one decimal in KB, MB or GB.  GB
    is the largest unit, so bigger values are still reported in GB.
    """
    value = float(size)
    template = "%d B"
    for unit in ("KB", "MB", "GB"):
        # Stop scaling as soon as the value fits the current unit.
        if not value >= 1024:
            break
        value /= 1024
        template = "%.1f " + unit
    return template % value
38
def function_combination(function1, function2):
    """Return a display name for a pair of hash function names.

    Differing names are joined as "first -> second"; if both names are
    equal, that single name is returned unchanged.
    """
    if function1 != function2:
        return "%s -> %s" % (function1, function2)
    return function1
43
# Make format_size usable inside templates as the "format_size" filter.
jinjaenv.filters["format_size"] = format_size

# NOTE(review): base_template is not referenced anywhere else in this file;
# presumably it is loaded eagerly so that a missing base.html is noticed at
# import time -- confirm before removing.
base_template = jinjaenv.get_template("base.html")
47
# Page for a single binary package.  Expects the variables package, version,
# architecture, num_files, total_size, dependencies and shared.  shared maps
# a function combination name to a list of entries with the attributes
# package (empty meaning the package itself), duplicate and savable; each
# list is shown as a table sorted by descending savable.  Cells of entries
# that are the package itself or contained in dependencies get the CSS class
# "dependency".
package_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}duplication of {{ package|e }}{% endblock %}
{% block content %}<h1>{{ package|e }}</h1>
<p>Version: {{ version|e }}</p>
<p>Architecture: {{ architecture|e }}</p>
<p>Number of files: {{ num_files }}</p>
<p>Total size: {{ total_size|format_size }}</p>
{%- if shared -%}
    {%- for function, sharing in shared.items() -%}
        <h3>sharing with respect to {{ function|e }}</h3>
        <table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>
        {%- for entry in sharing|sort(attribute="savable", reverse=true) -%}
            <tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}>
                {%- if entry.package %}<a href="{{ entry.package|e }}">{{ entry.package|e }}</a>{% else %}self{% endif %}
                <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td>
            <td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td>
            <td>{{ entry.savable|format_size }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr>
        {%- endfor -%}
        </table>
    {%- endfor -%}
<p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
{%- endif -%}
{% endblock %}""")
72
# Comparison page for two binary packages.  Expects details1 and details2
# (each providing a package attribute) and shared, an iterable of entries
# with size, filenames (names in the first package) and matches (mapping a
# filename of the second package to a mapping from hash function pairs to
# hash values).  One table row is emitted per matching filename; the size
# and first-package filename cells span all rows of an entry via rowspan.
detail_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%}
{% block content %}
<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1>
<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr>
<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr>
{%- for entry in shared -%}
    <tr><td rowspan={{ entry.matches|length }}>{{ entry.size|format_size }}</td><td rowspan={{ entry.matches|length }}>
    {%- for filename in entry.filenames %}<span class="filename">{{ filename|e }}</span>{% endfor -%}</td><td>
    {% for filename, match in entry.matches.items() -%}
        {% if not loop.first %}<tr><td>{% endif -%}
        {%- for funccomb, hashvalue in match.items() -%}
            <a href="../../hash/{{ funccomb[0]|e }}/{{ hashvalue|e }}">{{ funccomb[0]|e }}</a>
            {%- if funccomb[0] != funccomb[1] %} -&gt; <a href="../../hash/{{ funccomb[1]|e }}/{{ hashvalue|e }}">{{ funccomb[1]|e }}</a>{% endif %}
            {%- if not loop.last %}, {% endif %}
        {%- endfor -%}
        </td><td><span class="filename">{{ filename|e }}</span></td></tr>
    {%- endfor -%}
{%- endfor -%}
</table>
{% endblock %}""")
95
# Page listing all files that carry a given hash value.  Expects function,
# hashvalue and entries; each entry provides package, filename, size and
# function.  The last column names the entry's function only when it
# differs from the requested one.
hash_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}information on {{ function|e }} hash {{ hashvalue|e }}{% endblock %}
{% block content %}
<h1>{{ function|e }} {{ hashvalue|e }}</h1>
<table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr>
{%- for entry in entries -%}
    <tr><td><a href="../../binary/{{ entry.package|e }}">{{ entry.package|e }}</a></td>
    <td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|format_size }}</td>
    <td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr>
{%- endfor -%}
</table>
{% endblock %}""")
109
# Landing page.  Besides static usage hints it contains a small form that
# is revealed by javascript and navigates to the page of the entered binary
# package.
#
# The link target is built relative to the index page ("binary/<name>"),
# like the static example links further down, so that it keeps working when
# the application is not mounted at the server root.  The permanent link
# text is taken from the anchor's resolved href, which yields the absolute
# URL without gluing two path fragments together.
index_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}Debian duplication detector{% endblock %}
{% block header %}
    <script type="text/javascript">
        function getLinkTarget() {
            var pkg = document.getElementById("pkg_name").value;
            if(pkg) {
                return "binary/"+pkg;
            }
            return '#';
        }
        function processData() {
            var link = document.getElementById("perma_link");
            link.href = getLinkTarget();
            link.text = link.href;
        }
        window.onload = function() {
            document.getElementById('pkg_name').onkeyup = processData;
            document.getElementById("pkg_form").onsubmit = function () {
                location.href = getLinkTarget();
                return false;
            }
            processData();
            document.getElementById("form_div").style.display = '';
        }
    </script>
{% endblock %}
{% block content %}
<h1>Debian duplication detector</h1>
<ul>
<li>To inspect a particlar binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
    <div style="display:none" id="form_div"><fieldset>
            <legend>Inspect package</legend>
            <noscript><b>This form is disfunctional when javascript is not enabled</b></noscript>
            Enter binary package to inspect - Note: Non-existing packages will result in <b>404</b>-Errors
            <form id="pkg_form">
                <label for="pkg_name">Name: <input type="text" size="30" name="pkg_name" id="pkg_name"></label>
                <input type="submit" value="Go"> Permanent Link: <a id="perma_link" href="#"></a>
            </form>
    </fieldset></div></li>
<li>To inspect a combination of binary packages go to <pre>compare/&lt;firstpackage&gt;/&lt;secondpackage&gt;</pre> Example: <a href="compare/git/git">compare/git/git</a></li>
<li>To discover package shipping a particular file go to <pre>hash/sha512/&lt;hashvalue&gt;</pre> Example: <a href="hash/sha512/ed94df7781793f06f9426a600c1bde86397afc7b35cb3aa11b60214bd31e35ad893b53a04a2cf4676154982d7c204c4aa165d6ccdaac0170031364a05dbab3bc">hash/sha512/ed94df7781793f06f9426a600c1bde86397afc7b35cb3aa11b60214bd31e35ad893b53a04a2cf4676154982d7c204c4aa165d6ccdaac0170031364a05dbab3bc</a></li>
</ul>
{% endblock %}""")
155
# Overview page for a source package.  Expects source and packages, a
# mapping from each binary package name to either a false value (nothing to
# show) or an entry providing savable and package (the other binary package
# involved).  Each row links to the binary's own page and, when an entry is
# present, to the other package and to the comparison of both.
source_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}overview of {{ source|e }}{% endblock %}
{% block content %}
<h1>overview of {{ source|e }}</h1>
<table border='1'><tr><th>binary from {{ source|e }}</th><th>savable</th><th>other package</th></tr>
{% for package, sharing in packages.items() %}
    <tr><td><a href="../binary/{{ package|e }}">{{ package|e }}</a></td><td>
    {%- if sharing -%}
        {{ sharing.savable|format_size }}</td><td><a href="../binary/{{ sharing.package|e }}">{{ sharing.package|e }}</a> <a href="../compare/{{ package|e }}/{{ sharing.package|e }}">compare</a>
    {%- else -%}</td><td>{%- endif -%}
    </td></tr>
{% endfor %}
</table>
<p>Note: Not all sharing listed here. Click on binary packages with non-zero savable to see more.</p>
{% endblock %}""")
172
def encode_and_buffer(iterator):
    """Encode text pieces as UTF-8 and regroup them into larger chunks.

    Every element of iterator is encoded with utf8.  The encoded pieces are
    collected until at least 2048 bytes are pending, at which point they are
    yielded as one byte string.  Whatever is left over at the end is yielded
    as a final, possibly shorter chunk; nothing is yielded for empty output.
    """
    pending = []
    pending_size = 0
    for text in iterator:
        encoded = text.encode("utf8")
        pending.append(encoded)
        pending_size += len(encoded)
        if pending_size >= 2048:
            yield b"".join(pending)
            pending = []
            pending_size = 0
    if pending_size:
        yield b"".join(pending)
182
def html_response(unicode_iterator, max_age=24 * 60 * 60):
    """Build a cacheable text/html response from an iterable of text.

    unicode_iterator -- iterable yielding text pieces; they are encoded and
        regrouped by encode_and_buffer before being sent.
    max_age -- cache lifetime in seconds (one day by default).  It is used
        for both the Cache-Control max-age directive and the Expires header.

    Returns the werkzeug Response object.
    """
    resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
    resp.cache_control.max_age = max_age
    # HTTP dates are expressed in GMT and werkzeug interprets naive datetime
    # objects as UTC, so the expiry has to be computed from the current UTC
    # time rather than from local time.
    resp.expires = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds=max_age)
    return resp
188
def generate_shared(rows):
    """Group consecutive rows describing the same pair of files.

    rows yields tuples (filename1, size1, func1, filename2, size2, func2,
    hashvalue).  Rows whose (func1, func2) pair is not listed in
    hash_functions are skipped.  Adjacent remaining rows with equal
    filename1 and filename2 are merged into one dict with the keys
    filename1, filename2, size1, size2 and functions, where functions maps
    the (func1, func2) pair to the hash value.  The dicts are yielded in
    input order.
    """
    current = None
    for filename1, size1, func1, filename2, size2, func2, hashvalue in rows:
        combination = (func1, func2)
        if combination not in hash_functions:
            continue
        if current is not None:
            previous_pair = (current["filename1"], current["filename2"])
            if previous_pair != (filename1, filename2):
                # A different pair of files starts: emit the finished group.
                yield current
                current = None
        if current is None:
            current = dict(filename1=filename1, filename2=filename2,
                           size1=size1, size2=size2, functions=dict())
        current["functions"][combination] = hashvalue
    if current is not None:
        yield current
209
class Application(object):
    """WSGI application presenting the duplication data of a database.

    The pages served are the index, one page per binary package, a
    comparison of two binary packages, a listing of files for a hash value
    and an overview per source package.
    """

    def __init__(self, db):
        """Remember the database connection and set up URL routing.

        db -- database connection object; only its cursor() method is used.
        """
        self.db = db
        self.routingmap = Map([
            Rule("/", methods=("GET",), endpoint="index"),
            Rule("/binary/<package>", methods=("GET",), endpoint="package"),
            Rule("/compare/<package1>/<package2>", methods=("GET",), endpoint="detail"),
            Rule("/hash/<function>/<hashvalue>", methods=("GET",), endpoint="hash"),
            Rule("/source/<package>", methods=("GET",), endpoint="source"),
        ])

    @Request.application
    def __call__(self, request):
        """Route the request to the matching page handler.

        HTTP exceptions raised while matching or handling (for instance
        NotFound or a redirect) are returned as the response.
        """
        mapadapter = self.routingmap.bind_to_environ(request.environ)
        try:
            endpoint, args = mapadapter.match()
            if endpoint == "package":
                return self.show_package(args["package"])
            elif endpoint == "detail":
                return self.show_detail(args["package1"], args["package2"])
            elif endpoint == "hash":
                return self.show_hash(args["function"], args["hashvalue"])
            elif endpoint == "index":
                # An empty PATH_INFO means the trailing slash is missing;
                # redirect so that relative links on the index page resolve.
                if not request.environ["PATH_INFO"]:
                    raise RequestRedirect(request.environ["SCRIPT_NAME"] + "/")
                return html_response(index_template.render(dict(urlroot="")))
            elif endpoint == "source":
                return self.show_source(args["package"])
            raise NotFound()
        except HTTPException as e:
            return e

    def get_details(self, package):
        """Return basic facts about a binary package as a dict.

        The dict has the keys package, version, architecture, num_files and
        total_size.  total_size is 0 for a package without content rows.

        Raises NotFound if the package is not in the package table.
        """
        cur = self.db.cursor()
        try:
            cur.execute("SELECT version, architecture FROM package WHERE package = ?;",
                        (package,))
            row = cur.fetchone()
            if not row:
                raise NotFound()
            version, architecture = row
            cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ?;",
                        (package,))
            num_files, total_size = cur.fetchone()
        finally:
            cur.close()
        # SQL sum() over zero rows is NULL, which arrives here as None and
        # could not be formatted as a size later on.
        if total_size is None:
            total_size = 0
        return dict(package=package,
                    version=version,
                    architecture=architecture,
                    num_files=num_files,
                    total_size=total_size)

    def get_dependencies(self, package):
        """Return the set of package names recorded as required by package."""
        cur = self.db.cursor()
        try:
            cur.execute("SELECT required FROM dependency WHERE package = ?;",
                        (package,))
            return set(row[0] for row in fetchiter(cur))
        finally:
            cur.close()

    def cached_sharedstats(self, package):
        """Read the sharing statistics of package from the sharing table.

        Returns a dict mapping a function combination name (see
        function_combination) to a list of dicts with the keys package,
        duplicate and savable.  The package value is None for sharing of the
        package with itself.  Function pairs not listed in hash_functions
        are ignored.
        """
        sharedstats = {}
        cur = self.db.cursor()
        try:
            cur.execute("SELECT package2, func1, func2, files, size FROM sharing WHERE package1 = ?;",
                        (package,))
            for package2, func1, func2, files, size in fetchiter(cur):
                if (func1, func2) not in hash_functions:
                    continue
                curstats = sharedstats.setdefault(
                        function_combination(func1, func2), list())
                if package2 == package:
                    package2 = None
                curstats.append(dict(package=package2, duplicate=files,
                                     savable=size))
        finally:
            cur.close()
        return sharedstats

    def show_package(self, package):
        """Render the page of a binary package.

        Raises NotFound (via get_details) for unknown packages.
        """
        params = self.get_details(package)
        params["dependencies"] = self.get_dependencies(package)
        params["shared"] = self.cached_sharedstats(package)
        params["urlroot"] = ".."
        return html_response(package_template.render(params))

    def compute_comparison(self, package1, package2):
        """Compute a sequence of comparison objects ordered by the size of
        the object in the first package. Each element of the sequence is a
        dict defining the following keys:
         * filenames: A set of filenames in package1 all referring to the
           same object.
         * size: Size of the object in bytes.
         * matches: A mapping from filenames in package2 to a mapping from
           hash function pairs to hash values.

        This is a generator. The database cursors it uses are closed when
        it is exhausted, closed or fails.
        """
        # NOTE(review): presumably two matches are demanded for a self
        # comparison because every file trivially matches itself -- confirm.
        minmatch = 2 if package1 == package2 else 1
        # Entries of the size currently being processed, keyed by sha512.
        files = dict()
        cursize = -1
        cur = self.db.cursor()
        try:
            cur.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE package = ? AND function = 'sha512' ORDER BY size DESC;",
                        (package1,))
            for cid, filename, size, hashvalue in fetchiter(cur):
                if cursize != size:
                    # Rows arrive ordered by size, so all entries of the
                    # previous size are complete and can be emitted.
                    for entry in files.values():
                        if len(entry["matches"]) >= minmatch:
                            yield entry
                    files.clear()
                    cursize = size

                if hashvalue in files:
                    # Same content under another name: the matches were
                    # already looked up for the first name.
                    files[hashvalue]["filenames"].add(filename)
                    continue

                entry = dict(filenames=set((filename,)), size=size, matches={})
                files[hashvalue] = entry

                cur2 = self.db.cursor()
                try:
                    cur2.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = ? AND package = ?;",
                                 (cid, package2))
                    for func1, matchhash, func2, otherfile in fetchiter(cur2):
                        entry["matches"].setdefault(otherfile, {})[func1, func2] = \
                                matchhash
                finally:
                    cur2.close()
        finally:
            cur.close()

        for entry in files.values():
            if len(entry["matches"]) >= minmatch:
                yield entry

    def show_detail(self, package1, package2):
        """Render the comparison page of two binary packages.

        The page is streamed, so the comparison is computed while the
        response is being sent.  Raises NotFound (via get_details) if either
        package is unknown.
        """
        details1 = details2 = self.get_details(package1)
        if package1 != package2:
            details2 = self.get_details(package2)

        shared = self.compute_comparison(package1, package2)
        params = dict(
            details1=details1,
            details2=details2,
            urlroot="../..",
            shared=shared)
        return html_response(detail_template.stream(params))

    def show_hash(self, function, hashvalue):
        """Render the page listing the files that have the given hash value.

        Only files whose own hash function forms a pair listed in
        hash_functions together with the requested function are included.
        Raises NotFound if no file remains.
        """
        cur = self.db.cursor()
        try:
            cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM content JOIN hash ON content.id = hash.cid WHERE hash = ?;",
                        (hashvalue,))
            entries = [dict(package=package, filename=filename, size=size,
                            function=otherfunc)
                       for package, filename, size, otherfunc in fetchiter(cur)
                       if (function, otherfunc) in hash_functions]
        finally:
            cur.close()
        if not entries:
            raise NotFound()
        params = dict(function=function, hashvalue=hashvalue, entries=entries,
                      urlroot="../..")
        return html_response(hash_template.render(params))

    def show_source(self, package):
        """Render the overview page of a source package.

        For every binary package built from the source, the sharing row with
        the largest size is shown; binaries without any sharing row are
        listed without details.  Raises NotFound if no binary package
        belongs to the source.
        """
        cur = self.db.cursor()
        try:
            cur.execute("SELECT package FROM package WHERE source = ?;",
                        (package,))
            # Every binary starts out with None, meaning no sharing known.
            binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
            if not binpkgs:
                raise NotFound()
            cur.execute("SELECT package.package, sharing.package2, sharing.func1, sharing.func2, sharing.files, sharing.size FROM package JOIN sharing ON package.package = sharing.package1 WHERE package.source = ?;",
                        (package,))
            for binary, otherbin, func1, func2, files, size in fetchiter(cur):
                entry = dict(package=otherbin,
                             funccomb=function_combination(func1, func2),
                             duplicate=files, savable=size)
                oldentry = binpkgs.get(binary)
                # Keep the entry saving the most; earlier rows win ties.
                if not (oldentry and oldentry["savable"] >= size):
                    binpkgs[binary] = entry
        finally:
            cur.close()
        params = dict(source=package, packages=binpkgs, urlroot="..")
        return html_response(source_template.render(params))
376
def main():
    """Serve the application on port 8800 of all interfaces.

    The data is read from the sqlite3 database "test.sqlite3" in the current
    directory.  Files from the "static" directory next to this module are
    served below "/" by a SharedDataMiddleware wrapped around the
    application.  This call blocks forever.
    """
    application = Application(sqlite3.connect("test.sqlite3"))
    static_path = os.path.join(os.path.dirname(__file__), "static")
    wrapped = SharedDataMiddleware(application, {"/": static_path})
    server = make_server("0.0.0.0", 8800, wrapped)
    server.serve_forever()


if __name__ == "__main__":
    main()