importpkg: simplify state logic
[~helmut/debian-dedup.git] / webapp.py
1 #!/usr/bin/python
2
3 import datetime
4 import os.path
5 import sqlite3
6 from wsgiref.simple_server import make_server
7
8 import jinja2
9 from werkzeug.exceptions import HTTPException, NotFound
10 from werkzeug.routing import Map, Rule, RequestRedirect
11 from werkzeug.wrappers import Request, Response
12 from werkzeug.wsgi import SharedDataMiddleware
13
14 from dedup.utils import fetchiter
15
# Accepted (func1, func2) pairs of hash function names. The query code in
# this file (cached_sharedstats, generate_shared, show_hash) skips database
# rows whose function pair is not listed here.
hash_functions = [
        ("sha512", "sha512"),
        ("image_sha512", "image_sha512"),
        ("gzip_sha512", "gzip_sha512"),
        ("sha512", "gzip_sha512"),
        ("gzip_sha512", "sha512")]

# Templates referenced by name (base.html) are looked up relative to the
# current working directory.
jinjaenv = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
24
def format_size(size):
    """Render a byte count as a short human-readable string.

    The value is converted to float and divided by 1024 for as long as it
    is at least 1024, stepping through KB, MB and GB. Values below 1024 are
    printed as a whole number of bytes; everything else with one decimal.
    Values of 1024 GB or more stay in GB.

    :param size: number of bytes; anything accepted by ``float()``.
    :returns: formatted string such as ``"512 B"`` or ``"1.5 KB"``.
    """
    value = float(size)
    chosen = "%d B"
    for larger_unit in ("%.1f KB", "%.1f MB", "%.1f GB"):
        # Stop as soon as the value no longer reaches the next unit.
        if not value >= 1024:
            break
        value /= 1024
        chosen = larger_unit
    return chosen % value
38
def function_combination(function1, function2):
    """Return a display label for a pair of hash function names.

    :returns: ``function1`` itself when both names are equal, otherwise
        the string ``"<function1> -> <function2>"``.
    """
    if function1 != function2:
        return "%s -> %s" % (function1, function2)
    return function1
43
# Workaround for jinja bug #59 (broken filesizeformat): register format_size
# under the filter name "filesizeformat" so templates use it instead.
jinjaenv.filters["filesizeformat"] = format_size

# Load base.html when the module is imported.
# NOTE(review): base_template is not referenced again in the visible part of
# this file; presumably it is loaded eagerly so that a missing base.html is
# noticed at startup -- confirm.
base_template = jinjaenv.get_template("base.html")
48
# Page for a single binary package. Context variables used by the template:
#   package, version, architecture, num_files, total_size
#   shared       - mapping from a hash function label to a list of entries,
#                  each providing "package" (falsy means the package itself),
#                  "duplicate" (file count) and "savable" (bytes)
#   dependencies - container of package names; matching rows get the
#                  "dependency" CSS class
package_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}duplication of {{ package|e }}{% endblock %}
{% block content %}<h1>{{ package|e }}</h1>
<p>Version: {{ version|e }}</p>
<p>Architecture: {{ architecture|e }}</p>
<p>Number of files: {{ num_files }}</p>
<p>Total size: {{ total_size|filesizeformat }}</p>
{%- if shared -%}
    {%- for function, sharing in shared.items() -%}
        <h3>sharing with respect to {{ function|e }}</h3>
        <table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>
        {%- for entry in sharing|sort(attribute="savable", reverse=true) -%}
            <tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}>
                {%- if entry.package %}<a href="{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a>{% else %}self{% endif %}
                <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td>
            <td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td>
            <td>{{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr>
        {%- endfor -%}
        </table>
    {%- endfor -%}
<p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
{%- endif -%}
{% endblock %}""")
73
# Page comparing two binary packages. Context variables used by the template:
#   details1, details2 - objects providing "package", "version" and
#                        "architecture" for the first and second package
#   shared             - iterable of entries providing "size", "filenames"
#                        (files in the first package) and "matches", a
#                        mapping from a filename in the second package to a
#                        mapping of (func1, func2) pairs to hash values
detail_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%}
{% block content %}
<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1>
<p>Version of {{ details1.package|e }}: {{ details1.version|e }}</p>
<p>Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}</p>
{%- if details1.package != details2.package -%}
<p>Version of {{ details2.package|e }}: {{ details2.version|e }}</p>
<p>Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}</p>
{%- endif -%}
<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr>
<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr>
{%- for entry in shared -%}
    <tr><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }}</td><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>
    {%- for filename in entry.filenames %}<span class="filename">{{ filename|e }}</span>{% endfor -%}</td><td>
    {% for filename, match in entry.matches.items() -%}
        {% if not loop.first %}<tr><td>{% endif -%}
        {%- for funccomb, hashvalue in match.items() -%}
            <a href="../../hash/{{ funccomb[0]|e }}/{{ hashvalue|e }}">{{ funccomb[0]|e }}</a>
            {%- if funccomb[0] != funccomb[1] %} -&gt; <a href="../../hash/{{ funccomb[1]|e }}/{{ hashvalue|e }}">{{ funccomb[1]|e }}</a>{% endif %}
            {%- if not loop.last %}, {% endif %}
        {%- endfor -%}
        </td><td><span class="filename">{{ filename|e }}</span></td></tr>
    {%- endfor -%}
{%- endfor -%}
</table>
{% endblock %}""")
102
# Page listing the files that carry a given hash value. Context variables
# used by the template:
#   function, hashvalue - the hash function name and value being looked up
#   entries             - iterable of entries providing "package",
#                         "filename", "size" and "function"; the function is
#                         only printed when it differs from the requested one
hash_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}information on {{ function|e }} hash {{ hashvalue|e }}{% endblock %}
{% block content %}
<h1>{{ function|e }} {{ hashvalue|e }}</h1>
<table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr>
{%- for entry in entries -%}
    <tr><td><a href="../../binary/{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a></td>
    <td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|filesizeformat }}</td>
    <td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr>
{%- endfor -%}
</table>
{% endblock %}""")
116
# Landing page. It takes no page-specific context variables; the embedded
# script turns the package name typed into the form into a /binary/<name>
# link and navigates there on submit. The form container starts hidden and is
# only revealed by the script, so it stays invisible without javascript.
index_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}Debian duplication detector{% endblock %}
{% block header %}
    <script type="text/javascript">
        function getLinkTarget() {
            var pkg = document.getElementById("pkg_name").value;
            if(pkg) {
                return "/binary/"+pkg;
            }
            return '#';
        }
        function processData() {
            var link = document.getElementById("perma_link");
            link.href = getLinkTarget();
            link.text = location.href + getLinkTarget();
        }
        window.onload = function() {
            document.getElementById('pkg_name').onkeyup = processData;
            document.getElementById("pkg_form").onsubmit = function () {
                location.href = getLinkTarget();
                return false;
            }
            processData();
            document.getElementById("form_div").style.display = '';
        }
    </script>
{% endblock %}
{% block content %}
<h1>Debian duplication detector</h1>
<ul>
<li>To inspect a particular binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
    <div style="display:none" id="form_div"><fieldset>
            <legend>Inspect package</legend>
            <noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>
            Enter binary package to inspect - Note: Non-existing packages will result in <b>404</b>-Errors
            <form id="pkg_form">
                <label for="pkg_name">Name: <input type="text" size="30" name="pkg_name" id="pkg_name"></label>
                <input type="submit" value="Go"> Permanent Link: <a id="perma_link" href="#"></a>
            </form>
    </fieldset></div></li>
<li>To inspect a combination of binary packages go to <pre>compare/&lt;firstpackage&gt;/&lt;secondpackage&gt;</pre> Example: <a href="compare/git/git">compare/git/git</a></li>
<li>To discover packages shipping a particular file go to <pre>hash/sha512/&lt;hashvalue&gt;</pre> Example: <a href="hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c">hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c</a></li>
</ul>
{% endblock %}""")
162
# Overview page for a source package. Context variables used by the template:
#   source   - name of the source package
#   packages - mapping from binary package name to either a falsy value (an
#              empty row is rendered) or an entry providing "savable" and
#              "package" (the other binary package it shares data with)
source_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}overview of {{ source|e }}{% endblock %}
{% block content %}
<h1>overview of {{ source|e }}</h1>
<table border='1'><tr><th>binary from {{ source|e }}</th><th>savable</th><th>other package</th></tr>
{% for package, sharing in packages.items() %}
    <tr><td><a href="../binary/{{ package|e }}"><span class="binary-package">{{ package|e }}</span></a></td><td>
    {%- if sharing -%}
        {{ sharing.savable|filesizeformat }}</td><td><a href="../binary/{{ sharing.package|e }}"><span class="binary-package">{{ sharing.package|e }}</span></a> <a href="../compare/{{ package|e }}/{{ sharing.package|e }}">compare</a>
    {%- else -%}</td><td>{%- endif -%}
    </td></tr>
{% endfor %}
</table>
<p>Note: Not all sharing listed here. Click on binary packages with non-zero savable to see more.</p>
{% endblock %}""")
179
def encode_and_buffer(iterator):
    """Encode text pieces as UTF-8 and regroup them into larger chunks.

    Pieces are collected until at least 2048 encoded bytes are pending; the
    pending bytes are then emitted as one chunk. Whatever is left when the
    input is exhausted is emitted as a final chunk, unless it is empty.

    :param iterator: iterable of text strings.
    :returns: generator of byte strings.
    """
    pending = []
    pending_size = 0
    for text in iterator:
        encoded = text.encode("utf8")
        pending.append(encoded)
        pending_size += len(encoded)
        if pending_size >= 2048:
            yield b"".join(pending)
            pending = []
            pending_size = 0
    if pending_size:
        yield b"".join(pending)
189
def html_response(unicode_iterator, max_age=24 * 60 * 60):
    """Build a cacheable text/html response from rendered template output.

    :param unicode_iterator: either an iterable of text pieces (such as a
        template stream) or a single, fully rendered text string.
    :param max_age: cache lifetime in seconds; used for both the
        Cache-Control max-age directive and the Expires header.
    :returns: a werkzeug ``Response`` whose body is UTF-8 encoded.
    """
    # A fully rendered page arrives as one string. Iterating it directly
    # would encode it one character at a time, so treat it as a single piece.
    if isinstance(unicode_iterator, type(u"")):
        unicode_iterator = (unicode_iterator,)
    resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
    resp.cache_control.max_age = max_age
    # Naive datetimes are serialised as UTC for the Expires header, so the
    # expiry must be computed from UTC; local time would be off by the
    # timezone offset.
    resp.expires = (datetime.datetime.utcnow() +
                    datetime.timedelta(seconds=max_age))
    return resp
195
def generate_shared(rows):
    """internal helper from show_detail

    Group consecutive rows that refer to the same pair of filenames into a
    single dict and yield one dict per group.

    :param rows: iterable of 7-tuples ``(filename1, size1, func1,
        filename2, size2, func2, hashvalue)``. Rows whose ``(func1, func2)``
        pair is not in ``hash_functions`` are ignored.
    :returns: generator of dicts with the keys ``filename1``, ``filename2``,
        ``size1``, ``size2`` and ``functions``; the latter maps each
        ``(func1, func2)`` pair seen for the group to its hash value.
    """
    current = None
    for filename1, size1, func1, filename2, size2, func2, hashvalue in rows:
        pair = (func1, func2)
        if pair in hash_functions:
            continues_group = (current is not None and
                               current["filename1"] == filename1 and
                               current["filename2"] == filename2)
            if not continues_group:
                # A new filename pair starts: hand out the finished group.
                if current is not None:
                    yield current
                current = dict(filename1=filename1, filename2=filename2,
                               size1=size1, size2=size2, functions=dict())
            current["functions"][pair] = hashvalue
    if current is not None:
        yield current
216
class Application(object):
    """WSGI application rendering the deduplication database as HTML pages.

    Requests are routed to one of five pages: the index, a binary package
    overview, a comparison of two packages, a hash lookup and a source
    package overview. All data is read through cursors obtained from the
    database connection passed to the constructor; every cursor is closed
    once its results have been consumed.
    """

    def __init__(self, db):
        """Remember the database connection and set up the URL routing.

        :param db: database connection; only its ``cursor()`` method is
            used by this class.
        """
        self.db = db
        self.routingmap = Map([
            Rule("/", methods=("GET",), endpoint="index"),
            Rule("/binary/<package>", methods=("GET",), endpoint="package"),
            Rule("/compare/<package1>/<package2>", methods=("GET",), endpoint="detail"),
            Rule("/hash/<function>/<hashvalue>", methods=("GET",), endpoint="hash"),
            Rule("/source/<package>", methods=("GET",), endpoint="source"),
        ])

    @Request.application
    def __call__(self, request):
        """Dispatch a request to the matching page handler.

        HTTP exceptions raised while routing or while building the page
        (not found, redirects, ...) are returned as the response.
        """
        mapadapter = self.routingmap.bind_to_environ(request.environ)
        try:
            endpoint, args = mapadapter.match()
            if endpoint == "package":
                return self.show_package(args["package"])
            elif endpoint == "detail":
                return self.show_detail(args["package1"], args["package2"])
            elif endpoint == "hash":
                return self.show_hash(args["function"], args["hashvalue"])
            elif endpoint == "index":
                # The index page uses relative links, which only resolve
                # correctly when the URL ends in a slash.
                if not request.environ["PATH_INFO"]:
                    raise RequestRedirect(request.environ["SCRIPT_NAME"] + "/")
                return html_response(index_template.render(dict(urlroot="")))
            elif endpoint == "source":
                return self.show_source(args["package"])
            raise NotFound()
        except HTTPException as e:
            return e

    def get_details(self, package):
        """Look up the basic facts about a binary package.

        :param package: binary package name.
        :returns: dict with the keys ``package``, ``version``,
            ``architecture``, ``num_files`` and ``total_size``.
        :raises NotFound: if the package is not in the database.
        """
        cur = self.db.cursor()
        try:
            cur.execute("SELECT version, architecture FROM package WHERE package = ?;",
                        (package,))
            row = cur.fetchone()
            if not row:
                raise NotFound()
            version, architecture = row
            cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ?;",
                        (package,))
            num_files, total_size = cur.fetchone()
        finally:
            cur.close()
        # sum() over zero rows yields NULL.
        if total_size is None:
            total_size = 0
        return dict(package=package,
                    version=version,
                    architecture=architecture,
                    num_files=num_files,
                    total_size=total_size)

    def get_dependencies(self, package):
        """Return the set of package names recorded as required by package."""
        cur = self.db.cursor()
        try:
            cur.execute("SELECT required FROM dependency WHERE package = ?;",
                        (package,))
            return set(row[0] for row in fetchiter(cur))
        finally:
            cur.close()

    def cached_sharedstats(self, package):
        """Read the precomputed sharing statistics of a package.

        :param package: binary package name.
        :returns: dict mapping a hash function label (see
            ``function_combination``) to a list of dicts with the keys
            ``package`` (``None`` when the package shares with itself),
            ``duplicate`` (number of files) and ``savable`` (size). Rows
            whose function pair is not in ``hash_functions`` are skipped.
        """
        sharedstats = {}
        cur = self.db.cursor()
        try:
            cur.execute("SELECT package2, func1, func2, files, size FROM sharing WHERE package1 = ?;",
                        (package,))
            for package2, func1, func2, files, size in fetchiter(cur):
                if (func1, func2) not in hash_functions:
                    continue
                curstats = sharedstats.setdefault(
                        function_combination(func1, func2), list())
                if package2 == package:
                    package2 = None
                curstats.append(dict(package=package2, duplicate=files,
                                     savable=size))
        finally:
            cur.close()
        return sharedstats

    def show_package(self, package):
        """Render the overview page of a binary package.

        :raises NotFound: if the package is not in the database.
        """
        params = self.get_details(package)
        params["dependencies"] = self.get_dependencies(package)
        params["shared"] = self.cached_sharedstats(package)
        params["urlroot"] = ".."
        return html_response(package_template.render(params))

    def compute_comparison(self, package1, package2):
        """Compute a sequence of comparison objects ordered by the size of the
        object in the first package. Each element of the sequence is a dict
        defining the following keys:
         * filenames: A set of filenames in package1 all referring to the
           same object.
         * size: Size of the object in bytes.
         * matches: A mapping from filenames in package2 to a mapping from
           hash function pairs to hash values.

        This is a generator. The database cursors it uses are closed when
        it finishes and also when it is abandoned or fails part way.
        """
        # When a package is compared with itself every file trivially
        # matches itself, so at least two matches are needed to be of
        # interest.
        minmatch = 2 if package1 == package2 else 1
        cursize = -1
        # Objects of the size currently being processed, keyed by sha512.
        files = dict()
        cur = self.db.cursor()
        try:
            cur.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE package = ? AND function = 'sha512' ORDER BY size DESC;",
                        (package1,))
            for cid, filename, size, hashvalue in fetchiter(cur):
                if cursize != size:
                    # Rows arrive ordered by size, so everything collected
                    # for the previous size is complete now.
                    for entry in files.values():
                        if len(entry["matches"]) >= minmatch:
                            yield entry
                    files.clear()
                    cursize = size

                if hashvalue in files:
                    # Another name for an object already looked up.
                    files[hashvalue]["filenames"].add(filename)
                    continue

                entry = dict(filenames=set((filename,)), size=size, matches={})
                files[hashvalue] = entry

                cur2 = self.db.cursor()
                try:
                    cur2.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = ? AND package = ?;",
                                 (cid, package2))
                    for func1, matchhash, func2, filename2 in fetchiter(cur2):
                        entry["matches"].setdefault(filename2, {})[func1, func2] = \
                                matchhash
                finally:
                    cur2.close()
        finally:
            cur.close()

        for entry in files.values():
            if len(entry["matches"]) >= minmatch:
                yield entry

    def show_detail(self, package1, package2):
        """Render the page comparing two binary packages.

        The comparison itself is computed lazily while the response body is
        streamed.

        :raises NotFound: if either package is not in the database.
        """
        details1 = details2 = self.get_details(package1)
        if package1 != package2:
            details2 = self.get_details(package2)

        shared = self.compute_comparison(package1, package2)
        params = dict(
            details1=details1,
            details2=details2,
            urlroot="../..",
            shared=shared)
        return html_response(detail_template.stream(params))

    def show_hash(self, function, hashvalue):
        """Render the page listing all files carrying a given hash value.

        Only files whose hash function forms an accepted pair (see
        ``hash_functions``) with the requested function are listed.

        :raises NotFound: if no such file exists.
        """
        cur = self.db.cursor()
        try:
            cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM content JOIN hash ON content.id = hash.cid WHERE hash = ?;",
                        (hashvalue,))
            entries = [dict(package=package, filename=filename, size=size,
                            function=otherfunc)
                       for package, filename, size, otherfunc in fetchiter(cur)
                       if (function, otherfunc) in hash_functions]
        finally:
            cur.close()
        if not entries:
            raise NotFound()
        params = dict(function=function, hashvalue=hashvalue, entries=entries,
                      urlroot="../..")
        return html_response(hash_template.render(params))

    def show_source(self, package):
        """Render the overview page of a source package.

        For every binary package built from the source, the sharing entry
        with the largest savable size is shown, if there is any.

        :param package: source package name.
        :raises NotFound: if no binary package belongs to that source.
        """
        cur = self.db.cursor()
        try:
            cur.execute("SELECT package FROM package WHERE source = ?;",
                        (package,))
            # Every binary starts out without a sharing entry (None).
            binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
            if not binpkgs:
                raise NotFound()
            cur.execute("SELECT package.package, sharing.package2, sharing.func1, sharing.func2, sharing.files, sharing.size FROM package JOIN sharing ON package.package = sharing.package1 WHERE package.source = ?;",
                        (package,))
            for binary, otherbin, func1, func2, files, size in fetchiter(cur):
                entry = dict(package=otherbin,
                             funccomb=function_combination(func1, func2),
                             duplicate=files, savable=size)
                oldentry = binpkgs.get(binary)
                # Keep the existing entry unless the new one saves more.
                if not (oldentry and oldentry["savable"] >= size):
                    binpkgs[binary] = entry
        finally:
            cur.close()
        params = dict(source=package, packages=binpkgs, urlroot="..")
        return html_response(source_template.render(params))
385
def main(database="test.sqlite3", host="0.0.0.0", port=8800):
    """Serve the web application with the wsgiref development server.

    Files from the ``static`` directory next to this module are served
    alongside the application. The call blocks until the server stops.

    :param database: path of the sqlite3 database file to read.
    :param host: address to listen on.
    :param port: TCP port to listen on.
    """
    db = sqlite3.connect(database)
    try:
        app = Application(db)
        staticdir = os.path.join(os.path.dirname(__file__), "static")
        app = SharedDataMiddleware(app, {"/": staticdir})
        make_server(host, port, app).serve_forever()
    finally:
        # Reached when the server is interrupted or fails to start.
        db.close()

if __name__ == "__main__":
    main()