9e231285c59f027724dd22d27a58a73f11ce4784
[~helmut/debian-dedup.git] / webapp.py
1 #!/usr/bin/python
2
3 import datetime
4 import os.path
5 import sqlite3
6 from wsgiref.simple_server import make_server
7
8 import jinja2
9 from werkzeug.exceptions import HTTPException, NotFound
10 from werkzeug.routing import Map, Rule, RequestRedirect
11 from werkzeug.wrappers import Request, Response
12 from werkzeug.wsgi import SharedDataMiddleware
13
14 from dedup.utils import fetchiter
15
# Pairs of (hash function of the first file, hash function of the second
# file) that are accepted when looking for shared content.  Combinations not
# listed here are skipped by the code consulting this list.
hash_functions = [
    ("sha512", "sha512"),
    ("image_sha512", "image_sha512"),
    ("gzip_sha512", "gzip_sha512"),
    ("sha512", "gzip_sha512"),
    ("gzip_sha512", "sha512"),
]
22
# Shared jinja2 environment.  Templates referenced by name (such as
# "base.html") are looked up relative to the current working directory.
jinjaenv = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
24
def format_size(size):
    """Render a byte count as a short human-readable string.

    Values below 1024 are printed as whole bytes ("12 B").  Larger values
    are divided by 1024 once per unit step and printed with one decimal
    place in KB, MB or GB.  GB is the largest unit, so bigger values simply
    stay in GB.
    """
    value = float(size)
    if not value >= 1024:
        return "%d B" % value
    unit = "KB"
    value /= 1024
    for bigger in ("MB", "GB"):
        if not value >= 1024:
            break
        value /= 1024
        unit = bigger
    return "%.1f %s" % (value, unit)
38
def function_combination(function1, function2):
    """Return a display label for a pair of hash function names.

    Differing names are joined as "first -> second"; a pair of equal names
    collapses to that single name.
    """
    if function1 != function2:
        return "%s -> %s" % (function1, function2)
    return function1
43
# Workaround for jinja bug #59 (broken filesizeformat): register format_size
# under the name "filesizeformat" so templates using that filter get our
# implementation.
jinjaenv.filters["filesizeformat"] = format_size

# Load base.html at import time.  The templates defined in this module extend
# "base.html" by name; this object itself is not referenced in this file.
base_template = jinjaenv.get_template("base.html")
48
# Page for a single binary package (rendered by Application.show_package).
# Variables used: package, version, architecture, num_files, total_size,
# dependencies (container of package names) and shared, a mapping from a
# hash function label to a list of entries with the attributes package
# (false value means the package itself), duplicate and savable.
package_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}duplication of {{ package|e }}{% endblock %}
{% block content %}<h1>{{ package|e }}</h1>
<p>Version: {{ version|e }}</p>
<p>Architecture: {{ architecture|e }}</p>
<p>Number of files: {{ num_files }}</p>
<p>Total size: {{ total_size|filesizeformat }}</p>
{%- if shared -%}
    {%- for function, sharing in shared.items() -%}
        <h3>sharing with respect to {{ function|e }}</h3>
        <table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>
        {%- for entry in sharing|sort(attribute="savable", reverse=true) -%}
            <tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}>
                {%- if entry.package %}<a href="{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a>{% else %}self{% endif %}
                <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td>
            <td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td>
            <td>{{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr>
        {%- endfor -%}
        </table>
    {%- endfor -%}
<p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
{%- endif -%}
{% endblock %}""")
73
# Comparison page for two binary packages (rendered by
# Application.show_detail).  Variables used: details1 and details2 (each with
# package, version and architecture) and shared, an iterable of entries with
# size, filenames (names in the first package) and matches (mapping from a
# filename in the second package to a mapping from hash function pairs to
# hash values).  The version/architecture of the second package is only shown
# when the two package names differ.
detail_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%}
{% block content %}
<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1>
<p>Version of {{ details1.package|e }}: {{ details1.version|e }}</p>
<p>Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}</p>
{%- if details1.package != details2.package -%}
<p>Version of {{ details2.package|e }}: {{ details2.version|e }}</p>
<p>Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}</p>
{%- endif -%}
<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr>
<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr>
{%- for entry in shared -%}
    <tr><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }}</td><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>
    {%- for filename in entry.filenames %}<span class="filename">{{ filename|e }}</span>{% endfor -%}</td><td>
    {% for filename, match in entry.matches.items() -%}
        {% if not loop.first %}<tr><td>{% endif -%}
        {%- for funccomb, hashvalue in match.items() -%}
            <a href="../../hash/{{ funccomb[0]|e }}/{{ hashvalue|e }}">{{ funccomb[0]|e }}</a>
            {%- if funccomb[0] != funccomb[1] %} -&gt; <a href="../../hash/{{ funccomb[1]|e }}/{{ hashvalue|e }}">{{ funccomb[1]|e }}</a>{% endif %}
            {%- if not loop.last %}, {% endif %}
        {%- endfor -%}
        </td><td><span class="filename">{{ filename|e }}</span></td></tr>
    {%- endfor -%}
{%- endfor -%}
</table>
{% endblock %}""")
102
# Page listing the files carrying a given hash value (rendered by
# Application.show_hash).  Variables used: function, hashvalue and entries,
# an iterable of entries with package, filename, size and function.  The last
# column is only filled when an entry's function differs from the requested
# one.
hash_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}information on {{ function|e }} hash {{ hashvalue|e }}{% endblock %}
{% block content %}
<h1>{{ function|e }} {{ hashvalue|e }}</h1>
<table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr>
{%- for entry in entries -%}
    <tr><td><a href="../../binary/{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a></td>
    <td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|filesizeformat }}</td>
    <td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr>
{%- endfor -%}
</table>
{% endblock %}""")
116
# Landing page (rendered for the "index" endpoint).  It explains the URL
# layout and offers a small JavaScript-driven form that jumps to the page of
# a binary package.  The form container starts hidden and is revealed by the
# script, so it only appears when JavaScript is running.
#
# All links, including the one built by getLinkTarget(), are relative to the
# index page so that the application keeps working when it is mounted below
# a path prefix (the index is always served with a trailing slash).
index_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}Debian duplication detector{% endblock %}
{% block header %}
    <script type="text/javascript">
        function getLinkTarget() {
            var pkg = document.getElementById("pkg_name").value;
            if(pkg) {
                return "binary/"+pkg;
            }
            return '#';
        }
        function processData() {
            var link = document.getElementById("perma_link");
            link.href = getLinkTarget();
            link.text = location.href + getLinkTarget();
        }
        window.onload = function() {
            document.getElementById('pkg_name').onkeyup = processData;
            document.getElementById("pkg_form").onsubmit = function () {
                location.href = getLinkTarget();
                return false;
            }
            processData();
            document.getElementById("form_div").style.display = '';
        }
    </script>
{% endblock %}
{% block content %}
<h1>Debian duplication detector</h1>
<ul>
<li>To inspect a particular binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
    <div style="display:none" id="form_div"><fieldset>
            <legend>Inspect package</legend>
            <noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>
            Enter binary package to inspect - Note: Non-existing packages will result in <b>404</b>-Errors
            <form id="pkg_form">
                <label for="pkg_name">Name: <input type="text" size="30" name="pkg_name" id="pkg_name"></label>
                <input type="submit" value="Go"> Permanent Link: <a id="perma_link" href="#"></a>
            </form>
    </fieldset></div></li>
<li>To inspect a combination of binary packages go to <pre>compare/&lt;firstpackage&gt;/&lt;secondpackage&gt;</pre> Example: <a href="compare/git/git">compare/git/git</a></li>
<li>To discover package shipping a particular file go to <pre>hash/sha512/&lt;hashvalue&gt;</pre> Example: <a href="hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c">hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c</a></li>
</ul>
{% endblock %}""")
162
# Overview page for a source package (rendered by Application.show_source).
# Variables used: source and packages, a mapping from binary package name to
# either a false value (no sharing shown) or an entry with savable and
# package (the other binary package it shares with).
source_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}overview of {{ source|e }}{% endblock %}
{% block content %}
<h1>overview of {{ source|e }}</h1>
<table border='1'><tr><th>binary from {{ source|e }}</th><th>savable</th><th>other package</th></tr>
{% for package, sharing in packages.items() %}
    <tr><td><a href="../binary/{{ package|e }}"><span class="binary-package">{{ package|e }}</span></a></td><td>
    {%- if sharing -%}
        {{ sharing.savable|filesizeformat }}</td><td><a href="../binary/{{ sharing.package|e }}"><span class="binary-package">{{ sharing.package|e }}</span></a> <a href="../compare/{{ package|e }}/{{ sharing.package|e }}">compare</a>
    {%- else -%}</td><td>{%- endif -%}
    </td></tr>
{% endfor %}
</table>
<p>Note: Not all sharing listed here. Click on binary packages with non-zero savable to see more.</p>
{% endblock %}""")
179
def encode_and_buffer(iterator):
    """Encode text pieces as UTF-8 and regroup them into larger chunks.

    Pieces from ``iterator`` are encoded and collected until at least 2048
    bytes are pending; the collected bytes are then yielded as one chunk.
    Whatever is left at the end is yielded as a final chunk, unless it is
    empty.
    """
    pending = []
    pending_size = 0
    for text in iterator:
        encoded = text.encode("utf8")
        pending.append(encoded)
        pending_size += len(encoded)
        if pending_size >= 2048:
            yield b"".join(pending)
            pending = []
            pending_size = 0
    if pending_size:
        yield b"".join(pending)
189
def html_response(unicode_iterator, max_age=24 * 60 * 60):
    """Build a cacheable text/html response from an iterable of text.

    The text pieces are encoded and buffered by encode_and_buffer.  Both
    Cache-Control max-age and Expires are set from ``max_age``.

    :param unicode_iterator: iterable yielding the text pieces of the page
    :param max_age: cache lifetime in seconds (default: one day)
    :returns: the werkzeug Response object
    """
    resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
    resp.cache_control.max_age = max_age
    # Naive datetimes are serialised as GMT in the Expires header, so the
    # expiry has to be computed from UTC; local time would shift it by the
    # server's UTC offset.
    resp.expires = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds=max_age)
    return resp
195
def generate_shared(rows):
    """Group consecutive rows describing the same pair of files.

    Each row is a 7-tuple (filename1, size1, func1, filename2, size2, func2,
    hashvalue).  Rows whose (func1, func2) pair is not listed in
    hash_functions are ignored.  Consecutive remaining rows with equal
    filename1 and filename2 are merged into one dict with the keys
    filename1, filename2, size1, size2 and functions, the latter mapping
    each (func1, func2) pair to its hash value.  The dicts are yielded in
    row order.
    """
    current = None
    for filename1, size1, func1, filename2, size2, func2, hashvalue in rows:
        pair = (func1, func2)
        if pair not in hash_functions:
            continue
        if current is not None:
            same_files = (current["filename1"] == filename1 and
                          current["filename2"] == filename2)
            if not same_files:
                # A new pair of files starts: emit the finished group.
                yield current
                current = None
        if current is None:
            current = {
                "filename1": filename1,
                "filename2": filename2,
                "size1": size1,
                "size2": size2,
                "functions": {},
            }
        current["functions"][pair] = hashvalue
    if current is not None:
        yield current
216
class Application(object):
    """WSGI application presenting duplication data stored in a database.

    Requests are matched against a werkzeug routing map and answered by the
    show_* methods (or the index template), each producing an HTML response.
    """

    def __init__(self, db):
        """Remember the database connection and build the routing map.

        :param db: database connection providing ``cursor()``; main() passes
            a sqlite3 connection
        """
        self.db = db
        self.routingmap = Map([
            Rule("/", methods=("GET",), endpoint="index"),
            Rule("/binary/<package>", methods=("GET",), endpoint="package"),
            Rule("/compare/<package1>/<package2>", methods=("GET",), endpoint="detail"),
            Rule("/hash/<function>/<hashvalue>", methods=("GET",), endpoint="hash"),
            Rule("/source/<package>", methods=("GET",), endpoint="source"),
        ])

    @Request.application
    def __call__(self, request):
        """Route the request and return the response of the matched handler.

        Any HTTPException raised during routing or handling (NotFound,
        redirects, ...) is returned as the response.
        """
        mapadapter = self.routingmap.bind_to_environ(request.environ)
        try:
            endpoint, args = mapadapter.match()
            if endpoint == "package":
                return self.show_package(args["package"])
            elif endpoint == "detail":
                return self.show_detail(args["package1"], args["package2"])
            elif endpoint == "hash":
                return self.show_hash(args["function"], args["hashvalue"])
            elif endpoint == "index":
                # The index page uses relative links, so it must be served
                # with a trailing slash.
                if not request.environ["PATH_INFO"]:
                    raise RequestRedirect(request.environ["SCRIPT_NAME"] + "/")
                return html_response(index_template.render(dict(urlroot="")))
            elif endpoint == "source":
                return self.show_source(args["package"])
            raise NotFound()
        except HTTPException as e:
            return e

    def get_details(self, package):
        """Look up a binary package by name.

        :returns: dict with the keys pid, package, version, architecture,
            num_files and total_size (0 when the package has no content rows)
        :raises NotFound: if no package with that name exists
        """
        cur = self.db.cursor()
        cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
                    (package,))
        row = cur.fetchone()
        if not row:
            raise NotFound()
        pid, version, architecture = row
        details = dict(pid=pid,
                       package=package,
                       version=version,
                       architecture=architecture)
        cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
                    (pid,))
        num_files, total_size = cur.fetchone()
        # sum() over zero rows yields NULL.
        if total_size is None:
            total_size = 0
        details.update(dict(num_files=num_files, total_size=total_size))
        return details

    def get_dependencies(self, pid):
        """Return the set of ``required`` values recorded for package pid."""
        cur = self.db.cursor()
        cur.execute("SELECT required FROM dependency WHERE pid = ?;",
                    (pid,))
        return set(row[0] for row in fetchiter(cur))

    def cached_sharedstats(self, pid):
        """Read the sharing statistics of package pid from the sharing table.

        :returns: dict mapping a function combination label (see
            function_combination) to a list of dicts with the keys package,
            duplicate and savable.  ``package`` is None for sharing of the
            package with itself.  Function pairs not listed in hash_functions
            are left out.
        """
        cur = self.db.cursor()
        sharedstats = {}
        cur.execute("SELECT pid2, package.name, func1, func2, files, size FROM sharing JOIN package ON sharing.pid2 = package.id WHERE pid1 = ?;",
                    (pid,))
        for pid2, package2, func1, func2, files, size in fetchiter(cur):
            if (func1, func2) not in hash_functions:
                continue
            curstats = sharedstats.setdefault(
                    function_combination(func1, func2), list())
            if pid2 == pid:
                package2 = None
            curstats.append(dict(package=package2, duplicate=files, savable=size))
        return sharedstats

    def show_package(self, package):
        """Render the page of a binary package.

        :raises NotFound: if the package does not exist
        """
        params = self.get_details(package)
        params["dependencies"] = self.get_dependencies(params["pid"])
        params["shared"] = self.cached_sharedstats(params["pid"])
        params["urlroot"] = ".."
        return html_response(package_template.render(params))

    def compute_comparison(self, pid1, pid2):
        """Compute a sequence of comparison objects ordered by the size of the
        object in the first package (largest first). Each element of the
        sequence is a dict defining the following keys:
         * filenames: A set of filenames in package 1 (pid1) all referring to
           the same object.
         * size: Size of the object in bytes.
         * matches: A mapping from filenames in package 2 (pid2) to a mapping
           from hash function pairs to hash values.

        Objects with fewer than one matching file in package 2 are dropped;
        when a package is compared with itself, at least two matching files
        are required.

        This is a generator; the database cursors it uses are closed when it
        finishes or is closed early.
        """
        minmatch = 2 if pid1 == pid2 else 1
        files = dict()
        cur = self.db.cursor()
        try:
            cur.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE pid = ? AND function = 'sha512' ORDER BY size DESC;",
                        (pid1,))
            cursize = -1
            for cid, filename, size, hashvalue in fetchiter(cur):
                if cursize != size:
                    # Rows arrive ordered by size, so everything collected for
                    # the previous size is complete and can be emitted.
                    for entry in files.values():
                        if len(entry["matches"]) >= minmatch:
                            yield entry
                    files.clear()
                    cursize = size

                if hashvalue in files:
                    # Same object under another name in package 1; its
                    # matches were already looked up.
                    files[hashvalue]["filenames"].add(filename)
                    continue

                entry = dict(filenames=set((filename,)), size=size, matches={})
                files[hashvalue] = entry

                cur2 = self.db.cursor()
                try:
                    cur2.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = ? AND pid = ?;",
                                 (cid, pid2))
                    # Distinct names keep the outer loop variables intact.
                    for func1, matchhash, func2, matchname in fetchiter(cur2):
                        entry["matches"].setdefault(matchname, {})[func1, func2] = \
                                matchhash
                finally:
                    cur2.close()
        finally:
            cur.close()

        for entry in files.values():
            if len(entry["matches"]) >= minmatch:
                yield entry

    def show_detail(self, package1, package2):
        """Render the comparison page for two binary packages.

        The page is streamed, so the comparison is computed lazily while the
        response is being sent.

        :raises NotFound: if either package does not exist
        """
        details1 = details2 = self.get_details(package1)
        if package1 != package2:
            details2 = self.get_details(package2)

        shared = self.compute_comparison(details1["pid"], details2["pid"])
        params = dict(
            details1=details1,
            details2=details2,
            urlroot="../..",
            shared=shared)
        return html_response(detail_template.stream(params))

    def show_hash(self, function, hashvalue):
        """Render the list of files having the given hash value.

        Only files whose hash function forms a pair from hash_functions with
        the requested ``function`` are listed.

        :raises NotFound: if no such file exists
        """
        cur = self.db.cursor()
        cur.execute("SELECT package.name, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id WHERE hash = ?;",
                    (hashvalue,))
        entries = [dict(package=package, filename=filename, size=size,
                        function=otherfunc)
                   for package, filename, size, otherfunc in fetchiter(cur)
                   if (function, otherfunc) in hash_functions]
        if not entries:
            raise NotFound()
        params = dict(function=function, hashvalue=hashvalue, entries=entries,
                      urlroot="../..")
        return html_response(hash_template.render(params))

    def show_source(self, package):
        """Render the overview of the binary packages of a source package.

        For every binary package the sharing row with the largest size is
        kept (the first one seen wins on ties); binaries without any sharing
        row map to None.

        :raises NotFound: if no binary package belongs to that source
        """
        cur = self.db.cursor()
        cur.execute("SELECT name FROM package WHERE source = ?;",
                    (package,))
        binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
        if not binpkgs:
            raise NotFound()
        cur.execute("SELECT p1.name, p2.name, sharing.func1, sharing.func2, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id WHERE p1.source = ?;",
                    (package,))
        for binary, otherbin, func1, func2, files, size in fetchiter(cur):
            entry = dict(package=otherbin,
                         funccomb=function_combination(func1, func2),
                         duplicate=files, savable=size)
            oldentry = binpkgs.get(binary)
            if not (oldentry and oldentry["savable"] >= size):
                binpkgs[binary] = entry
        params = dict(source=package, packages=binpkgs, urlroot="..")
        return html_response(source_template.render(params))
386
def main():
    """Serve the application on port 8800 of all interfaces, forever.

    The database is read from "test.sqlite3" in the current directory and
    files from the "static" directory next to this script are served by a
    SharedDataMiddleware wrapped around the application.
    """
    database = sqlite3.connect("test.sqlite3")
    static_root = os.path.join(os.path.dirname(__file__), "static")
    wsgi_app = SharedDataMiddleware(Application(database), {"/": static_root})
    server = make_server("0.0.0.0", 8800, wsgi_app)
    server.serve_forever()

if __name__ == "__main__":
    main()