README: foo.PNG is also a valid png name
[~helmut/debian-dedup.git] / webapp.py
1 #!/usr/bin/python
2
3 import datetime
4 import os.path
5 import sqlite3
6 from wsgiref.simple_server import make_server
7
8 import jinja2
9 from werkzeug.exceptions import HTTPException, NotFound
10 from werkzeug.routing import Map, Rule, RequestRedirect
11 from werkzeug.wrappers import Request, Response
12 from werkzeug.wsgi import SharedDataMiddleware
13
14 from dedup.utils import fetchiter
15
# Pairs of hash function names (func1, func2) that this application reports.
# Rows of the sharing table and hash lookups whose function pair is not listed
# here are skipped (see cached_sharedstats and show_hash).
hash_functions = [
        ("sha512", "sha512"),
        ("image_sha512", "image_sha512"),
        ("gzip_sha512", "gzip_sha512"),
        ("sha512", "gzip_sha512"),
        ("gzip_sha512", "sha512")]

# Shared jinja2 environment; templates such as base.html are looked up in ".",
# which is presumably resolved against the process working directory.
jinjaenv = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
24
def format_size(size):
    """Render a byte count as a short human readable string.

    The value is converted to float and divided by 1024 at most three times,
    so the result is expressed in B, KB, MB or GB. Plain bytes are printed as
    an integer, scaled values with one decimal place. Values beyond the GB
    range stay expressed in GB.
    """
    value = float(size)
    template = "%d B"
    for unit in ("KB", "MB", "GB"):
        # Stop scaling as soon as the value no longer reaches the next unit.
        if not value >= 1024:
            break
        value /= 1024
        template = "%.1f " + unit
    return template % value
38
def function_combination(function1, function2):
    """Return a display label for a pair of hash function names.

    Equal names collapse to the single name; differing names are rendered as
    "function1 -> function2".
    """
    return (function1 if function1 == function2
            else "%s -> %s" % (function1, function2))
43
# Workaround for jinja bug #59 (broken filesizeformat): override the
# "filesizeformat" filter used by the templates below with format_size.
jinjaenv.filters["filesizeformat"] = format_size

# Load base.html, the layout every page template below extends.
# NOTE(review): base_template itself is not referenced in the visible code;
# loading it here presumably just fails early when base.html is missing.
base_template = jinjaenv.get_template("base.html")
48
# Page for a single binary package (rendered by Application.show_package).
# Context variables read by the template: package, version, architecture,
# num_files, total_size, dependencies and shared. shared maps a hash function
# label to a list of entries carrying package, duplicate and savable; an entry
# whose package is empty is shown as "self".
package_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}duplication of {{ package|e }}{% endblock %}
{% block content %}<h1>{{ package|e }}</h1>
<p>Version: {{ version|e }}</p>
<p>Architecture: {{ architecture|e }}</p>
<p>Number of files: {{ num_files }}</p>
<p>Total size: {{ total_size|filesizeformat }}</p>
{%- if shared -%}
    {%- for function, sharing in shared.items() -%}
        <h3>sharing with respect to {{ function|e }}</h3>
        <table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>
        {%- for entry in sharing|sort(attribute="savable", reverse=true) -%}
            <tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}>
                {%- if entry.package %}<a href="{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a>{% else %}self{% endif %}
                <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td>
            <td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td>
            <td>{{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr>
        {%- endfor -%}
        </table>
    {%- endfor -%}
<p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
{%- endif -%}
{% endblock %}""")
73
# Page comparing two binary packages (streamed by Application.show_detail).
# Context variables read by the template: details1 and details2 (each with
# package, version and architecture) and shared, an iterable of entries
# carrying size, filenames and matches. matches maps a filename of the second
# package to a mapping from (function1, function2) pairs to hash values.
detail_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%}
{% block content %}
<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1>
<p>Version of {{ details1.package|e }}: {{ details1.version|e }}</p>
<p>Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}</p>
{%- if details1.package != details2.package -%}
<p>Version of {{ details2.package|e }}: {{ details2.version|e }}</p>
<p>Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}</p>
{%- endif -%}
<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr>
<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr>
{%- for entry in shared -%}
    <tr><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }}</td><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>
    {%- for filename in entry.filenames %}<span class="filename">{{ filename|e }}</span>{% endfor -%}</td><td>
    {% for filename, match in entry.matches.items() -%}
        {% if not loop.first %}<tr><td>{% endif -%}
        {%- for funccomb, hashvalue in match.items() -%}
            <a href="../../hash/{{ funccomb[0]|e }}/{{ hashvalue|e }}">{{ funccomb[0]|e }}</a>
            {%- if funccomb[0] != funccomb[1] %} -&gt; <a href="../../hash/{{ funccomb[1]|e }}/{{ hashvalue|e }}">{{ funccomb[1]|e }}</a>{% endif %}
            {%- if not loop.last %}, {% endif %}
        {%- endfor -%}
        </td><td><span class="filename">{{ filename|e }}</span></td></tr>
    {%- endfor -%}
{%- endfor -%}
</table>
{% endblock %}""")
102
# Page listing the files that carry a given hash value (rendered by
# Application.show_hash). Context variables read by the template: function,
# hashvalue and entries, where each entry carries package, filename, size and
# function. The last column is only filled in when the entry's function
# differs from the requested one.
hash_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}information on {{ function|e }} hash {{ hashvalue|e }}{% endblock %}
{% block content %}
<h1>{{ function|e }} {{ hashvalue|e }}</h1>
<table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr>
{%- for entry in entries -%}
    <tr><td><a href="../../binary/{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a></td>
    <td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|filesizeformat }}</td>
    <td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr>
{%- endfor -%}
</table>
{% endblock %}""")
116
# Landing page (rendered for the "index" endpoint). It takes no context
# variables of its own. The embedded script turns the package form into a
# client-side redirect and keeps the "Permanent Link" anchor in sync with the
# text field. The link target is relative ("binary/<name>"), like the static
# example links below, so that it also works when the application is mounted
# below a path prefix; the index page is always served with a trailing slash,
# hence prepending location.href yields the absolute URL for display.
index_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}Debian duplication detector{% endblock %}
{% block header %}
    <script type="text/javascript">
        function getLinkTarget() {
            var pkg = document.getElementById("pkg_name").value;
            if(pkg) {
                return "binary/"+pkg;
            }
            return '#';
        }
        function processData() {
            var link = document.getElementById("perma_link");
            link.href = getLinkTarget();
            link.text = location.href + getLinkTarget();
        }
        window.onload = function() {
            document.getElementById('pkg_name').onkeyup = processData;
            document.getElementById("pkg_form").onsubmit = function () {
                location.href = getLinkTarget();
                return false;
            }
            processData();
            document.getElementById("form_div").style.display = '';
        }
    </script>
{% endblock %}
{% block content %}
<h1>Debian duplication detector</h1>
<ul>
<li>To inspect a particular binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
    <div style="display:none" id="form_div"><fieldset>
            <legend>Inspect package</legend>
            <noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>
            Enter binary package to inspect - Note: Non-existing packages will result in <b>404</b>-Errors
            <form id="pkg_form">
                <label for="pkg_name">Name: </label><input type="text" size="30" name="pkg_name" id="pkg_name">
                <input type="submit" value="Go"> Permanent Link: <a id="perma_link" href="#"></a>
            </form>
    </fieldset></div></li>
<li>To inspect a combination of binary packages go to <pre>compare/&lt;firstpackage&gt;/&lt;secondpackage&gt;</pre> Example: <a href="compare/git/git">compare/git/git</a></li>
<li>To discover package shipping a particular file go to <pre>hash/sha512/&lt;hashvalue&gt;</pre> Example: <a href="hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c">hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c</a></li>
</ul>
{% endblock %}""")
162
# Overview page for a source package (rendered by Application.show_source).
# Context variables read by the template: source and packages. packages maps
# each binary package name to either a false value (empty cells are emitted)
# or a sharing entry carrying savable and package.
source_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}overview of {{ source|e }}{% endblock %}
{% block content %}
<h1>overview of {{ source|e }}</h1>
<table border='1'><tr><th>binary from {{ source|e }}</th><th>savable</th><th>other package</th></tr>
{% for package, sharing in packages.items() %}
    <tr><td><a href="../binary/{{ package|e }}"><span class="binary-package">{{ package|e }}</span></a></td><td>
    {%- if sharing -%}
        {{ sharing.savable|filesizeformat }}</td><td><a href="../binary/{{ sharing.package|e }}"><span class="binary-package">{{ sharing.package|e }}</span></a> <a href="../compare/{{ package|e }}/{{ sharing.package|e }}">compare</a>
    {%- else -%}</td><td>{%- endif -%}
    </td></tr>
{% endfor %}
</table>
<p>Note: Not all sharing listed here. Click on binary packages with non-zero savable to see more.</p>
{% endblock %}""")
179
def encode_and_buffer(iterator):
    """Encode text pieces as UTF-8 and regroup them into larger chunks.

    Pieces from iterator are encoded one by one and collected. Whenever the
    collected data reaches at least 2048 bytes it is yielded as a single
    bytes object. Remaining data, if any, is yielded once the iterator is
    exhausted.
    """
    pending = []
    pending_size = 0
    for text in iterator:
        data = text.encode("utf8")
        pending.append(data)
        pending_size += len(data)
        if pending_size >= 2048:
            yield b"".join(pending)
            pending = []
            pending_size = 0
    if pending_size:
        yield b"".join(pending)
189
def html_response(unicode_iterator, max_age=24 * 60 * 60):
    """Build a cacheable text/html response from an iterable of text pieces.

    unicode_iterator yields the page content as text; it is encoded and
    buffered by encode_and_buffer. max_age is the cache lifetime in seconds
    and is used for both the Cache-Control max-age and the Expires header.
    """
    resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
    resp.cache_control.max_age = max_age
    # werkzeug interprets naive datetime objects as UTC when formatting HTTP
    # dates, so the expiry must be computed from the UTC clock. Using local
    # time would shift Expires by the server's UTC offset.
    resp.expires = (datetime.datetime.utcnow() +
                    datetime.timedelta(seconds=max_age))
    return resp
195
class Application(object):
    """WSGI application rendering the deduplication database as HTML pages.

    db is a DB-API connection (main() passes a sqlite3 connection) that is
    queried through cursors for every request.
    """

    def __init__(self, db):
        """Store the database connection and set up the URL routing map."""
        self.db = db
        self.routingmap = Map([
            Rule("/", methods=("GET",), endpoint="index"),
            Rule("/binary/<package>", methods=("GET",), endpoint="package"),
            Rule("/compare/<package1>/<package2>", methods=("GET",), endpoint="detail"),
            Rule("/hash/<function>/<hashvalue>", methods=("GET",), endpoint="hash"),
            Rule("/source/<package>", methods=("GET",), endpoint="source"),
        ])

    @Request.application
    def __call__(self, request):
        """Dispatch a request to the handler of the matching endpoint.

        HTTP exceptions raised during routing or by a handler (including
        redirects and NotFound) are returned as the response.
        """
        mapadapter = self.routingmap.bind_to_environ(request.environ)
        try:
            endpoint, args = mapadapter.match()
            if endpoint == "package":
                return self.show_package(args["package"])
            elif endpoint == "detail":
                return self.show_detail(args["package1"], args["package2"])
            elif endpoint == "hash":
                return self.show_hash(args["function"], args["hashvalue"])
            elif endpoint == "index":
                # The index page uses relative links and therefore must be
                # served with a trailing slash. WSGI servers may omit
                # PATH_INFO and SCRIPT_NAME when they are empty, so look them
                # up without assuming the keys exist.
                if not request.environ.get("PATH_INFO"):
                    raise RequestRedirect(
                        request.environ.get("SCRIPT_NAME", "") + "/")
                return html_response(index_template.render(dict(urlroot="")))
            elif endpoint == "source":
                return self.show_source(args["package"])
            raise NotFound()
        except HTTPException as e:
            return e

    def get_details(self, package):
        """Look up a binary package by name.

        Returns a dict with the keys pid, package, version, architecture,
        num_files and total_size (0 when the package has no content rows).
        Raises NotFound when no package of that name exists.
        """
        cur = self.db.cursor()
        cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
                    (package,))
        row = cur.fetchone()
        if not row:
            raise NotFound()
        pid, version, architecture = row
        details = dict(pid=pid,
                       package=package,
                       version=version,
                       architecture=architecture)
        cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
                    (pid,))
        num_files, total_size = cur.fetchone()
        # sum() yields NULL when there are no rows to add up.
        if total_size is None:
            total_size = 0
        details.update(dict(num_files=num_files, total_size=total_size))
        return details

    def get_dependencies(self, pid):
        """Return the set of "required" values recorded for package pid."""
        cur = self.db.cursor()
        cur.execute("SELECT required FROM dependency WHERE pid = ?;",
                    (pid,))
        return set(row[0] for row in fetchiter(cur))

    def cached_sharedstats(self, pid):
        """Collect the sharing rows of package pid grouped by hash function.

        Returns a dict mapping a function combination label to a list of
        dicts with the keys package, duplicate and savable. Sharing of the
        package with itself is reported with package set to None. Function
        pairs not listed in hash_functions are ignored.
        """
        cur = self.db.cursor()
        sharedstats = {}
        cur.execute("SELECT pid2, package.name, func1, func2, files, size FROM sharing JOIN package ON sharing.pid2 = package.id WHERE pid1 = ?;",
                    (pid,))
        for pid2, package2, func1, func2, files, size in fetchiter(cur):
            if (func1, func2) not in hash_functions:
                continue
            curstats = sharedstats.setdefault(
                    function_combination(func1, func2), [])
            if pid2 == pid:
                package2 = None
            curstats.append(dict(package=package2, duplicate=files, savable=size))
        return sharedstats

    def show_package(self, package):
        """Render the page for one binary package.

        Raises NotFound (via get_details) for unknown packages.
        """
        params = self.get_details(package)
        params["dependencies"] = self.get_dependencies(params["pid"])
        params["shared"] = self.cached_sharedstats(params["pid"])
        params["urlroot"] = ".."
        return html_response(package_template.render(params))

    def compute_comparison(self, pid1, pid2):
        """Compute a sequence of comparison objects ordered by the size of the
        object in the first package. Each element of the sequence is a dict
        defining the following keys:
         * filenames: A set of filenames in package 1 (pid1) all referring to
           the same object.
         * size: Size of the object in bytes.
         * matches: A mapping from filenames in package 2 (pid2) to a mapping
           from hash function pairs to hash values.

        This is a generator. The cursors it opens are closed even when the
        consumer stops iterating early or a query fails.
        """
        cursize = -1
        # Entries of the current size, keyed by sha512 hash value.
        files = dict()
        # When comparing a package with itself, every file trivially matches
        # itself, so only objects matching at least two filenames are shown.
        minmatch = 2 if pid1 == pid2 else 1
        cur = self.db.cursor()
        try:
            cur.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE pid = ? AND function = 'sha512' ORDER BY size DESC;",
                        (pid1,))
            for cid, filename, size, hashvalue in fetchiter(cur):
                if cursize != size:
                    # Rows arrive ordered by size, so all entries of the
                    # previous size are complete and can be emitted.
                    for entry in files.values():
                        if len(entry["matches"]) >= minmatch:
                            yield entry
                    files.clear()
                    cursize = size

                if hashvalue in files:
                    # Same object under another filename: the matches were
                    # already looked up for the first occurrence.
                    files[hashvalue]["filenames"].add(filename)
                    continue

                entry = dict(filenames=set((filename,)), size=size, matches={})
                files[hashvalue] = entry

                cur2 = self.db.cursor()
                try:
                    cur2.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = ? AND pid = ?;",
                                 (cid, pid2))
                    for func1, matchhash, func2, matchname in fetchiter(cur2):
                        entry["matches"].setdefault(matchname, {})[func1, func2] = \
                                matchhash
                finally:
                    cur2.close()
        finally:
            cur.close()

        for entry in files.values():
            if len(entry["matches"]) >= minmatch:
                yield entry

    def show_detail(self, package1, package2):
        """Render the comparison page of two binary packages.

        The page is streamed, so the comparison is computed while the
        response is being sent. Raises NotFound (via get_details) when either
        package is unknown.
        """
        details1 = details2 = self.get_details(package1)
        if package1 != package2:
            details2 = self.get_details(package2)

        shared = self.compute_comparison(details1["pid"], details2["pid"])
        params = dict(
            details1=details1,
            details2=details2,
            urlroot="../..",
            shared=shared)
        return html_response(detail_template.stream(params))

    def show_hash(self, function, hashvalue):
        """Render the page listing all files carrying a given hash value.

        Only files whose hash function forms a pair with function that is
        listed in hash_functions are included. Raises NotFound when no such
        file exists.
        """
        cur = self.db.cursor()
        cur.execute("SELECT package.name, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id WHERE hash = ?;",
                    (hashvalue,))
        entries = [dict(package=package, filename=filename, size=size,
                        function=otherfunc)
                   for package, filename, size, otherfunc in fetchiter(cur)
                   if (function, otherfunc) in hash_functions]
        if not entries:
            raise NotFound()
        params = dict(function=function, hashvalue=hashvalue, entries=entries,
                      urlroot="../..")
        return html_response(hash_template.render(params))

    def show_source(self, package):
        """Render the overview page of a source package.

        For every binary package built from the source, the sharing row with
        the largest size is shown (None when there is no sharing row). Raises
        NotFound when the source has no binary packages.
        """
        cur = self.db.cursor()
        cur.execute("SELECT name FROM package WHERE source = ?;",
                    (package,))
        binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
        if not binpkgs:
            raise NotFound()
        cur.execute("SELECT p1.name, p2.name, sharing.func1, sharing.func2, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id WHERE p1.source = ?;",
                    (package,))
        for binary, otherbin, func1, func2, files, size in fetchiter(cur):
            entry = dict(package=otherbin,
                         funccomb=function_combination(func1, func2),
                         duplicate=files, savable=size)
            oldentry = binpkgs.get(binary)
            # Keep the entry already recorded unless this one saves more.
            if not (oldentry and oldentry["savable"] >= size):
                binpkgs[binary] = entry
        params = dict(source=package, packages=binpkgs, urlroot="..")
        return html_response(source_template.render(params))
365
def main():
    """Serve the application for development purposes.

    Opens test.sqlite3, wraps the application so that files from the
    "static" directory next to this module are served below "/", and runs a
    wsgiref server on port 8800 of all interfaces until interrupted.
    """
    application = Application(sqlite3.connect("test.sqlite3"))
    static_path = os.path.join(os.path.dirname(__file__), "static")
    wrapped = SharedDataMiddleware(application, {"/": static_path})
    server = make_server("0.0.0.0", 8800, wrapped)
    server.serve_forever()

if __name__ == "__main__":
    main()