fix attribution of logo
[~helmut/debian-dedup.git] / webapp.py
1 #!/usr/bin/python
2
3 import datetime
4 import os.path
5 import sqlite3
6 from wsgiref.simple_server import make_server
7
8 import jinja2
9 from werkzeug.exceptions import HTTPException, NotFound
10 from werkzeug.routing import Map, Rule, RequestRedirect
11 from werkzeug.wrappers import Request, Response
12 from werkzeug.wsgi import SharedDataMiddleware
13
14 from dedup.utils import fetchiter
15
# Accepted (func1, func2) hash function combinations. Code below skips any
# database row whose function pair is not listed here.
hash_functions = [
    ("sha512", "sha512"),
    ("image_sha512", "image_sha512"),
    ("gzip_sha512", "gzip_sha512"),
    ("sha512", "gzip_sha512"),
    ("gzip_sha512", "sha512"),
]
22
# Shared jinja2 environment. The loader is rooted at ".", so file-based
# templates such as base.html are looked up relative to the process's
# current working directory.
jinjaenv = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
24
def format_size(size):
    """Render a byte count as a short human readable string.

    The value is divided by 1024 for as long as it is at least 1024 and a
    larger unit is available (KB, MB, GB). Plain byte counts are printed as
    integers ("%d B"), scaled values with one decimal place. Values beyond
    the GB range are still reported in GB.
    """
    value = float(size)
    template = "%d B"
    for scaled_template in ("%.1f KB", "%.1f MB", "%.1f GB"):
        if value >= 1024:
            value /= 1024
            template = scaled_template
        else:
            # Once the value fits the current unit no larger unit can apply.
            break
    return template % value
38
def function_combination(function1, function2):
    """Return the display label for a pair of hash function names.

    Identical names collapse to the name itself; different names are
    rendered as "first -> second".
    """
    identical = function1 == function2
    return function1 if identical else "%s -> %s" % (function1, function2)
43
# Workaround for jinja bug #59 (broken filesizeformat): register format_size
# under the built-in filter's name so that "|filesizeformat" in the templates
# below calls format_size instead.
jinjaenv.filters["filesizeformat"] = format_size

# Load base.html through the environment's loader at import time. All inline
# templates defined below extend "base.html".
base_template = jinjaenv.get_template("base.html")
48
# Page for a single binary package, rendered by Application.show_package.
# Variables used: package, version, architecture, num_files, total_size,
# dependencies (container of package names), shared (mapping from a hash
# function label to a list of entries with package/duplicate/savable) and
# the urlroot passed by the caller. An entry without a package name is shown
# as "self".
package_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}duplication of {{ package|e }}{% endblock %}
{% block content %}<h1>{{ package|e }}</h1>
<p>Version: {{ version|e }}</p>
<p>Architecture: {{ architecture|e }}</p>
<p>Number of files: {{ num_files }}</p>
<p>Total size: {{ total_size|filesizeformat }}</p>
{%- if shared -%}
    {%- for function, sharing in shared.items() -%}
        <h3>sharing with respect to {{ function|e }}</h3>
        <table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>
        {%- for entry in sharing|sort(attribute="savable", reverse=true) -%}
            <tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}>
                {%- if entry.package %}<a href="{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a>{% else %}self{% endif %}
                <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td>
            <td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td>
            <td>{{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr>
        {%- endfor -%}
        </table>
    {%- endfor -%}
<p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
{%- endif -%}
{% endblock %}""")
73
# Side-by-side comparison of two packages, streamed by
# Application.show_detail. Variables used: details1 and details2 (only their
# package name is read here) and shared, an iterable of entries with
# filenames (names in the first package), size and matches (mapping from a
# filename in the second package to a mapping of hash function pairs to hash
# values). A rowspan is emitted when one entry matches several files.
detail_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%}
{% block content %}
<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1>
<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr>
<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr>
{%- for entry in shared -%}
    <tr><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }}</td><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>
    {%- for filename in entry.filenames %}<span class="filename">{{ filename|e }}</span>{% endfor -%}</td><td>
    {% for filename, match in entry.matches.items() -%}
        {% if not loop.first %}<tr><td>{% endif -%}
        {%- for funccomb, hashvalue in match.items() -%}
            <a href="../../hash/{{ funccomb[0]|e }}/{{ hashvalue|e }}">{{ funccomb[0]|e }}</a>
            {%- if funccomb[0] != funccomb[1] %} -&gt; <a href="../../hash/{{ funccomb[1]|e }}/{{ hashvalue|e }}">{{ funccomb[1]|e }}</a>{% endif %}
            {%- if not loop.last %}, {% endif %}
        {%- endfor -%}
        </td><td><span class="filename">{{ filename|e }}</span></td></tr>
    {%- endfor -%}
{%- endfor -%}
</table>
{% endblock %}""")
96
# Listing of all files carrying a given hash value, rendered by
# Application.show_hash. Variables used: function, hashvalue and entries
# (dicts with package, filename, size and function). The last column names
# the entry's hash function only when it differs from the requested one.
hash_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}information on {{ function|e }} hash {{ hashvalue|e }}{% endblock %}
{% block content %}
<h1>{{ function|e }} {{ hashvalue|e }}</h1>
<table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr>
{%- for entry in entries -%}
    <tr><td><a href="../../binary/{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a></td>
    <td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|filesizeformat }}</td>
    <td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr>
{%- endfor -%}
</table>
{% endblock %}""")
110
# Landing page, rendered for the "/" route. Besides static usage hints it
# contains a small script-driven form that jumps to binary/<packagename>.
#
# The script builds its target as a link relative to the index page, just
# like the static example links on the page, so that it keeps working when
# the application is not mounted at the server root. The package name is
# URL-encoded, and the permalink text is taken from the anchor's resolved
# href so that it is always a well-formed absolute URL.
index_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}Debian duplication detector{% endblock %}
{% block header %}
    <script type="text/javascript">
        function getLinkTarget() {
            var pkg = document.getElementById("pkg_name").value;
            if(pkg) {
                return "binary/"+encodeURIComponent(pkg);
            }
            return '#';
        }
        function processData() {
            var link = document.getElementById("perma_link");
            link.href = getLinkTarget();
            link.text = link.href;
        }
        window.onload = function() {
            document.getElementById('pkg_name').onkeyup = processData;
            document.getElementById("pkg_form").onsubmit = function () {
                location.href = getLinkTarget();
                return false;
            }
            processData();
            document.getElementById("form_div").style.display = '';
        }
    </script>
{% endblock %}
{% block content %}
<h1>Debian duplication detector</h1>
<ul>
<li>To inspect a particular binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
    <div style="display:none" id="form_div"><fieldset>
            <legend>Inspect package</legend>
            <noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>
            Enter binary package to inspect - Note: Non-existing packages will result in <b>404</b>-Errors
            <form id="pkg_form">
                <label for="pkg_name">Name: <input type="text" size="30" name="pkg_name" id="pkg_name"></label>
                <input type="submit" value="Go"> Permanent Link: <a id="perma_link" href="#"></a>
            </form>
    </fieldset></div></li>
<li>To inspect a combination of binary packages go to <pre>compare/&lt;firstpackage&gt;/&lt;secondpackage&gt;</pre> Example: <a href="compare/git/git">compare/git/git</a></li>
<li>To discover packages shipping a particular file go to <pre>hash/sha512/&lt;hashvalue&gt;</pre> Example: <a href="hash/sha512/ed94df7781793f06f9426a600c1bde86397afc7b35cb3aa11b60214bd31e35ad893b53a04a2cf4676154982d7c204c4aa165d6ccdaac0170031364a05dbab3bc">hash/sha512/ed94df7781793f06f9426a600c1bde86397afc7b35cb3aa11b60214bd31e35ad893b53a04a2cf4676154982d7c204c4aa165d6ccdaac0170031364a05dbab3bc</a></li>
</ul>
{% endblock %}""")
156
# Overview of all binary packages built from one source package, rendered by
# Application.show_source. Variables used: source and packages, a mapping
# from binary package name to either a false value (the savable and other
# package cells stay empty) or an entry whose savable and package fields are
# displayed.
source_template = jinjaenv.from_string(
"""{% extends "base.html" %}
{% block title %}overview of {{ source|e }}{% endblock %}
{% block content %}
<h1>overview of {{ source|e }}</h1>
<table border='1'><tr><th>binary from {{ source|e }}</th><th>savable</th><th>other package</th></tr>
{% for package, sharing in packages.items() %}
    <tr><td><a href="../binary/{{ package|e }}"><span class="binary-package">{{ package|e }}</span></a></td><td>
    {%- if sharing -%}
        {{ sharing.savable|filesizeformat }}</td><td><a href="../binary/{{ sharing.package|e }}"><span class="binary-package">{{ sharing.package|e }}</span></a> <a href="../compare/{{ package|e }}/{{ sharing.package|e }}">compare</a>
    {%- else -%}</td><td>{%- endif -%}
    </td></tr>
{% endfor %}
</table>
<p>Note: Not all sharing listed here. Click on binary packages with non-zero savable to see more.</p>
{% endblock %}""")
173
def encode_and_buffer(iterator):
    """Encode text pieces as UTF-8 and regroup them into larger chunks.

    Pieces are collected until at least 2048 encoded bytes are pending; the
    collected bytes are then yielded as one chunk. Whatever is left when the
    input is exhausted is yielded last, unless it is empty.
    """
    pending = []
    pending_size = 0
    for text in iterator:
        encoded = text.encode("utf8")
        pending.append(encoded)
        pending_size += len(encoded)
        if pending_size >= 2048:
            yield b"".join(pending)
            pending = []
            pending_size = 0
    if pending_size:
        yield b"".join(pending)
183
def html_response(unicode_iterator, max_age=24 * 60 * 60):
    """Build a cacheable text/html response from an iterable of text pieces.

    :param unicode_iterator: iterable of text pieces; they are encoded and
        regrouped by encode_and_buffer before being sent.
    :param max_age: cache lifetime in seconds, used for both the
        Cache-Control max-age directive and the Expires header.
    :returns: the werkzeug Response object.
    """
    resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
    resp.cache_control.max_age = max_age
    # HTTP dates are expressed in GMT and werkzeug treats naive datetime
    # objects as UTC. The expiry must therefore be computed from the current
    # UTC time; local time would shift it by the server's UTC offset.
    resp.expires = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds=max_age)
    return resp
189
def generate_shared(rows):
    """Group consecutive rows describing the same pair of files.

    Each row is a 7-tuple (filename1, size1, func1, filename2, size2, func2,
    hashvalue). Rows whose (func1, func2) pair is not in hash_functions are
    ignored. Consecutive remaining rows with the same two filenames are
    merged into one dict with the keys filename1, filename2, size1, size2
    and functions, the latter mapping each function pair to its hash value.
    Sizes are taken from the first row of a group.
    """
    current = None
    for fname1, fsize1, hashfunc1, fname2, fsize2, hashfunc2, digest in rows:
        pair = (hashfunc1, hashfunc2)
        if pair not in hash_functions:
            continue
        if current is not None:
            if current["filename1"] != fname1 or current["filename2"] != fname2:
                # A different file pair begins: emit the finished group.
                yield current
                current = None
        if current is None:
            current = dict(filename1=fname1, filename2=fname2, size1=fsize1,
                           size2=fsize2, functions=dict())
        current["functions"][pair] = digest
    if current is not None:
        yield current
210
class Application(object):
    """WSGI application rendering the duplication reports from a database.

    Routes:
     * /                                index page
     * /binary/<package>                report for one binary package
     * /compare/<package1>/<package2>   files shared between two packages
     * /hash/<function>/<hashvalue>     files carrying a given hash value
     * /source/<package>                overview of a source package
    """

    def __init__(self, db):
        """Store the database connection and build the URL map.

        :param db: database connection object; only its cursor() method is
            used by this class (main() passes a sqlite3 connection).
        """
        self.db = db
        self.routingmap = Map([
            Rule("/", methods=("GET",), endpoint="index"),
            Rule("/binary/<package>", methods=("GET",), endpoint="package"),
            Rule("/compare/<package1>/<package2>", methods=("GET",), endpoint="detail"),
            Rule("/hash/<function>/<hashvalue>", methods=("GET",), endpoint="hash"),
            Rule("/source/<package>", methods=("GET",), endpoint="source"),
        ])

    @Request.application
    def __call__(self, request):
        """Dispatch a request to the handler of the matching route.

        HTTP exceptions raised while matching or handling (NotFound,
        redirects, ...) are returned as the response.
        """
        mapadapter = self.routingmap.bind_to_environ(request.environ)
        try:
            endpoint, args = mapadapter.match()
            if endpoint == "package":
                return self.show_package(args["package"])
            elif endpoint == "detail":
                return self.show_detail(args["package1"], args["package2"])
            elif endpoint == "hash":
                return self.show_hash(args["function"], args["hashvalue"])
            elif endpoint == "index":
                # The index page uses relative links, so it has to be served
                # from a URL ending in a slash.
                if not request.environ["PATH_INFO"]:
                    raise RequestRedirect(request.environ["SCRIPT_NAME"] + "/")
                return html_response(index_template.render(dict(urlroot="")))
            elif endpoint == "source":
                return self.show_source(args["package"])
            raise NotFound()
        except HTTPException as e:
            return e

    def get_details(self, package):
        """Look up the basic facts about a binary package.

        :returns: a dict with the keys package, version, architecture,
            num_files and total_size. total_size is 0 when the database
            reports no size sum for the package.
        :raises NotFound: if the package is not in the package table.
        """
        cur = self.db.cursor()
        cur.execute("SELECT version, architecture FROM package WHERE package = ?;",
                    (package,))
        row = cur.fetchone()
        if not row:
            raise NotFound()
        version, architecture = row
        details = dict(package=package,
                       version=version,
                       architecture=architecture)
        cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ?;",
                    (package,))
        num_files, total_size = cur.fetchone()
        if total_size is None:
            total_size = 0
        details.update(dict(num_files=num_files, total_size=total_size))
        return details

    def get_dependencies(self, package):
        """Return the set of values in the "required" column of the
        dependency table for the given package."""
        cur = self.db.cursor()
        cur.execute("SELECT required FROM dependency WHERE package = ?;",
                    (package,))
        return set(row[0] for row in fetchiter(cur))

    def cached_sharedstats(self, package):
        """Read the sharing statistics of a package from the sharing table.

        :returns: a dict mapping a hash function label (see
            function_combination) to a list of dicts with the keys package,
            duplicate and savable. The package value is None for sharing of
            the package with itself. Function pairs that are not listed in
            hash_functions are left out.
        """
        cur = self.db.cursor()
        sharedstats = {}
        cur.execute("SELECT package2, func1, func2, files, size FROM sharing WHERE package1 = ?;",
                    (package,))
        for package2, func1, func2, files, size in fetchiter(cur):
            if (func1, func2) not in hash_functions:
                continue
            curstats = sharedstats.setdefault(
                    function_combination(func1, func2), list())
            if package2 == package:
                package2 = None
            curstats.append(dict(package=package2, duplicate=files, savable=size))
        return sharedstats

    def show_package(self, package):
        """Render the report page of one binary package.

        :raises NotFound: if the package is unknown (from get_details).
        """
        params = self.get_details(package)
        params["dependencies"] = self.get_dependencies(package)
        params["shared"] = self.cached_sharedstats(package)
        params["urlroot"] = ".."
        return html_response(package_template.render(params))

    def compute_comparison(self, package1, package2):
        """Compute a sequence of comparison objects ordered by the size of the
        object in the first package. Each element of the sequence is a dict
        defining the following keys:
         * filenames: A set of filenames in package1 all referring to the
           same object.
         * size: Size of the object in bytes.
         * matches: A mapping from filenames in package2 to a mapping from
           hash function pairs to hash values.

        This is a generator. The database cursors it opens are closed even
        when the consumer stops iterating early.
        """
        files = dict()
        # Comparing a package with itself: every file matches itself, so an
        # entry is only interesting with at least one further match.
        minmatch = 2 if package1 == package2 else 1
        cur = self.db.cursor()
        try:
            cur.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE package = ? AND function = 'sha512' ORDER BY size DESC;",
                        (package1,))
            cursize = -1
            for cid, filename, size, hashvalue in fetchiter(cur):
                if cursize != size:
                    # Rows arrive ordered by size, so all files collected so
                    # far are complete once the size changes.
                    for entry in files.values():
                        if len(entry["matches"]) >= minmatch:
                            yield entry
                    files.clear()
                    cursize = size

                if hashvalue in files:
                    # Same content under another name in package1.
                    files[hashvalue]["filenames"].add(filename)
                    continue

                entry = dict(filenames=set((filename,)), size=size, matches={})
                files[hashvalue] = entry

                cur2 = self.db.cursor()
                try:
                    cur2.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = ? AND package = ?;",
                                 (cid, package2))
                    for func1, matchhash, func2, otherfile in fetchiter(cur2):
                        entry["matches"].setdefault(otherfile, {})[func1, func2] = \
                                matchhash
                finally:
                    cur2.close()
        finally:
            cur.close()

        for entry in files.values():
            if len(entry["matches"]) >= minmatch:
                yield entry

    def show_detail(self, package1, package2):
        """Render the comparison page of two binary packages.

        The page is streamed, so the comparison is computed while the
        response is being sent.

        :raises NotFound: if either package is unknown (from get_details).
        """
        details1 = details2 = self.get_details(package1)
        if package1 != package2:
            details2 = self.get_details(package2)

        shared = self.compute_comparison(package1, package2)
        params = dict(
            details1=details1,
            details2=details2,
            urlroot="../..",
            shared=shared)
        return html_response(detail_template.stream(params))

    def show_hash(self, function, hashvalue):
        """Render the list of files carrying the given hash value.

        Files are matched on the hash value alone; a file is listed when the
        pair (requested function, function stored for the file) is in
        hash_functions.

        :raises NotFound: if no file qualifies.
        """
        cur = self.db.cursor()
        cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM content JOIN hash ON content.id = hash.cid WHERE hash = ?;",
                    (hashvalue,))
        entries = [dict(package=package, filename=filename, size=size,
                        function=otherfunc)
                   for package, filename, size, otherfunc in fetchiter(cur)
                   if (function, otherfunc) in hash_functions]
        if not entries:
            raise NotFound()
        params = dict(function=function, hashvalue=hashvalue, entries=entries,
                      urlroot="../..")
        return html_response(hash_template.render(params))

    def show_source(self, package):
        """Render the overview of the binary packages of a source package.

        For every binary package the sharing row with the largest size is
        shown; binary packages without any sharing row map to None.

        :raises NotFound: if no binary package has this source.
        """
        cur = self.db.cursor()
        cur.execute("SELECT package FROM package WHERE source = ?;",
                    (package,))
        binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
        if not binpkgs:
            raise NotFound()
        cur.execute("SELECT package.package, sharing.package2, sharing.func1, sharing.func2, sharing.files, sharing.size FROM package JOIN sharing ON package.package = sharing.package1 WHERE package.source = ?;",
                    (package,))
        # NOTE(review): unlike cached_sharedstats, rows are not filtered by
        # hash_functions here and sharing of a package with itself is not
        # special-cased - confirm that this is intended.
        for binary, otherbin, func1, func2, files, size in fetchiter(cur):
            entry = dict(package=otherbin,
                         funccomb=function_combination(func1, func2),
                         duplicate=files, savable=size)
            oldentry = binpkgs.get(binary)
            # Keep the entry seen first unless a strictly larger one appears.
            if not (oldentry and oldentry["savable"] >= size):
                binpkgs[binary] = entry
        params = dict(source=package, packages=binpkgs, urlroot="..")
        return html_response(source_template.render(params))
379
def main():
    """Serve the application with wsgiref on port 8800 of all interfaces.

    The database is read from "test.sqlite3". Files from the "static"
    directory next to this module are served by SharedDataMiddleware, which
    wraps the application.
    """
    database = sqlite3.connect("test.sqlite3")
    webapp = Application(database)
    static_root = os.path.join(os.path.dirname(__file__), "static")
    wrapped = SharedDataMiddleware(webapp, {"/": static_root})
    server = make_server("0.0.0.0", 8800, wrapped)
    server.serve_forever()

# Only start the server when executed as a script.
if __name__ == "__main__":
    main()