move templates to dedup package
[~helmut/debian-dedup.git] / webapp.py
1 #!/usr/bin/python
2
3 import datetime
4 import os.path
5 import sqlite3
6 from wsgiref.simple_server import make_server
7
8 import jinja2
9 from werkzeug.exceptions import HTTPException, NotFound
10 from werkzeug.routing import Map, Rule, RequestRedirect
11 from werkzeug.wrappers import Request, Response
12 from werkzeug.wsgi import SharedDataMiddleware
13
14 from dedup.utils import fetchiter
15
# Pairs of hash function names (first side, second side) that the web
# application is willing to compare. Sharing rows and hash lookups whose
# function pair is not listed here are ignored by the views below.
hash_functions = [
    # identical functions on both sides
    ("sha512", "sha512"),
    ("image_sha512", "image_sha512"),
    ("gzip_sha512", "gzip_sha512"),
    # mixed plain/gzip combinations, in both directions
    ("sha512", "gzip_sha512"),
    ("gzip_sha512", "sha512"),
]
22
# Shared jinja2 environment; templates are loaded from the "templates"
# directory inside the "dedup" package.
jinjaenv = jinja2.Environment(
    loader=jinja2.PackageLoader(package_name="dedup",
                                package_path="templates"))
24
def format_size(size):
    """Render a byte count as a human readable string.

    The value is divided by 1024 for each unit step (B, KB, MB, GB). Plain
    bytes are shown as an integer, larger units with one decimal place.
    Anything beyond the GB range is still expressed in GB.
    """
    value = float(size)
    template = "%d B"
    for bigger_unit in ("%.1f KB", "%.1f MB", "%.1f GB"):
        if value < 1024:
            # Small enough for the current unit.
            break
        value /= 1024
        template = bigger_unit
    return template % value
38
def function_combination(function1, function2):
    """Return a display label for a pair of hash function names.

    Two different functions are rendered as "first -> second"; when both
    sides use the same function, that function is returned unchanged.
    """
    if function1 != function2:
        return "%s -> %s" % (function1, function2)
    return function1
43
# Workaround for jinja bug #59 (broken filesizeformat): register our own
# formatter under the name of the stock filter.
jinjaenv.filters.update(filesizeformat=format_size)

# All templates are looked up once, when the module is imported.
# Shared page layout.
base_template = jinjaenv.get_template("base.html")
# One template per page type served by the application.
index_template = jinjaenv.get_template("index.html")
package_template = jinjaenv.get_template("binary.html")
source_template = jinjaenv.get_template("source.html")
detail_template = jinjaenv.get_template("compare.html")
hash_template = jinjaenv.get_template("hash.html")
53
def encode_and_buffer(iterator):
    """Encode text pieces as UTF-8 and regroup them into larger chunks.

    Pieces taken from iterator are encoded and collected until at least
    2048 bytes are pending; the collected bytes are then yielded as a single
    chunk. Whatever is left when the input ends is yielded last, unless it
    is empty.
    """
    pending = []
    pending_size = 0
    for text in iterator:
        encoded = text.encode("utf8")
        pending.append(encoded)
        pending_size += len(encoded)
        if pending_size < 2048:
            continue
        yield b"".join(pending)
        pending = []
        pending_size = 0
    if pending_size:
        yield b"".join(pending)
63
def html_response(unicode_iterator, max_age=24 * 60 * 60):
    """Build a cacheable text/html response from an iterable of text.

    unicode_iterator: iterable of unicode pieces; they are UTF-8 encoded and
        buffered by encode_and_buffer before being sent.
    max_age: cache lifetime in seconds (one day by default). It is used for
        both the Cache-Control max-age directive and the Expires header.

    Returns the werkzeug Response object.
    """
    resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
    resp.cache_control.max_age = max_age
    # Werkzeug serializes naive datetime objects as if they were UTC, so the
    # expiry time has to be derived from the current UTC time. Using local
    # time would shift the Expires header by the local UTC offset.
    resp.expires = (datetime.datetime.utcnow() +
                    datetime.timedelta(seconds=max_age))
    return resp
69
class Application(object):
    """WSGI application rendering the deduplication database as HTML pages.

    URLs are dispatched through a werkzeug routing map to one of the show_*
    methods (or directly to the index template). All data is read from the
    database connection passed to the constructor.
    """

    def __init__(self, db):
        """Remember the database connection and set up the URL routes.

        db: database connection providing cursor(); queries use "?"
            placeholders (main() passes a sqlite3 connection).
        """
        self.db = db
        self.routingmap = Map([
            Rule("/", methods=("GET",), endpoint="index"),
            Rule("/binary/<package>", methods=("GET",), endpoint="package"),
            Rule("/compare/<package1>/<package2>", methods=("GET",), endpoint="detail"),
            Rule("/hash/<function>/<hashvalue>", methods=("GET",), endpoint="hash"),
            Rule("/source/<package>", methods=("GET",), endpoint="source"),
        ])

    @Request.application
    def __call__(self, request):
        """Handle one request: match the URL and render the matching page.

        HTTP exceptions (not found, redirects, ...) raised while matching or
        rendering are returned as the response.
        """
        mapadapter = self.routingmap.bind_to_environ(request.environ)
        try:
            endpoint, args = mapadapter.match()
            if endpoint == "package":
                return self.show_package(args["package"])
            elif endpoint == "detail":
                return self.show_detail(args["package1"], args["package2"])
            elif endpoint == "hash":
                return self.show_hash(args["function"], args["hashvalue"])
            elif endpoint == "index":
                # The index page uses relative links, so it must be served
                # with a trailing slash. A WSGI server may omit PATH_INFO and
                # SCRIPT_NAME entirely when they are empty, hence .get().
                if not request.environ.get("PATH_INFO"):
                    raise RequestRedirect(
                        request.environ.get("SCRIPT_NAME", "") + "/")
                return html_response(index_template.render(dict(urlroot="")))
            elif endpoint == "source":
                return self.show_source(args["package"])
            raise NotFound()
        except HTTPException as e:
            return e

    def _query(self, query, params):
        """Run query with params and yield the result rows.

        The cursor is closed once the rows are exhausted or the generator is
        discarded, whichever happens first.
        """
        cur = self.db.cursor()
        try:
            cur.execute(query, params)
            for row in fetchiter(cur):
                yield row
        finally:
            cur.close()

    def get_details(self, package):
        """Return basic facts about a binary package as a dict.

        The dict has the keys pid, package, version, architecture, num_files
        and total_size (0 when the package has no content rows).

        Raises NotFound if no package with that name exists.
        """
        cur = self.db.cursor()
        try:
            cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
                        (package,))
            row = cur.fetchone()
            if not row:
                raise NotFound()
            pid, version, architecture = row
            details = dict(pid=pid,
                           package=package,
                           version=version,
                           architecture=architecture)
            cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
                        (pid,))
            num_files, total_size = cur.fetchone()
        finally:
            cur.close()
        if total_size is None:
            # sum() over zero rows yields NULL.
            total_size = 0
        details.update(dict(num_files=num_files, total_size=total_size))
        return details

    def get_dependencies(self, pid):
        """Return the set of "required" values recorded for package pid."""
        return set(row[0] for row in self._query(
            "SELECT required FROM dependency WHERE pid = ?;", (pid,)))

    def cached_sharedstats(self, pid):
        """Collect the precomputed sharing statistics for package pid.

        Returns a dict mapping a hash function combination label (see
        function_combination) to a list of dicts with the keys package,
        duplicate and savable. package is None for the entry describing
        sharing within the package itself. Function pairs that are not
        listed in hash_functions are skipped.
        """
        sharedstats = {}
        rows = self._query(
            "SELECT pid2, package.name, f1.name, f2.name, files, size "
            "FROM sharing "
            "JOIN package ON sharing.pid2 = package.id "
            "JOIN function AS f1 ON sharing.fid1 = f1.id "
            "JOIN function AS f2 ON sharing.fid2 = f2.id "
            "WHERE pid1 = ?;",
            (pid,))
        for pid2, package2, func1, func2, files, size in rows:
            if (func1, func2) not in hash_functions:
                continue
            curstats = sharedstats.setdefault(
                    function_combination(func1, func2), list())
            if pid2 == pid:
                package2 = None
            curstats.append(dict(package=package2, duplicate=files, savable=size))
        return sharedstats

    def show_package(self, package):
        """Render the page for a binary package.

        Raises NotFound (via get_details) if the package is unknown.
        """
        params = self.get_details(package)
        params["dependencies"] = self.get_dependencies(params["pid"])
        params["shared"] = self.cached_sharedstats(params["pid"])
        params["urlroot"] = ".."
        cur = self.db.cursor()
        try:
            cur.execute(
                "SELECT content.filename, issue.issue "
                "FROM content JOIN issue ON content.id = issue.cid "
                "WHERE content.pid = ?;",
                (params["pid"],))
            # Mapping from filename to issue text.
            params["issues"] = dict(cur.fetchall())
        finally:
            cur.close()
        return html_response(package_template.render(params))

    def compute_comparison(self, pid1, pid2):
        """Compute a sequence of comparison objects ordered by the size of the
        object in the first package. Each element of the sequence is a dict
        defining the following keys:
         * filenames: A set of filenames in package 1 (pid1) all referring to
           the same object.
         * size: Size of the object in bytes.
         * matches: A mapping from filenames in package 2 (pid2) to a mapping
           from hash function pairs to hash values.

        This is a generator; the database is queried while it is consumed.
        """
        # When a package is compared with itself every file trivially matches
        # itself, so at least two matching filenames are required then.
        minmatch = 2 if pid1 == pid2 else 1
        cursize = -1
        # Entries of the current size, keyed by sha512 hash value.
        files = dict()
        rows = self._query(
            "SELECT content.id, content.filename, content.size, hash.hash "
            "FROM content "
            "JOIN hash ON content.id = hash.cid "
            "JOIN duplicate ON content.id = duplicate.cid "
            "JOIN function ON hash.fid = function.id "
            "WHERE pid = ? AND function.name = 'sha512' "
            "ORDER BY size DESC;",
            (pid1,))
        for cid, filename, size, hashvalue in rows:
            if cursize != size:
                # Rows arrive ordered by size, so all entries of the previous
                # size are complete now and can be emitted.
                for entry in files.values():
                    if len(entry["matches"]) >= minmatch:
                        yield entry
                files.clear()
                cursize = size

            if hashvalue in files:
                # Same object under another name; its matches are known.
                files[hashvalue]["filenames"].add(filename)
                continue

            entry = dict(filenames=set((filename,)), size=size, matches={})
            files[hashvalue] = entry

            matchrows = self._query(
                "SELECT fa.name, ha.hash, fb.name, filename "
                "FROM hash AS ha "
                "JOIN hash AS hb ON ha.hash = hb.hash "
                "JOIN content ON hb.cid = content.id "
                "JOIN function AS fa ON ha.fid = fa.id "
                "JOIN function AS fb ON hb.fid = fb.id "
                "WHERE ha.cid = ? AND pid = ?;",
                (cid, pid2))
            for func1, matchhash, func2, otherfile in matchrows:
                entry["matches"].setdefault(otherfile, {})[func1, func2] = \
                        matchhash

        # Emit the entries of the last (smallest) size.
        for entry in files.values():
            if len(entry["matches"]) >= minmatch:
                yield entry

    def show_detail(self, package1, package2):
        """Render the file-level comparison of two binary packages.

        Raises NotFound (via get_details) if either package is unknown.
        """
        details1 = details2 = self.get_details(package1)
        if package1 != package2:
            details2 = self.get_details(package2)

        shared = self.compute_comparison(details1["pid"], details2["pid"])
        params = dict(
            details1=details1,
            details2=details2,
            urlroot="../..",
            shared=shared)
        # Stream the template so the lazily computed comparison does not have
        # to be rendered completely before the response starts.
        return html_response(detail_template.stream(params))

    def show_hash(self, function, hashvalue):
        """Render the list of files having hashvalue for a hash function.

        Files are included if the pair (function, function that produced the
        stored hash) is listed in hash_functions.

        Raises NotFound if no such file exists.
        """
        rows = self._query(
            "SELECT package.name, content.filename, content.size, function.name "
            "FROM hash "
            "JOIN content ON hash.cid = content.id "
            "JOIN package ON content.pid = package.id "
            "JOIN function ON hash.fid = function.id "
            "WHERE hash = ?;",
            (hashvalue,))
        entries = [dict(package=package, filename=filename, size=size,
                        function=otherfunc)
                   for package, filename, size, otherfunc in rows
                   if (function, otherfunc) in hash_functions]
        if not entries:
            raise NotFound()
        params = dict(function=function, hashvalue=hashvalue, entries=entries,
                      urlroot="../..")
        return html_response(hash_template.render(params))

    def show_source(self, package):
        """Render the overview page of a source package.

        For every binary package built from the source, the sharing entry
        with the largest savable size is shown (None if there is none).

        Raises NotFound if no binary package belongs to that source.
        """
        binpkgs = dict.fromkeys(pkg for pkg, in self._query(
            "SELECT name FROM package WHERE source = ?;", (package,)))
        if not binpkgs:
            raise NotFound()
        rows = self._query(
            "SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size "
            "FROM sharing "
            "JOIN package AS p1 ON sharing.pid1 = p1.id "
            "JOIN package AS p2 ON sharing.pid2 = p2.id "
            "JOIN function AS f1 ON sharing.fid1 = f1.id "
            "JOIN function AS f2 ON sharing.fid2 = f2.id "
            "WHERE p1.source = ?;",
            (package,))
        for binary, otherbin, func1, func2, files, size in rows:
            entry = dict(package=otherbin,
                         funccomb=function_combination(func1, func2),
                         duplicate=files, savable=size)
            oldentry = binpkgs.get(binary)
            # Keep only the entry with the largest savable size per binary.
            if not (oldentry and oldentry["savable"] >= size):
                binpkgs[binary] = entry
        params = dict(source=package, packages=binpkgs, urlroot="..")
        return html_response(source_template.render(params))
244
def main(database="test.sqlite3", host="0.0.0.0", port=8800):
    """Serve the web application with the wsgiref development server.

    database: path of the sqlite3 database file to read from.
    host: address to listen on; the default listens on all interfaces.
    port: TCP port to listen on.

    Files from the "static" directory next to this module are served
    directly by SharedDataMiddleware. This function blocks until the server
    is stopped.
    """
    db = sqlite3.connect(database)
    try:
        app = Application(db)
        staticdir = os.path.join(os.path.dirname(__file__), "static")
        app = SharedDataMiddleware(app, {"/": staticdir})
        make_server(host, port, app).serve_forever()
    finally:
        # Release the database even if the server is interrupted.
        db.close()

if __name__ == "__main__":
    main()