9993cb0a21645d7d2745af2d4b7152d434a2e3a7
[~helmut/debian-dedup.git] / webapp.py
1 #!/usr/bin/python3
2
3 import argparse
4 import contextlib
5 import datetime
6 import sqlite3
7 from wsgiref.simple_server import make_server
8
9 import jinja2
10 from werkzeug.exceptions import HTTPException, NotFound
11 from werkzeug.routing import Map, Rule
12 from werkzeug.utils import redirect
13 from werkzeug.wrappers import Request, Response
14 try:
15     from werkzeug.middleware.shared_data import SharedDataMiddleware
16 except ImportError:
17     from werkzeug.wsgi import SharedDataMiddleware
18
19 from dedup.utils import fetchiter
20
# Shared jinja2 environment; templates are loaded from the "templates"
# directory of the "dedup" package.
# NOTE(review): no autoescape argument is passed here, so escaping of
# interpolated values is presumably left to the templates -- confirm.
jinjaenv = jinja2.Environment(loader=jinja2.PackageLoader("dedup", "templates"))
22
def format_size(size):
    """Render a byte count as a short human-readable string.

    The value is converted to float and divided by 1024 for as long as it is
    at least 1024, stepping through B, KB, MB and GB. Bytes are printed as an
    integer, the larger units with one decimal place. GB is the largest unit,
    so bigger values are reported as a large number of GB.
    """
    value = float(size)
    chosen_format = "%d B"
    for bigger_format in ("%.1f KB", "%.1f MB", "%.1f GB"):
        # Written as "not >=" so that NaN stops the scaling immediately.
        if not value >= 1024:
            break
        value /= 1024
        chosen_format = bigger_format
    return chosen_format % value
36
def function_combination(function1, function2):
    """Return a label for a pair of hash function names.

    Equal names collapse to the single name; differing names are joined as
    "first -> second".
    """
    return (function1 if function1 == function2
            else "%s -> %s" % (function1, function2))
41
# Workaround for jinja bug #59 (broken filesizeformat): register format_size
# under the name "filesizeformat" so templates using that filter get our
# implementation.
jinjaenv.filters["filesizeformat"] = format_size

# All page templates are looked up once, at import time, and reused for every
# request.
base_template = jinjaenv.get_template("base.html")
package_template = jinjaenv.get_template("binary.html")
detail_template = jinjaenv.get_template("compare.html")
hash_template = jinjaenv.get_template("hash.html")
index_template = jinjaenv.get_template("index.html")
source_template = jinjaenv.get_template("source.html")
51
def encode_and_buffer(iterator):
    """Encode an iterable of str to UTF-8 and regroup it into larger chunks.

    Encoded pieces are accumulated until at least 2048 bytes are pending, at
    which point the accumulated bytes are yielded as one chunk. Whatever is
    left once the input is exhausted is yielded as a final, shorter chunk;
    nothing is yielded for empty output.
    """
    pending = bytearray()
    for text in iterator:
        pending += text.encode("utf8")
        if len(pending) < 2048:
            continue
        yield bytes(pending)
        pending = bytearray()
    if pending:
        yield bytes(pending)
61
def html_response(unicode_iterator, max_age=24 * 60 * 60):
    """Build a cacheable text/html response from an iterable of str.

    :param unicode_iterator: iterable yielding pieces of the page as str; it
        is encoded and re-chunked lazily by encode_and_buffer.
    :param max_age: cache lifetime in seconds, used for both the
        Cache-Control max-age directive and the Expires header.
    :returns: the werkzeug Response object.
    """
    resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
    resp.cache_control.max_age = max_age
    # Use an aware UTC timestamp: a naive local-time datetime would be
    # serialised as if it were UTC, skewing Expires by the server's UTC offset.
    now = datetime.datetime.now(datetime.timezone.utc)
    resp.expires = now + datetime.timedelta(seconds=max_age)
    return resp
67
class InternalRedirect(Exception):
    """Signal that the request should be answered with a redirect.

    ``target`` is the redirect destination and ``code`` the HTTP status code
    to use (301 unless given otherwise).
    """

    def __init__(self, target, code=301):
        super().__init__()
        self.code = code
        self.target = target
73
class Application:
    """WSGI application rendering the dedup database as HTML pages.

    Requests are routed to one of five endpoints (index, binary package,
    package comparison, hash lookup, source package); each handler queries
    the database connection given at construction time and streams a jinja
    template as the response.
    """

    def __init__(self, db):
        """Store the DB-API connection *db* and build the URL routing map."""
        self.db = db
        self.routingmap = Map([
            Rule("/", methods=("GET",), endpoint="index"),
            Rule("/binary/<package>", methods=("GET",), endpoint="package"),
            Rule("/compare/<package1>/<package2>", methods=("GET",), endpoint="detail"),
            Rule("/hash/<function>/<hashvalue>", methods=("GET",), endpoint="hash"),
            Rule("/source/<package>", methods=("GET",), endpoint="source"),
        ])

    @Request.application
    def __call__(self, request):
        """Dispatch *request* to the matching handler and return a response.

        InternalRedirect raised by a handler is turned into a redirect
        relative to SCRIPT_NAME; any werkzeug HTTPException (including routing
        failures) is returned as the response itself.
        """
        environ = request.environ
        mapadapter = self.routingmap.bind_to_environ(environ)
        try:
            endpoint, args = mapadapter.match()
            if endpoint == "package":
                return self.show_package(args["package"])
            elif endpoint == "detail":
                return self.show_detail(args["package1"], args["package2"])
            elif endpoint == "hash":
                if args["function"] == "image_sha512":
                    # backwards compatibility
                    raise InternalRedirect("/hash/png_sha512/%s" %
                                           args["hashvalue"])
                return self.show_hash(args["function"], args["hashvalue"])
            elif endpoint == "index":
                # A server may omit PATH_INFO entirely when it is empty, so
                # treat a missing key like an empty string.
                if not environ.get("PATH_INFO"):
                    raise InternalRedirect("/")
                return html_response(index_template.stream(dict(urlroot="")))
            elif endpoint == "source":
                return self.show_source(args["package"])
            raise NotFound()
        except InternalRedirect as r:
            # SCRIPT_NAME may likewise be absent when the app is mounted at
            # the server root.
            return redirect(environ.get("SCRIPT_NAME", "") + r.target, r.code)
        except HTTPException as e:
            return e

    def get_details(self, package):
        """Return basic facts about the binary package named *package*.

        :returns: dict with keys pid, package, version, architecture,
            num_files and total_size (0 when the package has no content rows).
        :raises NotFound: if no package with that name exists.
        """
        with contextlib.closing(self.db.cursor()) as cur:
            cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
                        (package,))
            row = cur.fetchone()
            if not row:
                raise NotFound()
            pid, version, architecture = row
            details = dict(pid=pid,
                           package=package,
                           version=version,
                           architecture=architecture)
            cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
                        (pid,))
            num_files, total_size = cur.fetchone()
        # sum() over zero rows yields NULL.
        if total_size is None:
            total_size = 0
        details.update(num_files=num_files, total_size=total_size)
        return details

    def get_dependencies(self, pid):
        """Return the set of ``required`` values recorded for package *pid*."""
        with contextlib.closing(self.db.cursor()) as cur:
            cur.execute("SELECT required FROM dependency WHERE pid = ?;",
                        (pid,))
            return {row[0] for row in fetchiter(cur)}

    def cached_sharedstats(self, pid):
        """Read the precomputed sharing statistics for package *pid*.

        :returns: dict mapping a function combination label to a list of
            dicts with keys package (None when the other package is *pid*
            itself), duplicate and savable.
        """
        sharedstats = {}
        with contextlib.closing(self.db.cursor()) as cur:
            cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
                        (pid,))
            for pid2, package2, func1, func2, files, size in fetchiter(cur):
                curstats = sharedstats.setdefault(
                        function_combination(func1, func2), list())
                if pid2 == pid:
                    package2 = None
                curstats.append(dict(package=package2, duplicate=files,
                                     savable=size))
        return sharedstats

    def show_package(self, package):
        """Render the page for the binary package named *package*.

        :raises NotFound: if the package does not exist (via get_details).
        """
        params = self.get_details(package)
        params["dependencies"] = self.get_dependencies(params["pid"])
        params["shared"] = self.cached_sharedstats(params["pid"])
        params["urlroot"] = ".."
        # closing() releases the cursor even if the query raises.
        with contextlib.closing(self.db.cursor()) as cur:
            cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;",
                        (params["pid"],))
            params["issues"] = dict(cur.fetchall())
        return html_response(package_template.stream(params))

    def compute_comparison(self, pid1, pid2):
        """Compute a sequence of comparison objects ordered by the size of the
        object in the first package. Each element of the sequence is a dict
        defining the following keys:
         * filenames: A set of filenames in package 1 (pid1) all referring to
           the same object.
         * size: Size of the object in bytes.
         * matches: A mapping from filenames in package 2 (pid2) to a mapping
           from hash function pairs to hash values.

        This is a generator; both database cursors it uses are closed when it
        finishes, is closed early, or fails.
        """
        # When a package is compared with itself two matching files are
        # required (presumably because every file also matches itself).
        minmatch = 2 if pid1 == pid2 else 1
        cursize = -1
        # Entries of the current size, keyed by sha512 hash value.
        files = dict()
        with contextlib.closing(self.db.cursor()) as cur, \
                contextlib.closing(self.db.cursor()) as cur2:
            cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;",
                        (pid1,))
            for cid, filename, size, hashvalue in fetchiter(cur):
                if cursize != size:
                    # Rows arrive ordered by size, so a size change means the
                    # previous group is complete and can be emitted.
                    for entry in files.values():
                        if len(entry["matches"]) >= minmatch:
                            yield entry
                    files.clear()
                    cursize = size

                if hashvalue in files:
                    # Same object under another filename: no need to query
                    # its matches again.
                    files[hashvalue]["filenames"].add(filename)
                    continue

                entry = dict(filenames=set((filename,)), size=size, matches={})
                files[hashvalue] = entry

                cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;",
                             (cid, pid2))
                for func1, matchhash, func2, matchname in fetchiter(cur2):
                    entry["matches"].setdefault(matchname, {})[func1, func2] = \
                            matchhash

        # Emit the last size group.
        for entry in files.values():
            if len(entry["matches"]) >= minmatch:
                yield entry

    def show_detail(self, package1, package2):
        """Render the comparison page for two binary packages.

        :raises NotFound: if either package does not exist.
        """
        details1 = details2 = self.get_details(package1)
        if package1 != package2:
            details2 = self.get_details(package2)

        shared = self.compute_comparison(details1["pid"], details2["pid"])
        params = dict(
            details1=details1,
            details2=details2,
            urlroot="../..",
            shared=shared)
        return html_response(detail_template.stream(params))

    def show_hash(self, function, hashvalue):
        """Render the page listing all files with the given hash value.

        If nothing matches exactly, *hashvalue* is treated as a prefix: when
        exactly one stored hash for *function* starts with it, a 302 redirect
        to the full value is issued.

        :raises InternalRedirect: for an unambiguous hash prefix.
        :raises NotFound: if there is no match or the prefix is ambiguous.
        """
        with contextlib.closing(self.db.cursor()) as cur:
            cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
                        (function, hashvalue,))
            entries = [dict(package=package, filename=filename, size=size,
                            function=otherfunc)
                       for package, filename, size, otherfunc in fetchiter(cur)]
            if not entries:
                # Assumption: '~' serves as an infinite character larger than
                # any other character in the hash column.
                cur.execute("SELECT DISTINCT hash.hash FROM hash JOIN function ON hash.fid = function.id WHERE function.name = ? AND hash.hash >= ? AND hash.hash <= ? LIMIT 2;",
                            (function, hashvalue, hashvalue + '~'))
                values = cur.fetchall()
                if len(values) == 1:
                    raise InternalRedirect("/hash/%s/%s" %
                                           (function, values[0][0]), 302)
                raise NotFound()
        params = dict(function=function, hashvalue=hashvalue, entries=entries,
                      urlroot="../..")
        return html_response(hash_template.stream(params))

    def show_source(self, package):
        """Render the page for the source package named *package*.

        For every binary package built from the source, the sharing record
        with the largest savable size is shown (None if it has none).

        :raises NotFound: if no binary package has this source.
        """
        with contextlib.closing(self.db.cursor()) as cur:
            cur.execute("SELECT name FROM package WHERE source = ?;",
                        (package,))
            binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
            if not binpkgs:
                raise NotFound()
            cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;",
                        (package,))
            for binary, otherbin, func1, func2, files, size in fetchiter(cur):
                entry = dict(package=otherbin,
                             funccomb=function_combination(func1, func2),
                             duplicate=files, savable=size)
                oldentry = binpkgs.get(binary)
                # Keep the existing entry only if it saves at least as much.
                if not (oldentry and oldentry["savable"] >= size):
                    binpkgs[binary] = entry
        params = dict(source=package, packages=binpkgs, urlroot="..")
        return html_response(source_template.stream(params))
260
def main():
    """Serve the web application with the wsgiref development server.

    Command line options select the sqlite3 database file and the address and
    port to listen on; the defaults are test.sqlite3, 0.0.0.0 and 8800.
    Static files are served from the "static" directory of the dedup package
    under /static.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--database", action="store",
                        default="test.sqlite3",
                        help="path to the sqlite3 database file")
    parser.add_argument("--host", action="store", default="0.0.0.0",
                        help="address to listen on (default: %(default)s)")
    parser.add_argument("--port", action="store", type=int, default=8800,
                        help="TCP port to listen on (default: %(default)s)")
    args = parser.parse_args()
    # Close the database connection when the server stops (e.g. on Ctrl-C).
    with contextlib.closing(sqlite3.connect(args.database)) as db:
        app = Application(db)
        app = SharedDataMiddleware(app, {"/static": ("dedup", "static")})
        make_server(args.host, args.port, app).serve_forever()

if __name__ == "__main__":
    main()