webapp: open cursors less often
[~helmut/debian-dedup.git] / webapp.py
1 #!/usr/bin/python
2
3 import contextlib
4 import datetime
5 import optparse
6 import sqlite3
7 from wsgiref.simple_server import make_server
8
9 import jinja2
10 from werkzeug.exceptions import HTTPException, NotFound
11 from werkzeug.routing import Map, Rule, RequestRedirect
12 from werkzeug.wrappers import Request, Response
13 from werkzeug.wsgi import SharedDataMiddleware
14
15 from dedup.utils import fetchiter
16
# Shared Jinja2 environment; templates are loaded from the "templates"
# directory of the "dedup" package.
jinjaenv = jinja2.Environment(loader=jinja2.PackageLoader("dedup", "templates"))
18
def format_size(size):
    """Render a byte count as a short human-readable string.

    Values below 1024 are shown as whole bytes ("%d B"); larger values are
    divided by 1024 per step and shown with one decimal place in KB, MB or
    GB. GB is the largest unit, so very large values stay in GB.

    :param size: number of bytes; anything accepted by float()
    :returns: formatted string such as "512 B" or "1.5 MB"
    """
    value = float(size)
    if value < 1024:
        return "%d B" % value
    for unit in ("KB", "MB", "GB"):
        value /= 1024
        if value < 1024:
            break
    # When the loop runs to completion, unit is still "GB".
    return "%.1f %s" % (value, unit)
32
def function_combination(function1, function2):
    """Return a label for a pair of hash function names.

    Two equal names collapse to that single name; otherwise the result is
    "<function1> -> <function2>".
    """
    if function1 != function2:
        return "%s -> %s" % (function1, function2)
    return function1
37
# Workaround for jinja bug #59 (broken filesizeformat): register format_size
# under the name "filesizeformat" in the environment's filter table.
jinjaenv.filters["filesizeformat"] = format_size

# Templates are looked up once, when the module is imported.
# NOTE(review): base_template is not referenced elsewhere in this file;
# presumably base.html is extended by the other templates -- confirm.
base_template = jinjaenv.get_template("base.html")
package_template = jinjaenv.get_template("binary.html")
detail_template = jinjaenv.get_template("compare.html")
hash_template = jinjaenv.get_template("hash.html")
index_template = jinjaenv.get_template("index.html")
source_template = jinjaenv.get_template("source.html")
47
def encode_and_buffer(iterator):
    """Encode text pieces as UTF-8 and regroup them into larger byte chunks.

    Pieces are accumulated until at least 2048 encoded bytes are pending,
    then emitted as one byte string. Any non-empty remainder is emitted
    when the input is exhausted.

    :param iterator: iterable of text strings
    :returns: generator of byte strings
    """
    pending = []
    pending_size = 0
    for text in iterator:
        chunk = text.encode("utf8")
        pending.append(chunk)
        pending_size += len(chunk)
        if pending_size >= 2048:
            yield b"".join(pending)
            pending = []
            pending_size = 0
    if pending_size:
        yield b"".join(pending)
57
def html_response(unicode_iterator, max_age=24 * 60 * 60):
    """Build a cacheable text/html response from an iterable of text.

    The body is UTF-8 encoded and re-chunked by encode_and_buffer. Both
    Cache-Control max-age and Expires are set from max_age.

    :param unicode_iterator: iterable of text pieces forming the page
    :param max_age: cache lifetime in seconds (default: one day)
    :returns: werkzeug Response
    """
    resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
    resp.cache_control.max_age = max_age
    # Expires is an HTTP date in GMT. A naive datetime is serialised as if
    # it were UTC, so it must be derived from UTC rather than local time.
    resp.expires = datetime.datetime.utcnow() + \
        datetime.timedelta(seconds=max_age)
    return resp
63
class Application(object):
    """WSGI application that renders package, comparison, hash and source
    pages from the dedup database.

    Every query opens a short-lived cursor on the shared connection and
    closes it again via contextlib.closing, including on error paths.
    """

    def __init__(self, db):
        """Remember the database connection and build the URL routing map.

        :param db: database connection providing cursor(); main() passes a
            sqlite3 connection
        """
        self.db = db
        self.routingmap = Map([
            Rule("/", methods=("GET",), endpoint="index"),
            Rule("/binary/<package>", methods=("GET",), endpoint="package"),
            Rule("/compare/<package1>/<package2>", methods=("GET",), endpoint="detail"),
            Rule("/hash/<function>/<hashvalue>", methods=("GET",), endpoint="hash"),
            Rule("/source/<package>", methods=("GET",), endpoint="source"),
        ])

    @Request.application
    def __call__(self, request):
        """Dispatch a request to the handler for the matched endpoint.

        HTTP exceptions (including redirects and NotFound) raised while
        matching or handling are returned as the response.
        """
        mapadapter = self.routingmap.bind_to_environ(request.environ)
        try:
            endpoint, args = mapadapter.match()
            if endpoint == "package":
                return self.show_package(args["package"])
            elif endpoint == "detail":
                return self.show_detail(args["package1"], args["package2"])
            elif endpoint == "hash":
                if args["function"] == "image_sha512":
                    # backwards compatibility
                    raise RequestRedirect("%s/hash/png_sha512/%s" %
                                          (request.environ["SCRIPT_NAME"],
                                           args["hashvalue"]))
                return self.show_hash(args["function"], args["hashvalue"])
            elif endpoint == "index":
                # An empty PATH_INFO means the application root was requested
                # without a trailing slash; redirect to the slash form.
                if not request.environ["PATH_INFO"]:
                    raise RequestRedirect(request.environ["SCRIPT_NAME"] + "/")
                return html_response(index_template.render(dict(urlroot="")))
            elif endpoint == "source":
                return self.show_source(args["package"])
            raise NotFound()
        except HTTPException as e:
            return e

    def get_details(self, package):
        """Look up a binary package by name.

        :param package: package name
        :returns: dict with the keys pid, package, version, architecture,
            num_files and total_size (0 when the package has no content)
        :raises NotFound: if no package with that name exists
        """
        with contextlib.closing(self.db.cursor()) as cur:
            cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
                        (package,))
            row = cur.fetchone()
            if not row:
                raise NotFound()
            pid, version, architecture = row
            details = dict(pid=pid,
                           package=package,
                           version=version,
                           architecture=architecture)
            cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
                        (pid,))
            num_files, total_size = cur.fetchone()
        # sum() over zero rows yields NULL.
        if total_size is None:
            total_size = 0
        details.update(dict(num_files=num_files, total_size=total_size))
        return details

    def get_dependencies(self, pid):
        """Return the set of "required" values recorded for package pid."""
        with contextlib.closing(self.db.cursor()) as cur:
            cur.execute("SELECT required FROM dependency WHERE pid = ?;",
                        (pid,))
            return set(row[0] for row in fetchiter(cur))

    def cached_sharedstats(self, pid):
        """Read the precomputed sharing statistics of package pid.

        :returns: dict mapping a hash function combination label (see
            function_combination) to a list of dicts with the keys package,
            duplicate and savable. package is None for sharing within the
            package itself.
        """
        sharedstats = {}
        with contextlib.closing(self.db.cursor()) as cur:
            cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
                        (pid,))
            for pid2, package2, func1, func2, files, size in fetchiter(cur):
                curstats = sharedstats.setdefault(
                        function_combination(func1, func2), [])
                if pid2 == pid:
                    package2 = None
                curstats.append(dict(package=package2, duplicate=files,
                                     savable=size))
        return sharedstats

    def show_package(self, package):
        """Render the page for one binary package.

        :raises NotFound: if the package does not exist (via get_details)
        """
        params = self.get_details(package)
        params["dependencies"] = self.get_dependencies(params["pid"])
        params["shared"] = self.cached_sharedstats(params["pid"])
        params["urlroot"] = ".."
        # closing() releases the cursor even if the query raises.
        with contextlib.closing(self.db.cursor()) as cur:
            cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;",
                        (params["pid"],))
            # Maps filename to issue; a later row for the same filename
            # replaces an earlier one.
            params["issues"] = dict(cur.fetchall())
        return html_response(package_template.render(params))

    def compute_comparison(self, pid1, pid2):
        """Compute a sequence of comparison objects ordered by the size of the
        object in the first package. Each element of the sequence is a dict
        defining the following keys:
         * filenames: A set of filenames in package 1 (pid1) all referring to
           the same object.
         * size: Size of the object in bytes.
         * matches: A mapping from filenames in package 2 (pid2) to a mapping
           from hash function pairs to hash values.

        This is a generator: the database is queried lazily while the result
        is consumed. Both cursors are closed when the generator finishes, is
        closed early, or fails.
        """
        # When a package is compared with itself, every file is found again
        # under its own name, so a second match is needed to be of interest.
        minmatch = 2 if pid1 == pid2 else 1
        cursize = -1
        files = dict()
        with contextlib.closing(self.db.cursor()) as cur:
            with contextlib.closing(self.db.cursor()) as cur2:
                cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;",
                            (pid1,))
                for cid, filename, size, hashvalue in fetchiter(cur):
                    # Rows arrive ordered by size, so a size change means
                    # the current group of objects is complete: emit it.
                    if cursize != size:
                        for entry in files.values():
                            if len(entry["matches"]) >= minmatch:
                                yield entry
                        files.clear()
                        cursize = size

                    # Same sha512 as an earlier file of this size: it is the
                    # same object, so only record the additional filename.
                    if hashvalue in files:
                        files[hashvalue]["filenames"].add(filename)
                        continue

                    entry = dict(filenames=set((filename,)), size=size,
                                 matches={})
                    files[hashvalue] = entry

                    cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;",
                                 (cid, pid2))
                    # Distinct names keep the outer loop variables intact.
                    for func1, matchhash, func2, matchname in fetchiter(cur2):
                        entry["matches"].setdefault(
                            matchname, {})[func1, func2] = matchhash

        # Emit the last size group.
        for entry in files.values():
            if len(entry["matches"]) >= minmatch:
                yield entry

    def show_detail(self, package1, package2):
        """Render the comparison page for two binary packages.

        The comparison itself is passed to the template as a lazy generator
        and the template is streamed.

        :raises NotFound: if either package does not exist
        """
        details1 = details2 = self.get_details(package1)
        if package1 != package2:
            details2 = self.get_details(package2)

        shared = self.compute_comparison(details1["pid"], details2["pid"])
        params = dict(
            details1=details1,
            details2=details2,
            urlroot="../..",
            shared=shared)
        return html_response(detail_template.stream(params))

    def show_hash(self, function, hashvalue):
        """Render the page listing all files with a given hash value.

        Hashes of any function in the same eqclass as `function` are
        considered.

        :raises NotFound: if no file has that hash
        """
        with contextlib.closing(self.db.cursor()) as cur:
            cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
                        (function, hashvalue,))
            entries = [dict(package=package, filename=filename, size=size,
                            function=otherfunc)
                       for package, filename, size, otherfunc in fetchiter(cur)]
        if not entries:
            raise NotFound()
        params = dict(function=function, hashvalue=hashvalue, entries=entries,
                      urlroot="../..")
        return html_response(hash_template.render(params))

    def show_source(self, package):
        """Render the page for a source package.

        For every binary package built from the source, the sharing entry
        with the largest savable size is selected. Binary packages without
        any sharing entry map to None.

        :raises NotFound: if no binary package has this source
        """
        with contextlib.closing(self.db.cursor()) as cur:
            cur.execute("SELECT name FROM package WHERE source = ?;",
                        (package,))
            binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
            if not binpkgs:
                raise NotFound()
            cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;",
                        (package,))
            for binary, otherbin, func1, func2, files, size in fetchiter(cur):
                entry = dict(package=otherbin,
                             funccomb=function_combination(func1, func2),
                             duplicate=files, savable=size)
                oldentry = binpkgs.get(binary)
                # Keep the existing entry only if it saves at least as much.
                if not (oldentry and oldentry["savable"] >= size):
                    binpkgs[binary] = entry
        params = dict(source=package, packages=binpkgs, urlroot="..")
        return html_response(source_template.render(params))
241
def main():
    """Parse the command line and serve the application on port 8800.

    The -d/--database option selects the sqlite3 database file. Static
    files from the "static" directory of the "dedup" package are served
    under /static. This call does not return: it serves forever.
    """
    optparser = optparse.OptionParser()
    optparser.add_option("-d", "--database", action="store",
                         default="test.sqlite3",
                         help="path to the sqlite3 database file")
    options, _ = optparser.parse_args()
    connection = sqlite3.connect(options.database)
    wsgi_app = SharedDataMiddleware(Application(connection),
                                    {"/static": ("dedup", "static")})
    httpd = make_server("0.0.0.0", 8800, wsgi_app)
    httpd.serve_forever()
249     app = SharedDataMiddleware(app, {"/static": ("dedup", "static")})
250     make_server("0.0.0.0", 8800, app).serve_forever()
251
# Start the server only when run as a script, not when imported.
if __name__ == "__main__":
    main()