def compute_pkgdict(rows):
pkgdict = dict()
- for package, filename, size, function in rows:
+ for package, _, filename, size, function in rows:
funcdict = pkgdict.setdefault(package, {})
funcdict.setdefault(function, []).append((size, filename))
return pkgdict
cur = db.cursor()
cur.execute("PRAGMA foreign_keys = ON;")
cur.execute("DELETE FROM sharing;")
+ cur.execute("DELETE FROM duplicate;")
readcur = db.cursor()
readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
for hashvalue, in fetchiter(readcur):
- cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+ cur.execute("SELECT content.package, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
(hashvalue,))
rows = cur.fetchall()
print("processing hash %s with %d entries" % (hashvalue, len(rows)))
pkgdict = compute_pkgdict(rows)
+ cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
+ [(row[1],) for row in rows])
process_pkgdict(cur, pkgdict)
db.commit()
{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%}
{% block content %}
<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> <-> <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1>
-<table border='1'><tr><th colspan="3">{{ details1.package|e }}</th><th colspan="3">{{ details2.package|e }}</th></tr>
-<tr><th>size</th><th>filename</th><th>hash functions</th><th>size</th><th>filename</th><th>hash functions</th></tr>
- {%- for entry in shared -%}
- <tr><td>{{ entry.size1|format_size }}</td><td>{{ entry.filename1 }}</td><td>
- {%- for funccomb, hashvalue in entry.functions.items() %}<a href="../../hash/{{ funccomb[0]|e }}/{{ hashvalue|e }}">{{ funccomb[0]|e }}</a> {% endfor %}</td>
- <td>{{ entry.size2|format_size }}</td><td>{{ entry.filename2 }}</td><td>
- {%- for funccomb, hashvalue in entry.functions.items() %}<a href="../../hash/{{ funccomb[1]|e }}/{{ hashvalue|e }}">{{ funccomb[1]|e }}</a> {% endfor %}</td></tr>
+<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr>
+<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr>
+{%- for entry in shared -%}
+ <tr><td rowspan={{ entry.matches|length }}>{{ entry.size|format_size }}</td><td rowspan={{ entry.matches|length }}>
+ {%- for filename in entry.filenames %}{{ filename|e }}{% if not loop.last %}<br>{% endif %}{% endfor -%}</td><td>
+ {% for filename, match in entry.matches.items() -%}
+ {% if not loop.first %}<tr><td>{% endif -%}
+ {%- for funccomb, hashvalue in match.items() -%}
+ <a href="../../hash/{{ funccomb[0]|e }}/{{ hashvalue|e }}">{{ funccomb[0]|e }}</a>
+ {%- if funccomb[0] != funccomb[1] %} -> <a href="../../hash/{{ funccomb[1]|e }}/{{ hashvalue|e }}">{{ funccomb[1]|e }}</a>{% endif %}
+ {%- if not loop.last %}, {% endif %}
+ {%- endfor %}</td><td>
+ {{- filename|e }}</td></tr>
{%- endfor -%}
+{%- endfor -%}
</table>
{% endblock %}""")
params["shared"] = self.cached_sharedstats(package)
return html_response(package_template.render(params))
- def show_detail(self, package1, package2):
+ def compute_comparison(self, package1, package2):
+ """Compute a sequence of comparison objects ordery by the size of the
+ object in the first package. Each element of the sequence is a dict
+ defining the following keys:
+ * filenames: A set of filenames in package1 all referring to the
+ same object.
+ * size: Size of the object in bytes.
+ * matches: A mapping from filenames in package2 to a mapping from
+ hash function pairs to hash values.
+ """
cur = self.db.cursor()
- if package1 == package2:
- details1 = details2 = self.get_details(package1)
+ cur.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE package = ? AND function = 'sha512' ORDER BY size DESC;",
+ (package1,))
+ cursize = -1
+ files = dict()
+ minmatch = 2 if package1 == package2 else 1
+ for cid, filename, size, hashvalue in fetchiter(cur):
+ if cursize != size:
+ for entry in files.values():
+ if len(entry["matches"]) >= minmatch:
+ yield entry
+ files.clear()
+ cursize = size
- cur.execute("SELECT a.filename, a.size, ha.function, b.filename, b.size, hb.function, ha.hash FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE a.package = ? AND b.package = ? AND a.filename != b.filename ORDER BY a.size DESC, a.filename, b.filename;",
- (package1, package1))
- else:
- details1 = self.get_details(package1)
+ if hashvalue in files:
+ files[hashvalue]["filenames"].add(filename)
+ continue
+
+ entry = dict(filenames=set((filename,)), size=size, matches={})
+ files[hashvalue] = entry
+
+ cur2 = self.db.cursor()
+ cur2.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = ? AND package = ?;",
+ (cid, package2))
+ for func1, hashvalue, func2, filename in fetchiter(cur2):
+ entry["matches"].setdefault(filename, {})[func1, func2] = \
+ hashvalue
+ cur2.close()
+ cur.close()
+
+ for entry in files.values():
+ if len(entry["matches"]) >= minmatch:
+ yield entry
+
+ def show_detail(self, package1, package2):
+ details1 = details2 = self.get_details(package1)
+ if package1 != package2:
details2 = self.get_details(package2)
- cur.execute("SELECT a.filename, a.size, ha.function, b.filename, b.size, hb.function, ha.hash FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE a.package = ? AND b.package = ? ORDER BY a.size DESC, a.filename, b.filename;",
- (package1, package2))
- shared = generate_shared(fetchiter(cur))
- # The cursor will be in use until the template is fully rendered.
+ shared = self.compute_comparison(package1, package2)
params = dict(
details1=details1,
details2=details2,