webapp.py: fuse two sql queries in get_details
[~helmut/debian-dedup.git] / autoimport.py
1 #!/usr/bin/python3
2 """This script takes a directory or a http base url to a mirror and imports all
3 packages contained. It has rather strong assumptions on the working directory.
4 """
5
6 import argparse
7 import errno
8 import multiprocessing
9 import pathlib
10 import sqlite3
11 import subprocess
12 import sys
13 import tempfile
14 import urllib.parse
15 import concurrent.futures
16 from debian.debian_support import version_compare
17
18 from dedup.utils import iterate_packages
19
20 from readyaml import readyaml
21
def process_http(pkgs, url, addhash=True):
    """Merge the amd64 packages listed for the repository at url into pkgs.

    pkgs maps a package name to a dict with the keys "version" and
    "filename"; when addhash is true the key "sha256hash" is stored as well.
    An entry already present in pkgs is left alone only when its version
    compares strictly greater than the listed one; otherwise it is replaced.
    """
    for entry in iterate_packages(url, "amd64"):
        pkgname = entry["Package"]
        if pkgname in pkgs:
            current = pkgs[pkgname]["version"]
            if version_compare(current, entry["Version"]) > 0:
                # The version we already hold is newer, keep it.
                continue
        record = {
            "version": entry["Version"],
            "filename": "%s/%s" % (url, entry["Filename"]),
        }
        if addhash:
            record["sha256hash"] = entry["SHA256"]
        pkgs[pkgname] = record
33
def process_file(pkgs, filename):
    """Record the package file at the path object filename in pkgs.

    The package name and version are derived from the file name, which has
    to look like name_version_arch.deb. The version part is URL-unquoted
    before use. An entry already present in pkgs is kept only when its
    version compares strictly greater than the one of this file.

    Raises ValueError if the file name does not end in .deb or does not
    consist of exactly three underscore-separated parts.
    """
    if filename.suffix != ".deb":
        raise ValueError("filename does not end in .deb")
    pieces = filename.name.split("_")
    if len(pieces) != 3:
        raise ValueError("filename not in form name_version_arch.deb")
    pkgname = pieces[0]
    pkgversion = urllib.parse.unquote(pieces[1])
    if pkgname in pkgs:
        if version_compare(pkgs[pkgname]["version"], pkgversion) > 0:
            # The version we already hold is newer, keep it.
            return
    pkgs[pkgname] = {"version": pkgversion, "filename": str(filename)}
45
def process_dir(pkgs, d):
    """Feed every entry of the directory path object d to process_file.

    Entries rejected by process_file with a ValueError are silently skipped.
    """
    for candidate in d.iterdir():
        try:
            process_file(pkgs, candidate)
        except ValueError:
            # Not a name_version_arch.deb file name; ignore this entry.
            continue
52
def process_pkg(name, pkgdict, outpath):
    """Run importpkg.py on one package and store its output in outpath.

    pkgdict must contain the key "filename"; if it also contains
    "sha256hash", that value is passed to importpkg.py via -H. A filename
    starting with a http, https, ftp or file scheme is handed over as a
    command line argument, anything else is opened and supplied on stdin.
    outpath is a path object that receives the standard output of the
    command. name is only used for the progress message.

    Raises subprocess.CalledProcessError if importpkg.py exits non-zero.
    """
    source = pkgdict["filename"]
    print("importing %s" % source)
    command = [sys.executable, "importpkg.py"]
    if "sha256hash" in pkgdict:
        command += ["-H", pkgdict["sha256hash"]]
    is_url = source.startswith(("http://", "https://", "ftp://", "file://"))
    if is_url:
        command.append(source)
        with outpath.open("w") as sink:
            subprocess.check_call(command, stdout=sink, close_fds=True)
    else:
        with open(source) as stream, outpath.open("w") as sink:
            subprocess.check_call(command, stdin=stream, stdout=sink,
                                  close_fds=True)
    print("preprocessed %s" % name)
69
def main():
    """Command line entry point.

    Collects packages from the given files, directories and repository urls,
    runs process_pkg for each of them on a thread pool and passes every
    successfully produced output file to readyaml together with the
    database connection. With --new, packages whose version is not greater
    than the one recorded in the database are skipped. With --prune,
    packages recorded in the database but absent from the inputs are
    deleted. Output files of failed packages are left in the temporary
    directory, which is reported instead of being removed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--new", action="store_true",
                        help="avoid reimporting same versions")
    parser.add_argument("-p", "--prune", action="store_true",
                        help="prune packages old packages")
    parser.add_argument("-d", "--database", action="store",
                        default="test.sqlite3",
                        help="path to the sqlite3 database file")
    parser.add_argument("--noverify", action="store_true",
                        help="do not verify binary package hashes")
    parser.add_argument("files", nargs='+',
                        help="files or directories or repository urls")
    args = parser.parse_args()

    tmpdir = pathlib.Path(tempfile.mkdtemp(prefix="debian-dedup"))
    db = sqlite3.connect(args.database)
    cur = db.cursor()
    cur.execute("PRAGMA foreign_keys = ON;")
    executor = concurrent.futures.ThreadPoolExecutor(
        multiprocessing.cpu_count())

    # Gather the candidate packages from all inputs.
    pkgs = {}
    for source in args.files:
        print("processing %s" % source)
        if source.startswith(("http://", "https://", "ftp://", "file://")):
            process_http(pkgs, source, not args.noverify)
            continue
        path = pathlib.Path(source)
        if path.is_dir():
            process_dir(pkgs, path)
        else:
            process_file(pkgs, path)

    print("reading database")
    cur.execute("SELECT name, version FROM package;")
    knownpkgvers = {row[0]: row[1] for row in cur.fetchall()}
    # Snapshot of all names seen in the inputs; it is iterated while entries
    # are removed from pkgs and is later used for pruning.
    distpkgs = set(pkgs)
    if args.new:
        for name in distpkgs:
            if name not in knownpkgvers:
                continue
            candidate = pkgs[name]["version"]
            if version_compare(candidate, knownpkgvers[name]) <= 0:
                del pkgs[name]
    knownpkgs = set(knownpkgvers)
    del knownpkgvers

    with executor:
        pending = {
            executor.submit(process_pkg, name, pkg, tmpdir / name): name
            for name, pkg in pkgs.items()
        }

        for future in concurrent.futures.as_completed(pending):
            name = pending[future]
            failure = future.exception()
            if failure:
                print("%s failed to import: %r" % (name, failure))
                continue
            inf = tmpdir / name
            print("sqlimporting %s" % name)
            with inf.open() as inp:
                try:
                    readyaml(db, inp)
                except Exception as exc:
                    print("%s failed sql with exception %r" % (name, exc))
                else:
                    # Only successfully imported outputs are removed.
                    inf.unlink()

    if args.prune:
        delpkgs = knownpkgs - distpkgs
        print("clearing packages %s" % " ".join(delpkgs))
        cur.executemany("DELETE FROM package WHERE name = ?;",
                        ((stale,) for stale in delpkgs))
        # Tables content, dependency and sharing will also be pruned
        # due to ON DELETE CASCADE clauses.
        db.commit()

    try:
        tmpdir.rmdir()
    except OSError as err:
        if err.errno != errno.ENOTEMPTY:
            raise
        leftovers = " ".join(map(str, tmpdir.iterdir()))
        print("keeping temporary directory %s due to failed packages %s" %
              (tmpdir, leftovers))
148
# Run the importer only when this file is executed as a script.
if __name__ == "__main__":
    main()