autoimport.py: convert to use pathlib
[~helmut/debian-dedup.git] / autoimport.py
1 #!/usr/bin/python3
2 """This script takes a directory or an HTTP base URL of a mirror and imports
3 all packages it contains. It makes rather strong assumptions about the working
3 directory.
4 """
5
6 import argparse
7 import contextlib
8 import errno
9 import multiprocessing
10 import pathlib
11 import sqlite3
12 import subprocess
13 import sys
14 import tempfile
15 import urllib.parse
16 import concurrent.futures
17 from debian import deb822
18 from debian.debian_support import version_compare
19
20 from dedup.utils import open_compressed_mirror_url
21
22 from readyaml import readyaml
23
def process_http(pkgs, url, addhash=True):
    """Merge the amd64 package index of the mirror at url into pkgs.

    The index is read from url + "/dists/sid/main/binary-amd64/Packages".
    For every paragraph an entry with the keys "version" and "filename"
    (and "sha256hash" when addhash is true) is stored in pkgs under the
    package name, unless pkgs already holds a strictly newer version of
    that package.
    """
    listurl = url + "/dists/sid/main/binary-amd64/Packages"
    with contextlib.closing(open_compressed_mirror_url(listurl)) as pkglist:
        for paragraph in deb822.Packages.iter_paragraphs(pkglist):
            name = paragraph["Package"]
            candidate = paragraph["Version"]
            # An already recorded, strictly newer version wins; equal or
            # older entries are overwritten by this mirror's entry.
            superseded = name in pkgs and \
                version_compare(pkgs[name]["version"], candidate) > 0
            if not superseded:
                entry = {
                    "version": candidate,
                    "filename": "%s/%s" % (url, paragraph["Filename"]),
                }
                if addhash:
                    entry["sha256hash"] = paragraph["SHA256"]
                pkgs[name] = entry
37
def process_file(pkgs, filename):
    """Record the local package file filename (a path object) in pkgs.

    The file name must look like name_version_arch.deb; the version part is
    URL-unquoted before use.  The entry is stored under the package name
    with the keys "version" and "filename" unless pkgs already holds a
    strictly newer version of that package.

    Raises ValueError if the name does not end in .deb or does not consist
    of exactly three "_"-separated parts.
    """
    if filename.suffix != ".deb":
        raise ValueError("filename does not end in .deb")
    try:
        name, quoted_version, _ = filename.name.split("_")
    except ValueError:
        # Wrong number of "_"-separated fields.
        raise ValueError(
            "filename not in form name_version_arch.deb") from None
    version = urllib.parse.unquote(quoted_version)
    if name not in pkgs or \
            version_compare(pkgs[name]["version"], version) <= 0:
        pkgs[name] = {"version": version, "filename": str(filename)}
49
def process_dir(pkgs, d):
    """Record every package file found directly inside directory d in pkgs.

    Each directory entry is handed to process_file; entries it rejects with
    ValueError are skipped silently.
    """
    for candidate in d.iterdir():
        with contextlib.suppress(ValueError):
            process_file(pkgs, candidate)
56
def process_pkg(name, pkgdict, outpath):
    """Run importpkg.py on one package and write its output to outpath.

    name is only used for the progress messages.  pkgdict must contain a
    "filename" entry and may contain a "sha256hash" entry, which is passed
    to importpkg.py via -H.  A filename that is a URL is given to
    importpkg.py as an argument; any other filename is opened and connected
    to the standard input of importpkg.py.

    Raises subprocess.CalledProcessError if importpkg.py exits non-zero.
    """
    source = pkgdict["filename"]
    print("importing %s" % source)
    command = [sys.executable, "importpkg.py"]
    if "sha256hash" in pkgdict:
        command += ["-H", pkgdict["sha256hash"]]
    remote = source.startswith(("http://", "https://", "ftp://", "file://"))
    if remote:
        command.append(source)
    with contextlib.ExitStack() as stack:
        # The input is opened before the output, so a missing package file
        # fails before the output file is created.
        stdin = None if remote else stack.enter_context(open(source))
        stdout = stack.enter_context(outpath.open("w"))
        subprocess.check_call(command, stdin=stdin, stdout=stdout,
                              close_fds=True)
    print("preprocessed %s" % name)
73
def _collect_packages(sources, addhash):
    """Return a dict of the packages offered by sources, keyed by name.

    Each source is either a repository URL (handled by process_http) or a
    local directory or file (handled by process_dir / process_file).
    """
    pkgs = {}
    for source in sources:
        print("processing %s" % source)
        if source.startswith(("http://", "https://", "ftp://", "file://")):
            process_http(pkgs, source, addhash)
        else:
            path = pathlib.Path(source)
            if path.is_dir():
                process_dir(pkgs, path)
            else:
                process_file(pkgs, path)
    return pkgs


def _import_packages(db, pkgs, tmpdir):
    """Preprocess all pkgs in worker threads and load the results into db.

    The output of process_pkg for a package is stored in tmpdir / name and
    removed again once readyaml has loaded it.  Files of packages that
    failed in either step are left in tmpdir.
    """
    workers = multiprocessing.cpu_count()
    with concurrent.futures.ThreadPoolExecutor(workers) as executor:
        futures = {
            executor.submit(process_pkg, name, pkg, tmpdir / name): name
            for name, pkg in pkgs.items()
        }
        # The database is only touched from this thread, as results arrive.
        for future in concurrent.futures.as_completed(futures):
            name = futures[future]
            error = future.exception()
            if error is not None:
                print("%s failed to import: %r" % (name, error))
                continue
            inf = tmpdir / name
            print("sqlimporting %s" % name)
            with inf.open() as inp:
                try:
                    readyaml(db, inp)
                except Exception as exc:
                    # One broken package must not abort the whole run; its
                    # file stays in tmpdir for inspection.
                    print("%s failed sql with exception %r" % (name, exc))
                else:
                    inf.unlink()


def _remove_tmpdir(tmpdir):
    """Remove tmpdir if it is empty, otherwise report what was left in it."""
    try:
        tmpdir.rmdir()
    except OSError as err:
        if err.errno != errno.ENOTEMPTY:
            raise
        print("keeping temporary directory %s due to failed packages %s" %
              (tmpdir, " ".join(map(str, tmpdir.iterdir()))))


def main():
    """Import the packages named on the command line into the database.

    Steps: parse the arguments, collect the candidate packages from all
    given files, directories and repository URLs, optionally drop those the
    database already knows in the same or a newer version (--new), import
    the rest, and optionally delete database packages that no source offers
    any more (--prune).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--new", action="store_true",
                        help="avoid reimporting same versions")
    parser.add_argument("-p", "--prune", action="store_true",
                        help="prune old packages")
    parser.add_argument("-d", "--database", action="store",
                        default="test.sqlite3",
                        help="path to the sqlite3 database file")
    parser.add_argument("--noverify", action="store_true",
                        help="do not verify binary package hashes")
    parser.add_argument("files", nargs='+',
                        help="files or directories or repository urls")
    args = parser.parse_args()
    tmpdir = pathlib.Path(tempfile.mkdtemp(prefix="debian-dedup"))
    try:
        # closing() releases the connection on every exit path.
        with contextlib.closing(sqlite3.connect(args.database)) as db:
            cur = db.cursor()
            cur.execute("PRAGMA foreign_keys = ON;")
            pkgs = _collect_packages(args.files, not args.noverify)

            print("reading database")
            cur.execute("SELECT name, version FROM package;")
            knownpkgs = dict(cur.fetchall())
            # Snapshot of everything the sources offer; needed for pruning
            # and safe to iterate while entries are deleted from pkgs.
            distpkgs = set(pkgs)
            if args.new:
                for name in distpkgs:
                    if name in knownpkgs and version_compare(
                            pkgs[name]["version"], knownpkgs[name]) <= 0:
                        del pkgs[name]

            _import_packages(db, pkgs, tmpdir)

            if args.prune:
                delpkgs = knownpkgs.keys() - distpkgs
                print("clearing packages %s" % " ".join(delpkgs))
                cur.executemany("DELETE FROM package WHERE name = ?;",
                                ((pkg,) for pkg in delpkgs))
                # Tables content, dependency and sharing will also be pruned
                # due to ON DELETE CASCADE clauses.
                db.commit()
    finally:
        # Also runs after an error so an empty temporary directory is not
        # leaked; a non-empty one is kept and reported.
        _remove_tmpdir(tmpdir)
151
# Run the importer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()