autoimport: add option to skip hash checking
[~helmut/debian-dedup.git] / autoimport.py
1 #!/usr/bin/python
2 """This script takes a directory or an http base url to a mirror and imports all
3 packages contained. It has rather strong assumptions on the working directory.
4 """
5
6 import argparse
7 import errno
8 import multiprocessing
9 import os
10 import sqlite3
11 import subprocess
12 import sys
13 import tempfile
14 try:
15     from urllib.parse import unquote
16 except ImportError:
17     from urllib import unquote
18 try:
19     from urllib.request import urlopen
20 except ImportError:
21     from urllib import urlopen
22
23 import concurrent.futures
24 from debian import deb822
25 from debian.debian_support import version_compare
26
27 from dedup.compression import decompress
28
29 from readyaml import readyaml
30
def process_http(pkgs, url, addhash=True):
    """Record the packages listed by a mirror's Packages index in pkgs.

    Fetches dists/sid/main/binary-amd64/Packages.gz below url and walks its
    paragraphs. A package is skipped when pkgs already holds a strictly newer
    version of it; otherwise its entry in pkgs is (re)placed.

    pkgs: dict mapping a package name to a dict with the keys "version",
        "filename" and optionally "sha256hash". It is updated in place.
    url: base url of the mirror.
    addhash: when true, the SHA256 field of the index entry is stored under
        "sha256hash", which is the key process_pkg looks for in order to
        pass the expected hash to importpkg.py.
    """
    response = urlopen(url + "/dists/sid/main/binary-amd64/Packages.gz")
    try:
        pkglist = decompress(response, ".gz")
        pkglist = deb822.Packages.iter_paragraphs(pkglist)
        for pkg in pkglist:
            name = pkg["Package"]
            if name in pkgs and \
                    version_compare(pkgs[name]["version"], pkg["Version"]) > 0:
                continue
            inst = dict(version=pkg["Version"],
                        filename="%s/%s" % (url, pkg["Filename"]))
            if addhash:
                # Must match the key read by process_pkg, otherwise the hash
                # is silently dropped and never verified.
                inst["sha256hash"] = pkg["SHA256"]
            pkgs[name] = inst
    finally:
        # The index is fully consumed (or we failed); release the connection.
        response.close()
45
def process_file(pkgs, filename):
    """Register a single .deb file in pkgs.

    The basename of filename has to look like name_version_arch.deb; the
    version part is url-unquoted. Nothing is stored when pkgs already holds
    a strictly newer version of the package.

    Raises ValueError when the basename does not have the expected form.
    """
    basename = os.path.basename(filename)
    if not basename.endswith(".deb"):
        raise ValueError("filename does not end in .deb")
    fields = basename.split("_")
    if len(fields) != 3:
        raise ValueError("filename not in form name_version_arch.deb")
    name = fields[0]
    version = unquote(fields[1])
    superseded = (name in pkgs and
                  version_compare(pkgs[name]["version"], version) > 0)
    if not superseded:
        pkgs[name] = {"version": version, "filename": filename}
58
def process_dir(pkgs, d):
    """Hand every entry of the directory d to process_file.

    Entries for which process_file raises ValueError are skipped silently,
    so files that are not named like a .deb package are simply ignored.
    """
    for entry in os.listdir(d):
        candidate = os.path.join(d, entry)
        try:
            process_file(pkgs, candidate)
        except ValueError:
            # Not a package file; move on to the next entry.
            continue
65
def process_pkg(name, pkgdict, outpath):
    """Run importpkg.py on one package and write its output to outpath.

    name: package name, only used for progress messages.
    pkgdict: dict with the key "filename" (a local path or a http, https,
        ftp or file url) and optionally "sha256hash", which is passed to
        importpkg.py via -H.
    outpath: path of the file receiving the standard output of importpkg.py.

    Remote packages are streamed from curl into importpkg.py. Raises
    ValueError when importpkg.py or curl fails for a remote package and
    subprocess.CalledProcessError when importpkg.py fails for a local file.
    """
    filename = pkgdict["filename"]
    print("importing %s" % filename)
    importcmd = [sys.executable, "importpkg.py"]
    if "sha256hash" in pkgdict:
        importcmd.extend(["-H", pkgdict["sha256hash"]])
    if filename.startswith(("http://", "https://", "ftp://", "file://")):
        with open(outpath, "w") as outp:
            dl = subprocess.Popen(["curl", "-s", filename],
                                  stdout=subprocess.PIPE, close_fds=True)
            try:
                imp = subprocess.Popen(importcmd, stdin=dl.stdout,
                                       stdout=outp, close_fds=True)
            finally:
                # Drop our copy of the pipe's read end so that curl receives
                # SIGPIPE instead of blocking forever if the importer exits
                # before consuming all of the download.
                dl.stdout.close()
            # Reap both children before reporting, so a failing importer
            # does not leave curl behind.
            impstatus = imp.wait()
            dlstatus = dl.wait()
        if impstatus:
            raise ValueError("importpkg failed")
        if dlstatus:
            raise ValueError("curl failed")
    else:
        # A .deb is binary data; only its file descriptor is handed on.
        with open(filename, "rb") as inp, open(outpath, "w") as outp:
            subprocess.check_call(importcmd, stdin=inp, stdout=outp,
                                  close_fds=True)
    print("preprocessed %s" % name)
88
def main():
    """Import the packages found in the given sources into the database.

    The command line names files, directories or repository urls. For each
    package name the newest version seen is preprocessed by process_pkg in
    a thread pool, writing into a fresh temporary directory, and every
    successful result is passed to readyaml together with the sqlite3
    connection. With --new, packages that the database already knows in the
    same or a newer version are not imported again. With --prune, packages
    that are in the database but in none of the sources are deleted.
    Output files of packages that failed are kept in the temporary
    directory; otherwise the directory is removed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--new", action="store_true",
                        help="avoid reimporting same versions")
    parser.add_argument("-p", "--prune", action="store_true",
                        help="prune old packages")
    parser.add_argument("-d", "--database", action="store",
                        default="test.sqlite3",
                        help="path to the sqlite3 database file")
    parser.add_argument("--noverify", action="store_true",
                        help="do not verify binary package hashes")
    parser.add_argument("files", nargs='+',
                        help="files or directories or repository urls")
    args = parser.parse_args()
    tmpdir = tempfile.mkdtemp(prefix="debian-dedup")
    db = sqlite3.connect(args.database)
    try:
        cur = db.cursor()
        cur.execute("PRAGMA foreign_keys = ON;")
        pkgs = {}
        for d in args.files:
            print("processing %s" % d)
            if d.startswith(("http://", "https://", "ftp://", "file://")):
                process_http(pkgs, d, not args.noverify)
            elif os.path.isdir(d):
                process_dir(pkgs, d)
            else:
                process_file(pkgs, d)

        print("reading database")
        cur.execute("SELECT name, version FROM package;")
        knownpkgs = dict((row[0], row[1]) for row in cur.fetchall())
        # Snapshot of all names seen in the sources. It is taken before the
        # --new filtering so that pruning does not remove up-to-date packages.
        distpkgs = set(pkgs)
        if args.new:
            # Iterating over the snapshot makes deleting from pkgs safe.
            for name in distpkgs:
                if name in knownpkgs and version_compare(
                        pkgs[name]["version"], knownpkgs[name]) <= 0:
                    del pkgs[name]

        workers = multiprocessing.cpu_count()
        with concurrent.futures.ThreadPoolExecutor(workers) as executor:
            futures = {}
            for name, pkg in pkgs.items():
                outpath = os.path.join(tmpdir, name)
                futures[executor.submit(process_pkg, name, pkg,
                                        outpath)] = name

            # The database is only touched from this thread, as results
            # become available.
            for future in concurrent.futures.as_completed(futures):
                name = futures[future]
                failure = future.exception()
                if failure:
                    print("%s failed to import: %r" % (name, failure))
                    continue
                inf = os.path.join(tmpdir, name)
                print("sqlimporting %s" % name)
                with open(inf) as inp:
                    try:
                        readyaml(db, inp)
                    except Exception as exc:
                        print("%s failed sql with exception %r" %
                              (name, exc))
                    else:
                        os.unlink(inf)

        if args.prune:
            delpkgs = set(knownpkgs) - distpkgs
            print("clearing packages %s" % " ".join(delpkgs))
            cur.executemany("DELETE FROM package WHERE name = ?;",
                            ((pkg,) for pkg in delpkgs))
            # Tables content, dependency and sharing will also be pruned
            # due to ON DELETE CASCADE clauses.
            db.commit()
    finally:
        # Release the database handle even when an import step blows up.
        db.close()
    try:
        os.rmdir(tmpdir)
    except OSError as err:
        if err.errno != errno.ENOTEMPTY:
            raise
        print("keeping temporary directory %s due to failed packages %s" %
              (tmpdir, " ".join(os.listdir(tmpdir))))
165
# Only run the import when executed as a script, not when imported.
if __name__ == "__main__":
    main()