move templates to dedup package
[~helmut/debian-dedup.git] / autoimport.py
1 #!/usr/bin/python
2 """This script takes a directory or a http base url to a mirror and imports all
3 packages contained. It has rather strong assumptions on the working directory.
4 """
5
6 import gzip
7 import io
8 import multiprocessing
9 import optparse
10 import os
11 import sqlite3
12 import subprocess
13 import urllib
14
15 import concurrent.futures
16 from debian import deb822
17 from debian.debian_support import version_compare
18
19 from readyaml import readyaml
20
def process_http(pkgs, url):
    """Merge the package index of the mirror at *url* into *pkgs*.

    Downloads ``<url>/dists/sid/main/binary-amd64/Packages.gz``, decompresses
    it in memory and walks its paragraphs.  For every paragraph the entry
    ``pkgs[name]`` is set to a dict with the keys ``version``, ``filename``
    (the full download URL) and ``sha256hash``, unless *pkgs* already holds
    a version of that package that compares strictly greater.

    :param pkgs: dict mapping package names to package dicts; updated in place
    :param url: base URL of the mirror, without a trailing slash
    """
    # urlopen lives in different modules on Python 2 and Python 3.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen
    from contextlib import closing

    # closing() is used because the Python 2 response object is not a context
    # manager; either way the connection is released once the body is read.
    index_url = url + "/dists/sid/main/binary-amd64/Packages.gz"
    with closing(urlopen(index_url)) as response:
        compressed = response.read()
    pkglist = gzip.GzipFile(fileobj=io.BytesIO(compressed)).read()
    pkglist = io.BytesIO(pkglist)
    for pkg in deb822.Packages.iter_paragraphs(pkglist):
        name = pkg["Package"]
        # Keep what we have if it is strictly newer; equal versions are
        # overwritten by the entry seen last.
        if name in pkgs and \
                version_compare(pkgs[name]["version"], pkg["Version"]) > 0:
            continue
        pkgs[name] = dict(version=pkg["Version"],
                          filename="%s/%s" % (url, pkg["Filename"]),
                          sha256hash=pkg["SHA256"])
34
def process_file(pkgs, filename):
    """Record the .deb file *filename* in *pkgs* unless a newer one is known.

    Name and version are derived from the base name of *filename*, which must
    have the form ``name_version_arch.deb``.  The version part is
    URL-unquoted (so ``%3a`` becomes ``:``).  The entry ``pkgs[name]`` is set
    to a dict with the keys ``version`` and ``filename`` unless *pkgs*
    already holds a version of that package that compares strictly greater.

    :param pkgs: dict mapping package names to package dicts; updated in place
    :param filename: path of the .deb file
    :raises ValueError: if the base name does not end in ``.deb`` or does not
        consist of exactly three ``_``-separated parts
    """
    # unquote lives in different modules on Python 2 and Python 3.
    try:
        from urllib.parse import unquote
    except ImportError:
        from urllib import unquote

    base = os.path.basename(filename)
    if not base.endswith(".deb"):
        raise ValueError("filename does not end in .deb")
    parts = base.split("_")
    if len(parts) != 3:
        raise ValueError("filename not in form name_version_arch.deb")
    name, version, _ = parts
    version = unquote(version)
    if name in pkgs and version_compare(pkgs[name]["version"], version) > 0:
        return
    pkgs[name] = dict(version=version, filename=filename)
47
def process_dir(pkgs, d):
    """Hand every entry of the directory *d* to process_file.

    Entries for which process_file raises ValueError are skipped silently;
    everything else it records ends up in *pkgs*.
    """
    candidates = (os.path.join(d, entry) for entry in os.listdir(d))
    for path in candidates:
        try:
            process_file(pkgs, path)
        except ValueError:
            continue
54
def process_pkg(name, pkgdict):
    """Run ``python importpkg.py`` on one package, writing to ``tmp/<name>``.

    The package is fed to importpkg.py on stdin.  If the package filename
    starts with ``http://`` it is streamed from ``curl -s``; otherwise it is
    read from the local file of that name.  When *pkgdict* has a
    ``sha256hash`` key, its value is passed to importpkg.py via ``-H``.

    :param name: package name; also the name of the output file below ``tmp``
    :param pkgdict: dict with a ``filename`` key and optionally ``sha256hash``
    :raises ValueError: if importpkg.py or curl exits non-zero (http case)
    :raises subprocess.CalledProcessError: if importpkg.py exits non-zero
        (local file case)
    """
    filename = pkgdict["filename"]
    print("importing %s" % filename)
    importcmd = ["python", "importpkg.py"]
    if "sha256hash" in pkgdict:
        importcmd.extend(["-H", pkgdict["sha256hash"]])
    if filename.startswith("http://"):
        with open(os.path.join("tmp", name), "w") as outp:
            dl = subprocess.Popen(["curl", "-s", filename],
                                  stdout=subprocess.PIPE, close_fds=True)
            try:
                imp = subprocess.Popen(importcmd, stdin=dl.stdout,
                                       stdout=outp, close_fds=True)
            except Exception:
                # Do not leave curl running (or as a zombie) when the
                # importer could not even be started.
                dl.kill()
                dl.stdout.close()
                dl.wait()
                raise
            # Drop our copy of the pipe's read end so that curl receives
            # SIGPIPE instead of blocking forever if importpkg exits early.
            dl.stdout.close()
            # Reap both children before reporting, so that a failing
            # importer does not leave an unwaited curl behind.
            imp_status = imp.wait()
            dl_status = dl.wait()
            if imp_status:
                raise ValueError("importpkg failed")
            if dl_status:
                raise ValueError("curl failed")
    else:
        # The file is only handed to the child as its stdin descriptor.
        with open(filename, "rb") as inp:
            with open(os.path.join("tmp", name), "w") as outp:
                subprocess.check_call(importcmd, stdin=inp, stdout=outp,
                                      close_fds=True)
    print("preprocessed %s" % name)
77
def main():
    """Import the packages named on the command line into ``test.sqlite3``.

    Every positional argument is either a mirror base URL starting with
    ``http://``, a directory of .deb files or a single .deb file.  The
    packages found are preprocessed in parallel by process_pkg into files
    below ``tmp`` and then loaded one at a time into the database with
    readyaml.  A preprocessed file is removed only after it was loaded
    successfully.

    Options:
      -n/--new    skip packages whose version is not newer than the version
                  already stored in the database
      -p/--prune  delete packages from the database that were not found in
                  any of the given sources
    """
    parser = optparse.OptionParser()
    parser.add_option("-n", "--new", action="store_true",
                      help="avoid reimporting same versions")
    parser.add_option("-p", "--prune", action="store_true",
                      help="prune old packages")
    options, args = parser.parse_args()
    subprocess.check_call(["mkdir", "-p", "tmp"])
    db = sqlite3.connect("test.sqlite3")
    try:
        cur = db.cursor()
        cur.execute("PRAGMA foreign_keys = ON;")
        pkgs = {}
        for d in args:
            print("processing %s" % d)
            if d.startswith("http://"):
                process_http(pkgs, d)
            elif os.path.isdir(d):
                process_dir(pkgs, d)
            else:
                process_file(pkgs, d)

        print("reading database")
        cur.execute("SELECT name, version FROM package;")
        knownversions = dict((row[0], row[1]) for row in cur.fetchall())
        # Snapshot the names before --new filtering: pruning must consider
        # everything the sources offer, not just what gets imported.
        distpkgs = set(pkgs.keys())
        if options.new:
            for name in distpkgs:
                if name in knownversions and \
                        version_compare(pkgs[name]["version"],
                                        knownversions[name]) <= 0:
                    del pkgs[name]
        knownpkgs = set(knownversions)

        workers = multiprocessing.cpu_count()
        with concurrent.futures.ThreadPoolExecutor(workers) as e:
            fs = {}
            for name, pkg in pkgs.items():
                fs[e.submit(process_pkg, name, pkg)] = name

            # Preprocessing runs in the pool; the database is only touched
            # from this thread, as results become available.
            for f in concurrent.futures.as_completed(fs.keys()):
                name = fs[f]
                error = f.exception()
                if error:
                    print("%s failed to import: %r" % (name, error))
                    continue
                inf = os.path.join("tmp", name)
                print("sqlimporting %s" % name)
                with open(inf) as inp:
                    try:
                        # NOTE(review): readyaml is defined elsewhere;
                        # whether it rolls back on failure is not visible
                        # from here -- confirm.
                        readyaml(db, inp)
                    except Exception as exc:
                        print("%s failed sql with exception %r" %
                              (name, exc))
                    else:
                        os.unlink(inf)

        if options.prune:
            delpkgs = knownpkgs - distpkgs
            print("clearing packages %s" % " ".join(sorted(delpkgs)))
            cur.executemany("DELETE FROM package WHERE name = ?;",
                            ((pkg,) for pkg in delpkgs))
            # Tables content, dependency and sharing will also be pruned
            # due to ON DELETE CASCADE clauses.
            db.commit()
    finally:
        db.close()
139
# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()