diff options
author | Paul Cristian Sarbu <paul.cristian.sarbu@huawei.com> | 2025-02-26 13:12:50 +0100 |
---|---|---|
committer | Paul Cristian Sarbu <paul.cristian.sarbu@huawei.com> | 2025-02-28 14:57:48 +0100 |
commit | 5f247107071db9542ebeed8e822fac42f78491a1 (patch) | |
tree | b4860c05aee8d12552998e192432b993043a9b94 | |
parent | e425563c790b668b9ba6cab45c706287a2b739fe (diff) | |
download | justbuild-5f247107071db9542ebeed8e822fac42f78491a1.tar.gz |
just-lock imports: Parallelize checkouts, subject to GIL
Python's Global Interpreter Lock (GIL) hugely reduces the amount of
parallelism one can achieve. However, as the checkout methods are
I/O bound, the total runtime is improved by running them in
parallel.
-rwxr-xr-x | bin/just-lock.py | 50 |
1 file changed, 42 insertions, 8 deletions
```diff
diff --git a/bin/just-lock.py b/bin/just-lock.py
index 2157f9aa..ee21df3f 100755
--- a/bin/just-lock.py
+++ b/bin/just-lock.py
@@ -16,6 +16,7 @@
 import fcntl
 import hashlib
 import json
+import multiprocessing
 import os
 import shutil
 import stat
@@ -29,6 +30,7 @@ from argparse import ArgumentParser, ArgumentError, RawTextHelpFormatter
 from pathlib import Path
 from typing import Any, Dict, List, NoReturn, Optional, Set, TextIO, Tuple, Union, cast
 from enum import Enum
+from concurrent.futures import ThreadPoolExecutor
 
 # generic JSON type that avoids getter issues; proper use is being enforced by
 # return types of methods and typing vars holding return values of json getters
@@ -2870,18 +2872,49 @@ def lock_config(input_file: str) -> Json:
         fail("Expected field \"keep\" to be a list, but found:\n%r" %
              (json.dumps(keep, indent=2), ))
 
+    # Acquire garbage collector locks
+    git_gc_lock = gc_repo_lock_acquire(is_shared=True)
+    storage_gc_lock = gc_storage_lock_acquire(is_shared=True)
+
+    # Do checkouts asynchronously
+    checkouts: Dict[int, Optional[CheckoutInfo]] = {}
+
+    def run_checkout(*, source: str, key: int, entry: Json) -> None:
+        """Run checkout and updates the outer variable 'checkouts'. Updates are
+        atomic, so no extra locking is needed."""
+        if source == "git":
+            checkouts[key] = git_checkout(entry)
+        elif source == "archive":
+            checkouts[key] = archive_checkout(entry)
+        elif source == "git tree":
+            checkouts[key] = git_tree_checkout(entry)
+
+    report("Check out sources")
+    with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as ts:
+        for index, entry in enumerate(imports):
+            if not isinstance(entry, dict):
+                fail("Expected import entries to be objects, but found:\n%r" %
+                     (json.dumps(entry, indent=2), ))
+            entry = cast(Json, entry)
+
+            source = entry.get("source")
+            if not isinstance(source, str):
+                fail(
+                    "Expected field \"source\" to be a string, but found:\n%r" %
+                    (json.dumps(source, indent=2), ))
+
+            # Add task only for sources that do work
+            if source in ["git", "archive", "git tree"]:
+                ts.submit(run_checkout, source=source, key=index, entry=entry)
+
     # Initialize the core config, which will be extended with imports
     core_config: Json = {}
     if main is not None:
         core_config["main"] = main
     core_config["repositories"] = repositories
 
-    # Acquire garbage collector locks
-    git_gc_lock = gc_repo_lock_acquire(is_shared=True)
-    storage_gc_lock = gc_storage_lock_acquire(is_shared=True)
-
     # Handle imports
-    for entry in imports:
+    for index, entry in enumerate(imports):
         if not isinstance(entry, dict):
             fail("Expected import entries to be objects, but found:\n%r" %
                  (json.dumps(entry, indent=2), ))
@@ -2893,7 +2926,8 @@ def lock_config(input_file: str) -> Json:
                  (json.dumps(source, indent=2), ))
 
         if source == "git":
-            checkout_info = git_checkout(entry)
+            # Get checkout info
+            checkout_info = checkouts[index]
             if checkout_info is not None:
                 core_config["repositories"] = import_from_git(
                     core_config["repositories"], entry, checkout_info)
@@ -2901,12 +2935,12 @@ def lock_config(input_file: str) -> Json:
                 core_config["repositories"] = import_from_file(
                     core_config["repositories"], entry)
         elif source == "archive":
-            checkout_info = archive_checkout(entry)
+            checkout_info = checkouts[index]
             if checkout_info is not None:
                 core_config["repositories"] = import_from_archive(
                     core_config["repositories"], entry, checkout_info)
         elif source == "git tree":
-            checkout_info = git_tree_checkout(entry)
+            checkout_info = checkouts[index]
             if checkout_info is not None:
                 core_config["repositories"] = import_from_git_tree(
                     core_config["repositories"], entry, checkout_info)
```