summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Paul Cristian Sarbu <paul.cristian.sarbu@huawei.com>, 2025-02-26 13:12:50 +0100
committer: Paul Cristian Sarbu <paul.cristian.sarbu@huawei.com>, 2025-02-28 14:57:48 +0100
commit: 5f247107071db9542ebeed8e822fac42f78491a1 (patch)
tree: b4860c05aee8d12552998e192432b993043a9b94
parent: e425563c790b668b9ba6cab45c706287a2b739fe (diff)
download: justbuild-5f247107071db9542ebeed8e822fac42f78491a1.tar.gz
just-lock imports: Parallelize checkouts, subject to GIL
Python's Global Interpreter Lock (GIL) hugely reduces the amount of parallelism one can achieve. However, as the checkout methods are I/O bound, the total runtime is improved by running them in parallel.
-rwxr-xr-x bin/just-lock.py 50
1 file changed, 42 insertions, 8 deletions
diff --git a/bin/just-lock.py b/bin/just-lock.py
index 2157f9aa..ee21df3f 100755
--- a/bin/just-lock.py
+++ b/bin/just-lock.py
@@ -16,6 +16,7 @@
import fcntl
import hashlib
import json
+import multiprocessing
import os
import shutil
import stat
@@ -29,6 +30,7 @@ from argparse import ArgumentParser, ArgumentError, RawTextHelpFormatter
from pathlib import Path
from typing import Any, Dict, List, NoReturn, Optional, Set, TextIO, Tuple, Union, cast
from enum import Enum
+from concurrent.futures import ThreadPoolExecutor
# generic JSON type that avoids getter issues; proper use is being enforced by
# return types of methods and typing vars holding return values of json getters
@@ -2870,18 +2872,49 @@ def lock_config(input_file: str) -> Json:
fail("Expected field \"keep\" to be a list, but found:\n%r" %
(json.dumps(keep, indent=2), ))
+ # Acquire garbage collector locks
+ git_gc_lock = gc_repo_lock_acquire(is_shared=True)
+ storage_gc_lock = gc_storage_lock_acquire(is_shared=True)
+
+ # Do checkouts asynchronously
+ checkouts: Dict[int, Optional[CheckoutInfo]] = {}
+
+ def run_checkout(*, source: str, key: int, entry: Json) -> None:
+ """Run checkout and updates the outer variable 'checkouts'. Updates are
+ atomic, so no extra locking is needed."""
+ if source == "git":
+ checkouts[key] = git_checkout(entry)
+ elif source == "archive":
+ checkouts[key] = archive_checkout(entry)
+ elif source == "git tree":
+ checkouts[key] = git_tree_checkout(entry)
+
+ report("Check out sources")
+ with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as ts:
+ for index, entry in enumerate(imports):
+ if not isinstance(entry, dict):
+ fail("Expected import entries to be objects, but found:\n%r" %
+ (json.dumps(entry, indent=2), ))
+ entry = cast(Json, entry)
+
+ source = entry.get("source")
+ if not isinstance(source, str):
+ fail(
+ "Expected field \"source\" to be a string, but found:\n%r" %
+ (json.dumps(source, indent=2), ))
+
+ # Add task only for sources that do work
+ if source in ["git", "archive", "git tree"]:
+ ts.submit(run_checkout, source=source, key=index, entry=entry)
+
# Initialize the core config, which will be extended with imports
core_config: Json = {}
if main is not None:
core_config["main"] = main
core_config["repositories"] = repositories
- # Acquire garbage collector locks
- git_gc_lock = gc_repo_lock_acquire(is_shared=True)
- storage_gc_lock = gc_storage_lock_acquire(is_shared=True)
-
# Handle imports
- for entry in imports:
+ for index, entry in enumerate(imports):
if not isinstance(entry, dict):
fail("Expected import entries to be objects, but found:\n%r" %
(json.dumps(entry, indent=2), ))
@@ -2893,7 +2926,8 @@ def lock_config(input_file: str) -> Json:
(json.dumps(source, indent=2), ))
if source == "git":
- checkout_info = git_checkout(entry)
+ # Get checkout info
+ checkout_info = checkouts[index]
if checkout_info is not None:
core_config["repositories"] = import_from_git(
core_config["repositories"], entry, checkout_info)
@@ -2901,12 +2935,12 @@ def lock_config(input_file: str) -> Json:
core_config["repositories"] = import_from_file(
core_config["repositories"], entry)
elif source == "archive":
- checkout_info = archive_checkout(entry)
+ checkout_info = checkouts[index]
if checkout_info is not None:
core_config["repositories"] = import_from_archive(
core_config["repositories"], entry, checkout_info)
elif source == "git tree":
- checkout_info = git_tree_checkout(entry)
+ checkout_info = checkouts[index]
if checkout_info is not None:
core_config["repositories"] = import_from_git_tree(
core_config["repositories"], entry, checkout_info)