diff options
author | Paul Cristian Sarbu <paul.cristian.sarbu@huawei.com> | 2024-08-22 13:00:40 +0200 |
---|---|---|
committer | Paul Cristian Sarbu <paul.cristian.sarbu@huawei.com> | 2024-08-26 11:32:40 +0200 |
commit | 426f01ee6be96f3ad7298c09adbe5297e3537f1d (patch) | |
tree | eeb2c59a36b74eec6fec78ea511366f68ac6d0d4 | |
parent | c3e6a6f527f590dcbb6411f4e778b10bd5a9d74e (diff) | |
download | justbuild-426f01ee6be96f3ad7298c09adbe5297e3537f1d.tar.gz |
just-mr.py: Accept all tree entries for bootstrapped just
The Python script used for the first stage of bootstrapping just
uses the Git index to create trees from directories (be they file
repositories, unpacked archives, or distfiles), and therefore shares
the limitations of Git itself when committing trees that contain
entries ignored by Git, such as empty directories, the .git folder,
.gitignore files and the files listed in them, or other entries with
Git-specific magic names.
This commit updates the Python script to replace the use of the
Git index for importing directories with directly writing the needed
blobs and trees to the object database, and then committing the
resulting top-level tree explicitly.
While there, fix a typing issue caused by our relaxed approach of using
os.environ to set the subprocess env when running commands. As on
POSIX its type is _Environ, not a simple Dict[str, str], merge the Git
envars via dictionary unpacking, i.e., {**os.environ, **GIT_NOBODY_ENV}
(Python v3.5+). The issue was initially flagged by pyright.
-rwxr-xr-x | bin/just-mr.py | 171 |
1 files changed, 145 insertions, 26 deletions
diff --git a/bin/just-mr.py b/bin/just-mr.py index 39d9d7c2..21413546 100755 --- a/bin/just-mr.py +++ b/bin/just-mr.py @@ -21,11 +21,14 @@ import subprocess import sys import tempfile import time +import zlib from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast from argparse import ArgumentParser from pathlib import Path +from enum import Enum + # generic JSON type that avoids getter issues; proper use is being enforced by # return types of methods and typing vars holding return values of json getters Json = Dict[str, Any] @@ -126,6 +129,13 @@ DEFAULT_CONFIG_LOCATIONS: List[Json] = [{ }] +class ObjectType(Enum): + FILE = 1 + EXEC = 2 + LINK = 3 + DIR = 4 + + def log(*args: str, **kwargs: Any) -> None: print(*args, file=sys.stderr, **kwargs) @@ -212,12 +222,13 @@ def git_keep(commit: str, *, upstream: Optional[str]) -> None: # for those, we assume the referenced commit is kept by # some branch anyway return + git_env = {**os.environ, **GIT_NOBODY_ENV} run_cmd([ "git", "tag", "-f", "-m", "Keep referenced tree alive", "keep-%s" % (commit, ), commit ], cwd=git_root(upstream=upstream), - env=dict(os.environ, **GIT_NOBODY_ENV), + env=git_env, attempts=3) @@ -327,18 +338,19 @@ def update_git(desc: Json) -> None: desc["commit"] = lsremote.decode('utf-8').split('\t')[0] -def git_hash(content: bytes) -> str: - header = "blob {}\0".format(len(content)).encode('utf-8') +def git_hash(content: bytes, type: str = "blob") -> Tuple[str, bytes]: + # Returns the git hash of the object, as well as the header to be stored + header = "{} {}\0".format(type, len(content)).encode('utf-8') h = hashlib.sha1() h.update(header) h.update(content) - return h.hexdigest() + return h.hexdigest(), header def add_to_cas(data: Union[str, bytes]) -> str: if isinstance(data, str): data = data.encode('utf-8') - h = git_hash(data) + h, _ = git_hash(data) cas_root = os.path.join( g_ROOT, f"protocol-dependent/generation-0/git-sha1/casf/{h[0:2]}") basename = h[2:] @@ -393,8 +405,8 @@ def 
archive_tmp_checkout_dir(content: str, repo_type: str) -> str: def archive_tree_id_file(content: str, repo_type: str) -> str: - return os.path.join(g_ROOT, "repositories/generation-0/tree-map", - repo_type, content) + return os.path.join(g_ROOT, "repositories/generation-0/tree-map", repo_type, + content) def get_distfile(desc: Json) -> str: @@ -484,33 +496,140 @@ def archive_checkout(desc: Json, repo_type: str = "archive") -> List[str]: ] +def import_tmp_dir(content: str) -> str: + return os.path.join(g_ROOT, "tmp-workspaces", "import", + "%d-%s" % (os.getpid(), content)) + + +def type_to_perm(obj_type: ObjectType) -> str: + if obj_type == ObjectType.DIR: + return "40000" + elif obj_type == ObjectType.LINK: + return "120000" + elif obj_type == ObjectType.EXEC: + return "100755" + else: # obj_type == ObjectType.FILE + return "100644" + + +def write_blob_to_repo(repo_root: str, data: bytes) -> bytes: + # Get blob hash and header to be stored + h, header = git_hash(data, type="blob") + + # Write repository object + obj_dir = "{}/.git/objects/{}".format(repo_root, h[0:2]) + obj_file = "{}/{}".format(obj_dir, h[2:]) + os.makedirs(obj_dir, exist_ok=True) + with open(obj_file, "wb") as f: + f.write(zlib.compress(header + data)) + + return bytes.fromhex(h) # raw id + + +def write_tree_to_repo(repo_root: str, + entries: Dict[str, Tuple[bytes, ObjectType]]) -> bytes: + # Tree entries have as key their filename and as value a tuple of raw id and + # object type. They must be sorted by filename. 
+ tree_content: bytes = b"" + for fname, entry in sorted(entries.items()): + if entry[1] == ObjectType.DIR: + fname = fname[:-1] # remove trailing '/' + tree_content += "{} {}\0".format(type_to_perm(entry[1]), + fname).encode('utf-8') + entry[0] + + # Get tree hash and header to be stored + h, header = git_hash(tree_content, type="tree") + + # Write repository object + obj_dir = "{}/.git/objects/{}".format(repo_root, h[0:2]) + obj_file = "{}/{}".format(obj_dir, h[2:]) + os.makedirs(obj_dir, exist_ok=True) + with open(obj_file, "wb") as f: + f.write(zlib.compress(header + tree_content)) + + return bytes.fromhex(h) # raw id + + +def path_to_type(fpath: str) -> ObjectType: + if os.path.islink(fpath): + return ObjectType.LINK + elif os.path.isdir(fpath): + return ObjectType.DIR + else: + if os.access(fpath, os.X_OK): + return ObjectType.EXEC + else: + return ObjectType.FILE + + +def get_tree_raw_id(source_dir: str, repo_root: str) -> bytes: + # Writes the content of the directory recursively to the repository and + # returns its sha1 hash and its raw bytes representation + entries: Dict[str, Tuple[bytes, ObjectType]] = {} + for fname in os.listdir(source_dir): + fpath = source_dir + "/" + fname + obj_type = path_to_type(fpath) + raw_h: bytes = b"" + if obj_type == ObjectType.DIR: + raw_h = get_tree_raw_id(fpath, repo_root) + fname = fname + '/' # trailing '/' added for correct sorting + elif obj_type == ObjectType.LINK: + data = os.readlink(fpath).encode('utf-8') + raw_h = write_blob_to_repo(repo_root, data) + else: + with open(fpath, "rb") as f: + data = f.read() + raw_h = write_blob_to_repo(repo_root, data) + # Add entry to map + entries[fname] = (raw_h, obj_type) + + return write_tree_to_repo(repo_root, entries) + + def import_to_git(target: str, repo_type: str, content_id: str) -> str: + # In order to import content that might otherwise be ignored by Git, such + # as empty directories or magic-named files and folders (e.g., .git, + # .gitignore), add entries 
manually to the repository, which should be in + # its own separate location + repo_tmp_dir = import_tmp_dir(content_id) + if os.path.exists(repo_tmp_dir): + try_rmtree(repo_tmp_dir) + os.makedirs(repo_tmp_dir) + + # Initialize repo to have access to its storage + git_env = {**os.environ, **GIT_NOBODY_ENV} run_cmd( ["git", "init"], - cwd=target, - env=dict(os.environ, **GIT_NOBODY_ENV), - ) - run_cmd( - ["git", "add", "-f", "."], - cwd=target, - env=dict(os.environ, **GIT_NOBODY_ENV), + cwd=repo_tmp_dir, + env=git_env, ) + + # Get tree id of added directory + tree_id: str = get_tree_raw_id(target, repo_tmp_dir).hex() + + # Commit the tree + commit: str = subprocess.run( + [ + "git", "commit-tree", tree_id, "-m", + "Content of %s %r" % (repo_type, content_id) + ], + stdout=subprocess.PIPE, + cwd=repo_tmp_dir, + env=git_env, + ).stdout.decode('utf-8').strip() + + # Update the HEAD to make the tree fetchable run_cmd( - ["git", "commit", "-m", - "Content of %s %r" % (repo_type, content_id)], - cwd=target, - env=dict(os.environ, **GIT_NOBODY_ENV), + ["git", "update-ref", "HEAD", commit], + cwd=repo_tmp_dir, + env=git_env, ) + # Fetch commit into Git cache repository and tag it ensure_git(upstream=None) - run_cmd(["git", "fetch", target], cwd=git_root(upstream=None)) - commit = subprocess.run(["git", "log", "-n", "1", "--format=%H"], - stdout=subprocess.PIPE, - cwd=target).stdout.decode('utf-8').strip() + run_cmd(["git", "fetch", repo_tmp_dir], cwd=git_root(upstream=None)) git_keep(commit, upstream=None) - return subprocess.run(["git", "log", "-n", "1", "--format=%T"], - stdout=subprocess.PIPE, - cwd=target).stdout.decode('utf-8').strip() + return tree_id def file_as_git(fpath: str) -> List[str]: @@ -604,7 +723,7 @@ def distdir_checkout(desc: Json, repos: Json): content[get_distfile(repo_desc)] = content_id # Hash the map as unique id for the distdir repo entry - distdir_content_id = git_hash( + distdir_content_id, _ = git_hash( json.dumps(content, sort_keys=True, 
separators=(',', ':')).encode('utf-8')) target_distdir_dir = distdir_repo_dir(distdir_content_id) |