summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Cristian Sarbu <paul.cristian.sarbu@huawei.com>2024-08-22 13:00:40 +0200
committerPaul Cristian Sarbu <paul.cristian.sarbu@huawei.com>2024-08-26 11:32:40 +0200
commit426f01ee6be96f3ad7298c09adbe5297e3537f1d (patch)
treeeeb2c59a36b74eec6fec78ea511366f68ac6d0d4
parentc3e6a6f527f590dcbb6411f4e778b10bd5a9d74e (diff)
downloadjustbuild-426f01ee6be96f3ad7298c09adbe5297e3537f1d.tar.gz
just-mr.py: Accept all tree entries for bootstrapped just
The Python script used for the first stage of bootstrapping just uses the Git index to create trees from directories (be they file repositories, unpacked archives, or distfiles); it therefore inherits Git's own limitations in committing trees that contain entries ignored by Git, such as empty directories, the .git folder, .gitignore files and the files referenced there, or other entries with Git-specific magic names. This commit updates the Python script so that, instead of using the Git index to import directories, it writes the needed blobs and trees directly to the object database and then commits the resulting top-level tree explicitly. While there, fix a typing issue caused by our relaxed approach of using os.environ to set the subprocess env when running commands. As on POSIX the type of os.environ is _Environ, not a plain Dict[str, str], use dictionary unpacking ({**a, **b}, Python v3.5+) to merge in the Git environment variables. The issue was initially flagged by pyright.
-rwxr-xr-xbin/just-mr.py171
1 files changed, 145 insertions, 26 deletions
diff --git a/bin/just-mr.py b/bin/just-mr.py
index 39d9d7c2..21413546 100755
--- a/bin/just-mr.py
+++ b/bin/just-mr.py
@@ -21,11 +21,14 @@ import subprocess
import sys
import tempfile
import time
+import zlib
from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast
from argparse import ArgumentParser
from pathlib import Path
+from enum import Enum
+
# generic JSON type that avoids getter issues; proper use is being enforced by
# return types of methods and typing vars holding return values of json getters
Json = Dict[str, Any]
@@ -126,6 +129,13 @@ DEFAULT_CONFIG_LOCATIONS: List[Json] = [{
}]
class ObjectType(Enum):
    # Kinds of filesystem entries distinguished when building Git trees; each
    # kind maps to its own tree-entry mode
    FILE = 1  # regular, non-executable file
    EXEC = 2  # executable file
    LINK = 3  # symbolic link
    DIR = 4  # directory, stored as a subtree
+
+
def log(*args: str, **kwargs: Any) -> None:
print(*args, file=sys.stderr, **kwargs)
@@ -212,12 +222,13 @@ def git_keep(commit: str, *, upstream: Optional[str]) -> None:
# for those, we assume the referenced commit is kept by
# some branch anyway
return
+ git_env = {**os.environ, **GIT_NOBODY_ENV}
run_cmd([
"git", "tag", "-f", "-m", "Keep referenced tree alive",
"keep-%s" % (commit, ), commit
],
cwd=git_root(upstream=upstream),
- env=dict(os.environ, **GIT_NOBODY_ENV),
+ env=git_env,
attempts=3)
@@ -327,18 +338,19 @@ def update_git(desc: Json) -> None:
desc["commit"] = lsremote.decode('utf-8').split('\t')[0]
def git_hash(content: bytes, type: str = "blob") -> Tuple[str, bytes]:
    # Computes the Git object id of the given content for the given object
    # type. Returns the hex SHA-1 digest taken over header and content,
    # together with the encoded header ("<type> <size>" followed by a NUL
    # byte), so callers can store header + content as a Git object.
    header = f"{type} {len(content)}\0".encode('utf-8')
    digest = hashlib.sha1(header)
    digest.update(content)
    return digest.hexdigest(), header
def add_to_cas(data: Union[str, bytes]) -> str:
if isinstance(data, str):
data = data.encode('utf-8')
- h = git_hash(data)
+ h, _ = git_hash(data)
cas_root = os.path.join(
g_ROOT, f"protocol-dependent/generation-0/git-sha1/casf/{h[0:2]}")
basename = h[2:]
@@ -393,8 +405,8 @@ def archive_tmp_checkout_dir(content: str, repo_type: str) -> str:
def archive_tree_id_file(content: str, repo_type: str) -> str:
    # Path of the tree-map entry for the given content id, grouped by
    # repository type, below g_ROOT
    tree_map_root = os.path.join(g_ROOT, "repositories/generation-0/tree-map")
    return os.path.join(tree_map_root, repo_type, content)
def get_distfile(desc: Json) -> str:
@@ -484,33 +496,140 @@ def archive_checkout(desc: Json, repo_type: str = "archive") -> List[str]:
]
def import_tmp_dir(content: str) -> str:
    # Scratch location below g_ROOT for importing the given content; the name
    # is prefixed with the current process id
    workspace_name = f"{os.getpid()}-{content}"
    return os.path.join(g_ROOT, "tmp-workspaces", "import", workspace_name)
+
+
def type_to_perm(obj_type: ObjectType) -> str:
    # Git tree-entry mode for the given object type. Anything that is not a
    # directory, symlink, or executable is treated as a regular file.
    special_modes = {
        ObjectType.DIR: "40000",
        ObjectType.LINK: "120000",
        ObjectType.EXEC: "100755",
    }
    return special_modes.get(obj_type, "100644")
+
+
def write_blob_to_repo(repo_root: str, data: bytes) -> bytes:
    # Stores data as a loose blob object in the object database of the
    # repository at repo_root and returns the raw (binary) object id
    hex_id, header = git_hash(data, type="blob")

    # Loose objects live at .git/objects/<first 2 hex digits>/<remaining ones>
    # and hold the zlib-compressed header followed by the content
    obj_dir = f"{repo_root}/.git/objects/{hex_id[0:2]}"
    os.makedirs(obj_dir, exist_ok=True)
    with open(f"{obj_dir}/{hex_id[2:]}", "wb") as obj_file:
        obj_file.write(zlib.compress(header + data))

    return bytes.fromhex(hex_id)  # raw id
+
+
def write_tree_to_repo(repo_root: str,
                       entries: Dict[str, Tuple[bytes, ObjectType]]) -> bytes:
    # Stores a tree object in the object database of the repository at
    # repo_root and returns its raw (binary) id.
    #
    # Tree entries have as key their filename and as value a tuple of raw id
    # and object type. Directory keys may carry a trailing '/', which is not
    # part of the stored name.
    #
    # Git orders tree entries by the bytes of their names, comparing
    # directories as if their name ended in '/'. Names are therefore encoded
    # first and sorted as bytes. os.fsencode is used so that names obtained
    # from os.listdir round-trip to their on-disk bytes even if they are not
    # valid UTF-8 (a strict UTF-8 encode would raise on those).
    records: List[Tuple[bytes, bytes]] = []
    for fname, (raw_id, obj_type) in entries.items():
        name = os.fsencode(fname)
        sort_key = name
        if obj_type == ObjectType.DIR:
            if name.endswith(b"/"):
                name = name[:-1]  # remove trailing '/'
            sort_key = name + b"/"
        mode = type_to_perm(obj_type).encode('utf-8')
        records.append((sort_key, mode + b" " + name + b"\0" + raw_id))
    records.sort(key=lambda record: record[0])
    tree_content: bytes = b"".join(record for _, record in records)

    # Get tree hash and header to be stored
    h, header = git_hash(tree_content, type="tree")

    # Write repository object
    obj_dir = "{}/.git/objects/{}".format(repo_root, h[0:2])
    obj_file = "{}/{}".format(obj_dir, h[2:])
    os.makedirs(obj_dir, exist_ok=True)
    with open(obj_file, "wb") as f:
        f.write(zlib.compress(header + tree_content))

    return bytes.fromhex(h)  # raw id
+
+
def path_to_type(fpath: str) -> ObjectType:
    # Classifies a filesystem path. The symlink check comes first, so a link
    # pointing to a directory is reported as a link and not as a directory.
    if os.path.islink(fpath):
        return ObjectType.LINK
    if os.path.isdir(fpath):
        return ObjectType.DIR
    # Everything else is a file; executability is what os.access reports for
    # the current process
    return ObjectType.EXEC if os.access(fpath, os.X_OK) else ObjectType.FILE
+
+
def get_tree_raw_id(source_dir: str, repo_root: str) -> bytes:
    # Writes the content of the directory recursively to the object database
    # of the repository at repo_root and returns the raw (binary) id of the
    # resulting tree object
    entries: Dict[str, Tuple[bytes, ObjectType]] = {}
    for fname in os.listdir(source_dir):
        fpath = source_dir + "/" + fname
        obj_type = path_to_type(fpath)
        raw_h: bytes
        if obj_type == ObjectType.DIR:
            raw_h = get_tree_raw_id(fpath, repo_root)
            fname = fname + '/'  # trailing '/' added for correct sorting
        elif obj_type == ObjectType.LINK:
            # A symlink is stored as a blob holding its target. os.fsencode
            # restores the exact on-disk bytes of the target, also when they
            # are not valid UTF-8 (a strict UTF-8 encode would raise there).
            data = os.fsencode(os.readlink(fpath))
            raw_h = write_blob_to_repo(repo_root, data)
        else:
            with open(fpath, "rb") as f:
                data = f.read()
            raw_h = write_blob_to_repo(repo_root, data)
        # Add entry to map
        entries[fname] = (raw_h, obj_type)

    return write_tree_to_repo(repo_root, entries)
+
+
def import_to_git(target: str, repo_type: str, content_id: str) -> str:
    # Imports the directory target into the Git cache repository and returns
    # the hex id of its tree.
    #
    # In order to import content that might otherwise be ignored by Git, such
    # as empty directories or magic-named files and folders (e.g., .git,
    # .gitignore), add entries manually to the repository, which should be in
    # its own separate location
    repo_tmp_dir = import_tmp_dir(content_id)
    if os.path.exists(repo_tmp_dir):
        try_rmtree(repo_tmp_dir)
    os.makedirs(repo_tmp_dir)

    # Initialize repo to have access to its storage; entries of
    # GIT_NOBODY_ENV take precedence over the inherited environment
    git_env = {**os.environ, **GIT_NOBODY_ENV}
    run_cmd(
        ["git", "init"],
        cwd=repo_tmp_dir,
        env=git_env,
    )

    # Get tree id of added directory
    tree_id: str = get_tree_raw_id(target, repo_tmp_dir).hex()

    # Commit the tree. check=True makes a failing commit-tree raise right
    # here, instead of handing an empty commit id to the commands below.
    commit: str = subprocess.run(
        [
            "git", "commit-tree", tree_id, "-m",
            "Content of %s %r" % (repo_type, content_id)
        ],
        stdout=subprocess.PIPE,
        cwd=repo_tmp_dir,
        env=git_env,
        check=True,
    ).stdout.decode('utf-8').strip()

    # Update the HEAD to make the tree fetchable
    run_cmd(
        ["git", "update-ref", "HEAD", commit],
        cwd=repo_tmp_dir,
        env=git_env,
    )

    # Fetch commit into Git cache repository and tag it
    ensure_git(upstream=None)
    run_cmd(["git", "fetch", repo_tmp_dir], cwd=git_root(upstream=None))
    git_keep(commit, upstream=None)
    # NOTE(review): repo_tmp_dir is not removed here after the fetch; confirm
    # that the temporary repository gets cleaned up elsewhere
    return tree_id
def file_as_git(fpath: str) -> List[str]:
@@ -604,7 +723,7 @@ def distdir_checkout(desc: Json, repos: Json):
content[get_distfile(repo_desc)] = content_id
# Hash the map as unique id for the distdir repo entry
- distdir_content_id = git_hash(
+ distdir_content_id, _ = git_hash(
json.dumps(content, sort_keys=True,
separators=(',', ':')).encode('utf-8'))
target_distdir_dir = distdir_repo_dir(distdir_content_id)