summary | refs | log | tree | commit | diff
path: root/bin/just-mr.py
diff options
context:
space:
mode:
Diffstat (limited to 'bin/just-mr.py')
-rwxr-xr-x bin/just-mr.py 171
1 files changed, 145 insertions, 26 deletions
diff --git a/bin/just-mr.py b/bin/just-mr.py
index 39d9d7c2..21413546 100755
--- a/bin/just-mr.py
+++ b/bin/just-mr.py
@@ -21,11 +21,14 @@ import subprocess
import sys
import tempfile
import time
+import zlib
from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast
from argparse import ArgumentParser
from pathlib import Path
+from enum import Enum
+
# generic JSON type that avoids getter issues; proper use is being enforced by
# return types of methods and typing vars holding return values of json getters
Json = Dict[str, Any]
@@ -126,6 +129,13 @@ DEFAULT_CONFIG_LOCATIONS: List[Json] = [{
}]
class ObjectType(Enum):
    """Kinds of entries that can be recorded in a Git tree object."""
    FILE = 1  # regular file without execute permission
    EXEC = 2  # regular file that is executable
    LINK = 3  # symbolic link
    DIR = 4  # directory, i.e., a nested tree
+
+
def log(*args: str, **kwargs: Any) -> None:
print(*args, file=sys.stderr, **kwargs)
@@ -212,12 +222,13 @@ def git_keep(commit: str, *, upstream: Optional[str]) -> None:
# for those, we assume the referenced commit is kept by
# some branch anyway
return
+ git_env = {**os.environ, **GIT_NOBODY_ENV}
run_cmd([
"git", "tag", "-f", "-m", "Keep referenced tree alive",
"keep-%s" % (commit, ), commit
],
cwd=git_root(upstream=upstream),
- env=dict(os.environ, **GIT_NOBODY_ENV),
+ env=git_env,
attempts=3)
@@ -327,18 +338,19 @@ def update_git(desc: Json) -> None:
desc["commit"] = lsremote.decode('utf-8').split('\t')[0]
def git_hash(content: bytes, type: str = "blob") -> Tuple[str, bytes]:
    """Compute the Git object id of some content.

    Git hashes an object as its header, "<type> <size>" terminated by a NUL
    byte, followed by the payload itself.

    Args:
        content: the raw payload of the object.
        type: the Git object type written into the header. The name shadows
            the builtin, but callers pass it by keyword, so it is kept.

    Returns:
        A pair of the hex SHA-1 digest and the encoded header, so that callers
        can store header and payload together as a Git object.
    """
    prefix = f"{type} {len(content)}\0".encode('utf-8')
    digest = hashlib.sha1(prefix)
    digest.update(content)
    return digest.hexdigest(), prefix
def add_to_cas(data: Union[str, bytes]) -> str:
if isinstance(data, str):
data = data.encode('utf-8')
- h = git_hash(data)
+ h, _ = git_hash(data)
cas_root = os.path.join(
g_ROOT, f"protocol-dependent/generation-0/git-sha1/casf/{h[0:2]}")
basename = h[2:]
@@ -393,8 +405,8 @@ def archive_tmp_checkout_dir(content: str, repo_type: str) -> str:
def archive_tree_id_file(content: str, repo_type: str) -> str:
    """Return the tree-map path for the given content id and repository type.

    The path is located below g_ROOT at
    repositories/generation-0/tree-map/<repo_type>/<content>.
    """
    tree_map_root = os.path.join(g_ROOT, "repositories/generation-0/tree-map")
    return os.path.join(tree_map_root, repo_type, content)
def get_distfile(desc: Json) -> str:
@@ -484,33 +496,140 @@ def archive_checkout(desc: Json, repo_type: str = "archive") -> List[str]:
]
def import_tmp_dir(content: str) -> str:
    """Return the scratch directory path used while importing content.

    The directory name combines the id of the current process with the given
    content id and lives below g_ROOT in tmp-workspaces/import.
    """
    workspace_name = f"{os.getpid()}-{content}"
    return os.path.join(g_ROOT, "tmp-workspaces", "import", workspace_name)
+
+
def type_to_perm(obj_type: ObjectType) -> str:
    """Return the mode string used in a Git tree entry for an object type."""
    special_modes = {
        ObjectType.DIR: "40000",
        ObjectType.LINK: "120000",
        ObjectType.EXEC: "100755",
    }
    # Everything else is recorded as a regular, non-executable file
    return special_modes.get(obj_type, "100644")
+
+
def write_blob_to_repo(repo_root: str, data: bytes) -> bytes:
    """Store data as a loose blob object in the repository at repo_root.

    Args:
        repo_root: root of a work tree that contains a .git directory.
        data: content of the blob.

    Returns:
        The raw (binary) id of the written blob.
    """
    hex_id, header = git_hash(data, type="blob")

    # Loose objects are stored zlib-compressed in a directory named after the
    # first two hex digits of the id, in a file named after the remaining ones
    fanout_dir = f"{repo_root}/.git/objects/{hex_id[0:2]}"
    os.makedirs(fanout_dir, exist_ok=True)
    with open(f"{fanout_dir}/{hex_id[2:]}", "wb") as obj:
        obj.write(zlib.compress(header + data))

    return bytes.fromhex(hex_id)  # raw id
+
+
def write_tree_to_repo(repo_root: str,
                       entries: Dict[str, Tuple[bytes, ObjectType]]) -> bytes:
    """Store a tree object with the given entries in the repository.

    Args:
        repo_root: root of a work tree that contains a .git directory.
        entries: maps each entry name to a tuple of raw object id and object
            type. Names of directory entries are expected to carry a trailing
            '/', which is only used for sorting and is stripped before the
            name is written.

    Returns:
        The raw (binary) id of the written tree.
    """
    def encode_name(name: str) -> bytes:
        # File names obtained from the file system may contain surrogate
        # escapes for bytes that are not valid UTF-8; restore those bytes
        # instead of failing with a UnicodeEncodeError
        return name.encode('utf-8', 'surrogateescape')

    # Git orders tree entries by the bytes of their names, so sort on the
    # encoded form rather than on the str
    ordered = sorted(entries.items(), key=lambda item: encode_name(item[0]))

    records: List[bytes] = []
    for fname, (raw_id, obj_type) in ordered:
        if obj_type == ObjectType.DIR:
            fname = fname[:-1]  # remove trailing '/'
        records.append(
            type_to_perm(obj_type).encode('utf-8') + b" " +
            encode_name(fname) + b"\0" + raw_id)
    tree_content: bytes = b"".join(records)

    # Get tree hash and header to be stored
    h, header = git_hash(tree_content, type="tree")

    # Write repository object
    obj_dir = "{}/.git/objects/{}".format(repo_root, h[0:2])
    obj_file = "{}/{}".format(obj_dir, h[2:])
    os.makedirs(obj_dir, exist_ok=True)
    with open(obj_file, "wb") as f:
        f.write(zlib.compress(header + tree_content))

    return bytes.fromhex(h)  # raw id
+
+
def path_to_type(fpath: str) -> ObjectType:
    """Classify a file system path as link, directory, executable, or file."""
    # Links are tested first, so a symlink pointing to a directory is still
    # reported as a link
    if os.path.islink(fpath):
        return ObjectType.LINK
    if os.path.isdir(fpath):
        return ObjectType.DIR
    # Executability is judged by whether the current process may execute it
    return ObjectType.EXEC if os.access(fpath, os.X_OK) else ObjectType.FILE
+
+
def get_tree_raw_id(source_dir: str, repo_root: str) -> bytes:
    """Write a directory recursively as Git objects into a repository.

    Every regular file and symlink below source_dir is stored as a blob and
    every directory (including empty ones) as a tree.

    Args:
        source_dir: directory whose content is to be written.
        repo_root: root of a work tree that contains a .git directory.

    Returns:
        The raw (binary) id of the tree object describing source_dir.
    """
    entries: Dict[str, Tuple[bytes, ObjectType]] = {}
    for fname in os.listdir(source_dir):
        fpath = source_dir + "/" + fname
        obj_type = path_to_type(fpath)
        if obj_type == ObjectType.DIR:
            raw_h = get_tree_raw_id(fpath, repo_root)
            fname = fname + '/'  # trailing '/' added for correct sorting
        elif obj_type == ObjectType.LINK:
            # A link is stored as a blob holding its target. The target may
            # contain bytes that are not valid UTF-8, which os.readlink
            # reports as surrogate escapes; restore the original bytes rather
            # than failing with a UnicodeEncodeError
            link_target = os.readlink(fpath).encode('utf-8', 'surrogateescape')
            raw_h = write_blob_to_repo(repo_root, link_target)
        else:
            with open(fpath, "rb") as f:
                raw_h = write_blob_to_repo(repo_root, f.read())
        # Add entry to map
        entries[fname] = (raw_h, obj_type)

    return write_tree_to_repo(repo_root, entries)
+
+
def import_to_git(target: str, repo_type: str, content_id: str) -> str:
    """Import the content of a directory into the Git cache repository.

    To also import content that Git might otherwise ignore, such as empty
    directories or magic-named files and folders (e.g., .git, .gitignore),
    the objects are written manually into a freshly initialized repository
    in its own separate location, committed there, and then fetched.

    Args:
        target: directory whose content is to be imported.
        repo_type: repository type, only used in the commit message.
        content_id: content id, used for the name of the temporary repository
            and in the commit message.

    Returns:
        The hex id of the Git tree describing target.
    """
    # Start from a clean temporary repository location
    repo_tmp_dir = import_tmp_dir(content_id)
    if os.path.exists(repo_tmp_dir):
        try_rmtree(repo_tmp_dir)
    os.makedirs(repo_tmp_dir)

    # Initialize repo to have access to its storage
    git_env = {**os.environ, **GIT_NOBODY_ENV}
    run_cmd(["git", "init"], cwd=repo_tmp_dir, env=git_env)

    # Write all objects and obtain the id of the top-level tree
    tree_id: str = get_tree_raw_id(target, repo_tmp_dir).hex()

    # Commit the tree; the commit id is printed on stdout
    commit_message = "Content of %s %r" % (repo_type, content_id)
    commit_proc = subprocess.run(
        ["git", "commit-tree", tree_id, "-m", commit_message],
        stdout=subprocess.PIPE,
        cwd=repo_tmp_dir,
        env=git_env,
    )
    commit: str = commit_proc.stdout.decode('utf-8').strip()

    # Update the HEAD to make the tree fetchable
    run_cmd(["git", "update-ref", "HEAD", commit],
            cwd=repo_tmp_dir,
            env=git_env)

    # Fetch commit into Git cache repository and tag it
    ensure_git(upstream=None)
    run_cmd(["git", "fetch", repo_tmp_dir], cwd=git_root(upstream=None))
    git_keep(commit, upstream=None)
    return tree_id
def file_as_git(fpath: str) -> List[str]:
@@ -604,7 +723,7 @@ def distdir_checkout(desc: Json, repos: Json):
content[get_distfile(repo_desc)] = content_id
# Hash the map as unique id for the distdir repo entry
- distdir_content_id = git_hash(
+ distdir_content_id, _ = git_hash(
json.dumps(content, sort_keys=True,
separators=(',', ':')).encode('utf-8'))
target_distdir_dir = distdir_repo_dir(distdir_content_id)