Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CI: use git-lfs fork for git_lfs_probe.py #17969

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,21 @@ go_deps = use_extension("@gazelle//:extensions.bzl", "go_deps")
go_deps.from_file(go_mod = "//go/extractor:go.mod")
use_repo(go_deps, "org_golang_x_mod", "org_golang_x_tools")

git_lfs_binary = use_repo_rule("//misc/bazel:lfs.bzl", "git_lfs_binary")

# To update: check out dsp-testing/codeql-git-lfs, make your changes there, and push a tag with
# `git tag $(git describe)-ls-urls && git push --tags`,
# then wait for https://github.com/dsp-testing/codeql-git-lfs/actions/runs/11800398535 to finish,
# then copy the information from https://github.com/dsp-testing/codeql-git-lfs/releases/latest here.
git_lfs_binary(
name = "git-lfs",
sha256_linux = "08b75033a98f77f7e60b0928e160a6f0a5c5cd9d91b8605537969eec6980219a",
sha256_macos_arm64 = "8a17c488c975dbd050610a0b2692567064dbfef33b6c58ee89ea02f649cc0114",
sha256_macos_x86 = "9fc7265c5345901ca5cb83707ed5374fc6dfbf7ed45d2c047d5929bfe0b5f64a",
sha256_windows = "ef2f5794667584b155786291d4f839c59bfe10fcc5f870902c64f3063ffd9923",
version = "v3.5.0-179-gfd031ea1",
)

lfs_files = use_repo_rule("//misc/bazel:lfs.bzl", "lfs_files")

lfs_files(
Expand Down
200 changes: 28 additions & 172 deletions misc/bazel/internal/git_lfs_probe.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,201 +24,57 @@
import argparse

def options():
def resolved_path(path):
return pathlib.Path(path).expanduser().resolve()
p = argparse.ArgumentParser(description=__doc__)
p.add_argument("--hash-only", action="store_true")
p.add_argument("sources", type=pathlib.Path, nargs="+")
return p.parse_args()


TIMEOUT = 20

def warn(message: str) -> None:
    """Write *message* to standard error, prefixed with ``WARNING: ``."""
    sys.stderr.write(f"WARNING: {message}\n")


@dataclass
class Endpoint:
    """An LFS endpoint together with the HTTP headers to send to it.

    ``ssh`` is optional and defaults to ``None``; ``headers`` starts out as a
    fresh empty dict for every instance.
    """

    name: str
    href: str
    ssh: typing.Optional[str] = None
    headers: typing.Dict[str, str] = dataclasses.field(default_factory=dict)

    def update_headers(self, d: typing.Iterable[typing.Tuple[str, str]]):
        """Merge ``(key, value)`` pairs into ``headers``, storing each key via ``str.capitalize()``."""
        for key, value in d:
            self.headers[key.capitalize()] = value


class NoEndpointsFound(Exception):
    """Raised when none of the candidate LFS endpoints could be used."""


opts = options()
sources = [p.resolve() for p in opts.sources]
source_dir = pathlib.Path(os.path.commonpath(src.parent for src in sources))
source_dir = subprocess.check_output(
["git", "rev-parse", "--show-toplevel"], cwd=source_dir, text=True
).strip()
excl = p.add_mutually_exclusive_group(required=True)
excl.add_argument("--hash-only", action="store_true")
excl.add_argument("--git-lfs", type=resolved_path)
p.add_argument("sources", type=resolved_path, nargs="+")
opts = p.parse_args()
source_dir = pathlib.Path(os.path.commonpath(src.parent for src in opts.sources))
opts.source_dir = subprocess.check_output(
["git", "rev-parse", "--show-toplevel"], cwd=source_dir, text=True
).strip()
return opts


def get_env(s: str, sep: str = "=") -> typing.Iterable[typing.Tuple[str, str]]:
    """Yield ``(key, value)`` pairs parsed from the lines of *s*.

    Each line is split at the first occurrence of *sep*; lines that do not
    contain *sep* are skipped.

    Args:
        s: text made of ``key<sep>value`` lines.
        sep: literal separator between key and value. It is escaped before
            being placed in the pattern, so separators such as ``"."`` or
            ``"|"`` are matched literally rather than as regex syntax.
    """
    # `.` never matches a newline, so every match is confined to a single line
    pattern = re.compile(rf"(.*?){re.escape(sep)}(.*)", re.M)
    for m in pattern.finditer(s):
        yield m.groups()


def git(*args, **kwargs):
    """Run ``git`` with *args* inside ``source_dir``.

    Extra keyword arguments are forwarded to ``subprocess.run``. Returns the
    stripped standard output when git exits with status 0, ``None`` otherwise.
    """
    command = ("git", *args)
    completed = subprocess.run(
        command, stdout=subprocess.PIPE, text=True, cwd=source_dir, **kwargs
    )
    if completed.returncode != 0:
        return None
    return completed.stdout.strip()


endpoint_re = re.compile(r"^Endpoint(?: \((.*)\))?$")


def get_endpoint_addresses() -> typing.Iterable[Endpoint]:
    """Get all lfs endpoints, including SSH if present.

    Parses the output of ``git lfs env`` (run in ``source_dir``). Every key
    matching ``endpoint_re`` starts a new ``Endpoint`` whose name is the
    parenthesized part of the key (``"default"`` when absent) and whose href
    is the value up to its first space. An ``SSH`` entry following an endpoint
    is attached to that endpoint.

    Raises:
        subprocess.CalledProcessError: if ``git lfs env`` fails.
    """
    lfs_env_items = get_env(
        subprocess.check_output(["git", "lfs", "env"], text=True, cwd=source_dir)
    )
    current_endpoint = None
    for k, v in lfs_env_items:
        m = endpoint_re.match(k)
        if m:
            if current_endpoint:
                yield current_endpoint
            href, _, _ = v.partition(" ")
            current_endpoint = Endpoint(name=m[1] or "default", href=href)
        # the SSH line is indented under its endpoint; compare the stripped key
        # so the match does not depend on the exact amount of leading whitespace
        elif k.strip() == "SSH" and current_endpoint:
            current_endpoint.ssh = v
    if current_endpoint:
        yield current_endpoint


def get_endpoints() -> typing.Iterable[Endpoint]:
    """Yield the LFS endpoints for which request headers could be prepared.

    For each endpoint returned by ``get_endpoint_addresses()``:

    * if it has an SSH address, ``git-lfs-authenticate`` is run over ssh and
      its JSON answer may override the href and add headers;
    * headers persisted in the ``http.<scheme>://<host>/.extraheader`` git
      config entry are added;
    * a non-empty ``GITHUB_TOKEN`` environment variable sets the
      ``Authorization`` header;
    * if there is still no ``Authorization`` header, ``git credential fill``
      is asked for a username/password pair, used for Basic authentication.

    An endpoint for which any of these steps fails is skipped with a warning
    instead of aborting the whole probe.
    """
    for endpoint in get_endpoint_addresses():
        endpoint.headers = {
            "Content-Type": "application/vnd.git-lfs+json",
            "Accept": "application/vnd.git-lfs+json",
        }
        if endpoint.ssh:
            # see https://github.com/git-lfs/git-lfs/blob/main/docs/api/authentication.md
            server, _, path = endpoint.ssh.partition(":")
            ssh_command = shutil.which(
                os.environ.get("GIT_SSH", os.environ.get("GIT_SSH_COMMAND", "ssh"))
            )
            if not ssh_command:
                # an `assert` would vanish under `python -O`; skip like other ssh failures
                warn(f"no ssh command found, ignoring {endpoint.name} endpoint")
                continue
            cmd = [
                ssh_command,
                "-oStrictHostKeyChecking=accept-new",
                server,
                "git-lfs-authenticate",
                path,
                "download",
            ]
            try:
                res = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=TIMEOUT)
            except subprocess.TimeoutExpired:
                warn(f"ssh timed out when connecting to {server}, ignoring {endpoint.name} endpoint")
                continue
            if res.returncode != 0:
                warn(f"ssh failed when connecting to {server}, ignoring {endpoint.name} endpoint")
                continue
            try:
                ssh_resp = json.loads(res.stdout)
            except ValueError:
                warn(f"ssh returned malformed JSON from {server}, ignoring {endpoint.name} endpoint")
                continue
            # keep the current href when the server does not provide one
            endpoint.href = ssh_resp.get("href", endpoint.href)
            endpoint.update_headers(ssh_resp.get("header", {}).items())
        url = urlparse(endpoint.href)
        # this is how actions/checkout persist credentials
        # see https://github.com/actions/checkout/blob/44c2b7a8a4ea60a981eaca3cf939b5f4305c123b/src/git-auth-helper.ts#L56-L63
        auth = git("config", f"http.{url.scheme}://{url.netloc}/.extraheader") or ""
        endpoint.update_headers(get_env(auth, sep=": "))
        if os.environ.get("GITHUB_TOKEN"):
            endpoint.headers["Authorization"] = f"token {os.environ['GITHUB_TOKEN']}"
        if "Authorization" not in endpoint.headers:
            # last chance: use git credentials (possibly backed by a credential helper like the one installed by gh)
            # see https://git-scm.com/docs/git-credential
            # no `check=True` here: a failing `git credential fill` must come back
            # as None so that the endpoint is skipped below rather than raising
            credentials = git(
                "credential",
                "fill",
                # drop leading / from url.path
                input=f"protocol={url.scheme}\nhost={url.netloc}\npath={url.path[1:]}\n",
            )
            if credentials is None:
                warn(f"no authorization method found, ignoring {endpoint.name} endpoint")
                continue
            credentials = dict(get_env(credentials))
            try:
                user_pass = f'{credentials["username"]}:{credentials["password"]}'
            except KeyError:
                warn(f"no authorization method found, ignoring {endpoint.name} endpoint")
                continue
            auth = base64.b64encode(user_pass.encode()).decode("ascii")
            endpoint.headers["Authorization"] = f"Basic {auth}"
        yield endpoint


# see https://github.com/git-lfs/git-lfs/blob/310d1b4a7d01e8d9d884447df4635c7a9c7642c2/docs/api/basic-transfers.md
def get_locations(objects):
def get_locations(objects, opts):
ret = ["local" for _ in objects]
indexes = [i for i, o in enumerate(objects) if o]
if not indexes:
# all objects are local, do not send an empty request as that would be an error
return ret
if opts.hash_only:
for i in indexes:
ret[i] = objects[i]["oid"]
return ret
data = {
"operation": "download",
"transfers": ["basic"],
"objects": [objects[i] for i in indexes],
"hash_algo": "sha256",
}
for endpoint in get_endpoints():
req = urllib.request.Request(
f"{endpoint.href}/objects/batch",
headers=endpoint.headers,
data=json.dumps(data).encode("ascii"),
)
try:
with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
data = json.load(resp)
assert len(data["objects"]) == len(
indexes
), f"received {len(data)} objects, expected {len(indexes)}"
for i, resp in zip(indexes, data["objects"]):
ret[i] = f'{resp["oid"]} {resp["actions"]["download"]["href"]}'
return ret
except urllib.error.URLError as e:
warn(f"encountered {type(e).__name__} {e}, ignoring endpoint {endpoint.name}")
continue
except KeyError:
warn(f"encountered malformed response, ignoring endpoint {endpoint.name}:\n{json.dumps(data, indent=2)}")
continue
raise NoEndpointsFound

else:
cmd = [opts.git_lfs, "ls-urls", "--json"]
cmd.extend(objects[i]["path"] for i in indexes)
data = json.loads(subprocess.check_output(cmd, cwd=opts.source_dir))
for i, f in zip(indexes, data["files"]):
ret[i] = f'{f["oid"]} {f["url"]}'
return ret

def get_lfs_object(path):
with open(path, "rb") as fileobj:
lfs_header = "version https://git-lfs.github.com/spec".encode()
actual_header = fileobj.read(len(lfs_header))
sha256 = size = None
if lfs_header != actual_header:
return None
data = dict(get_env(fileobj.read().decode("ascii"), sep=" "))
assert data["oid"].startswith("sha256:"), f"unknown oid type: {data['oid']}"
_, _, sha256 = data["oid"].partition(":")
size = int(data["size"])
return {"oid": sha256, "size": size}
return {"path": path, "oid": sha256}


try:
objects = [get_lfs_object(src) for src in sources]
for resp in get_locations(objects):
def main():
opts = options()
objects = [get_lfs_object(src) for src in opts.sources]
for resp in get_locations(objects, opts):
print(resp)
except NoEndpointsFound as e:
print("""\
ERROR: no valid endpoints found, your git authentication method might be currently unsupported by this script.
You can bypass this error by running from semmle-code (this might take a while):
git config lfs.fetchexclude ""
git -C ql config lfs.fetchinclude \\*
git lfs fetch && git lfs checkout
cd ql
git lfs fetch && git lfs checkout""", file=sys.stderr)
sys.exit(1)

if __name__ == "__main__":
main()
50 changes: 49 additions & 1 deletion misc/bazel/lfs.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@ def lfs_smudge(repository_ctx, srcs, *, extract = False, stripPrefix = None, exe
python = repository_ctx.which("python3") or repository_ctx.which("python")
if not python:
fail("Neither python3 nor python executables found")
script = Label("//misc/bazel/internal:git_lfs_probe.py")
script = repository_ctx.path(Label("//misc/bazel/internal:git_lfs_probe.py"))
git_lfs_binary = repository_ctx.path(Label("@git-lfs"))

def probe(srcs, hash_only = False):
repository_ctx.report_progress("querying LFS url(s) for: %s" % ", ".join([src.basename for src in srcs]))
cmd = [python, script]
if hash_only:
cmd.append("--hash-only")
else:
cmd += ["--git-lfs", git_lfs_binary]
cmd.extend(srcs)
res = repository_ctx.execute(cmd, quiet = True)
if res.return_code != 0:
Expand Down Expand Up @@ -102,3 +105,48 @@ lfs_files = repository_rule(
"executable": attr.bool(doc = "Whether files should be marked as executable"),
},
)

def _lfs_binary_impl(repository_ctx):
    """Downloads the git-lfs release binary matching the host and exposes it in a BUILD file.

    The platform is derived from `repository_ctx.os`: Windows and macOS are
    detected from the OS name, anything else is treated as linux-amd64. The
    download is verified against the per-platform `sha256_*` attribute.
    """
    suffix = ""
    if repository_ctx.os.name.startswith("windows"):
        arch = "windows-amd64"
        sha256 = repository_ctx.attr.sha256_windows
        suffix = ".exe"
    elif repository_ctx.os.name.startswith("mac"):
        # `os.arch` is the lower-cased Java `os.arch` property, which is
        # "x86_64" on Intel Macs; "x86" is kept for backward compatibility
        if repository_ctx.os.arch in ("x86_64", "amd64", "x86"):
            arch = "darwin-amd64"
            sha256 = repository_ctx.attr.sha256_macos_x86
        else:
            arch = "darwin-arm64"
            sha256 = repository_ctx.attr.sha256_macos_arm64
    else:
        arch = "linux-amd64"
        sha256 = repository_ctx.attr.sha256_linux
    url = "https://github.com/dsp-testing/codeql-git-lfs/releases/download/%s/git-lfs-%s%s" % (
        repository_ctx.attr.version,
        arch,
        suffix,
    )
    exe = "git-lfs" + suffix
    repository_ctx.download(
        url = url,
        output = exe,
        sha256 = sha256,
        executable = True,
    )

    # last `+`-separated component of the repository name
    name = repository_ctx.name.split("+")[-1]
    if suffix:
        # the file on disk carries a suffix, so wrap it in a target named after the repository
        repository_ctx.file("BUILD.bazel", "filegroup(name = %r, srcs = [%r], visibility = ['//visibility:public'])" % (name, exe))
    else:
        repository_ctx.file("BUILD.bazel", "exports_files([%r])" % exe)

# Repository rule implemented by `_lfs_binary_impl`. All attributes are
# mandatory strings: the release `version` to download and one sha256
# checksum per supported platform.
git_lfs_binary = repository_rule(
implementation = _lfs_binary_impl,
attrs = {
"version": attr.string(mandatory = True),
"sha256_linux": attr.string(mandatory = True),
"sha256_macos_x86": attr.string(mandatory = True),
"sha256_macos_arm64": attr.string(mandatory = True),
"sha256_windows": attr.string(mandatory = True),
},
)
Loading