Open tdesveaux opened 2 months ago
Here is a few information:
This can be done in the pre-receive
hook server-side.
In pre-receive, Git provide on Stdin a line per ref pushed with each line in the formatprev_rev new_rev ref
.
From this, a simple git log --format=%H new_rev ^prev_rev
list the new revs that are being pushed.
From this list of revs, running git lfs fsck [revs ...]
should be fine as a sanity check.
An alternative to fsck
is ls-files
, which take only one ref at a time, but would allow to provide better feedback about which rev and which files have missing lfs objects.
git lfs ls-files -l rev
will output one line per LFS ref in the commit in the format lfs_hash - filepath
Here is a toy python script I used to validate this:
#!/usr/bin/python3
import sys
import subprocess
for line in sys.stdin.readlines():
prev_rev, new_rev, ref = line.strip().split()
output = subprocess.check_output(['git', 'log', "--format=%H", new_rev, f"^{prev_rev}"], text=True)
revs = [
r.strip() for r in output.splitlines(False)
if r.strip()
]
print(revs)
if revs:
subprocess.check_call(['git', 'lfs', 'fsck', *revs])
Welp, my toy script was FAR too naive and does not work at all. I did a better implementation, but did not test it extensively yet.
I'm open to work on this as LFS is quite critical to the sanity of our projects.
Here is a quick patch on were I felt it should be added, can I get a confirmation that it's the correct place? Also, would appreciate pointers on how to add tests for this.
diff --git a/cmd/hook.go b/cmd/hook.go
index 6e31710caf..37c23c97d0 100644
--- a/cmd/hook.go
+++ b/cmd/hook.go
@@ -242,7 +242,7 @@ Gitea or set your environment appropriately.`, "")
// If the ref is a branch or tag, check if it's protected
// if supportProcReceive all ref should be checked because
// permission check was delayed
- if supportProcReceive || refFullName.IsBranch() || refFullName.IsTag() {
+ if supportProcReceive || refFullName.IsBranch() || refFullName.IsTag() || setting.LFS.StartServer {
oldCommitIDs[count] = oldCommitID
newCommitIDs[count] = newCommitID
refFullNames[count] = refFullName
diff --git a/routers/private/hook_pre_receive.go b/routers/private/hook_pre_receive.go
index 0a3c8e2559..c099979b10 100644
--- a/routers/private/hook_pre_receive.go
+++ b/routers/private/hook_pre_receive.go
@@ -4,9 +4,11 @@
package private
import (
+ "context"
"fmt"
"net/http"
"os"
+ "strings"
"code.gitea.io/gitea/models"
asymkey_model "code.gitea.io/gitea/models/asymkey"
@@ -19,6 +21,7 @@ import (
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/private"
+ "code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/web"
gitea_context "code.gitea.io/gitea/services/context"
pull_service "code.gitea.io/gitea/services/pull"
@@ -117,6 +120,10 @@ func HookPreReceive(ctx *gitea_context.PrivateContext) {
newCommitID := opts.NewCommitIDs[i]
refFullName := opts.RefFullNames[i]
+ if setting.LFS.StartServer {
+ preReceiveLFS(ourCtx, oldCommitID, newCommitID)
+ }
+
switch {
case refFullName.IsBranch():
preReceiveBranch(ourCtx, oldCommitID, newCommitID, refFullName)
@@ -135,6 +142,59 @@ func HookPreReceive(ctx *gitea_context.PrivateContext) {
ctx.PlainText(http.StatusOK, "ok")
}
+func preReceiveLFS(ctx *preReceiveContext, oldCommitID, newCommitID string) {
+ if !ctx.AssertCanWriteCode() {
+ return
+ }
+
+ objectFormat := ctx.Repo.GetObjectFormat()
+
+ // Branch deletion, nothing to check
+ if newCommitID == objectFormat.EmptyObjectID().String() {
+ return
+ }
+
+ repo := ctx.Repo.Repository
+
+ // TODO: IMPLEMENT CHECKS HERE
+}
+
func preReceiveBranch(ctx *preReceiveContext, oldCommitID, newCommitID string, refFullName git.RefName) {
branchName := refFullName.BranchName()
ctx.branchName = branchName
And here is a "better" (but not thoroughly tested) version of the python pre-receive hook:
#!/usr/bin/python3
from __future__ import annotations
import re
import sys
import subprocess
from pathlib import Path
# Install with
# sudo -u git ln -s /var/lib/gitea/dne_hooks/fail_lfs_no_objects.pre-receive /var/lib/gitea/repositories/{owner}/{repository}.git/hooks/pre-receive.d/fail_lfs_no_objects
GITEA_LFS_STORAGE_DIR = Path('/var/lib/gitea/data/lfs')
LFS_LSFILES_RE = re.compile("^([a-zA-Z0-9]+) ([*|-]) (.*)$")
def _lfs_object_exists(oid: str) -> bool:
# gitea store LFS files with CAS in subdirs
object_path = (GITEA_LFS_STORAGE_DIR / oid[0:2] / oid[2:4] / oid[4:]).absolute()
return object_path.is_file()
def _is_default_sha(rev: str) -> bool:
return all(c == '0' for c in rev)
def _find_first_unique_rev(rev: str) -> str:
all_refs = set(subprocess.check_output(['git', 'show-ref', '--hash'], text=True).splitlines(False))
# remove self
if rev in all_refs:
all_refs.remove(rev)
if not all_refs:
# gen default_sha
return len(rev) * '0'
return subprocess.check_output(['git', 'merge-base', rev, *all_refs] , text=True).strip()
def check_missing_lfs_objects(rev: str, parent_rev: str | None) -> bool:
ls_files_cmd = ['git', 'lfs', 'ls-files', '--long']
if parent_rev:
ls_files_cmd.append(parent_rev)
ls_files_cmd.append(rev)
lfs_oids = [
LFS_LSFILES_RE.match(line).groups()[0]
for line in subprocess.check_output(ls_files_cmd, text=True).splitlines(False)
]
first_missing_lfs_object = next(
(
oid
for oid in lfs_oids
if not _lfs_object_exists(oid)
),
None,
)
if first_missing_lfs_object is not None:
print(
(
f"Error: Your push referenced at least 1 unknown Git LFS object:\n"
f"\t{first_missing_lfs_object}\n"
"\n"
"Try to push them with 'git lfs push --all'."
),
file=sys.stderr,
)
sys.exit(1)
for line in sys.stdin.readlines():
prev_rev, new_rev, ref = line.strip().split()
if _is_default_sha(new_rev):
# branch deletion
continue
if _is_default_sha(prev_rev):
prev_rev = _find_first_unique_rev(new_rev)
is_new_ref = _is_default_sha(prev_rev)
rev_list_cmd = ['git', 'rev-list', new_rev]
if not is_new_ref:
rev_list_cmd.append(f"^{prev_rev}")
output = subprocess.check_output(rev_list_cmd, text=True)
revs = [
r.strip() for r in output.splitlines(False)
if r.strip()
]
for rev in revs[:-1]:
check_missing_lfs_objects(rev, f"{rev}~")
check_missing_lfs_objects(
revs[-1],
# New ref will list ALL history, first commit does not have a parent
f"{revs[-1]}~" if not is_new_ref else None,
)
Looks like there is no easy way to get all LFS objects from a commit. We have to read all changed objects on this commit which is too heavy. Maybe we need to have a new database table to track repo_id
, commit_id
, path
, oid
for LFS.
Do you mean for caching it and avoid checking multiple times?
I'm not sure I understand correctly We have to read all changed objects on this commit which is too heavy
.
git lfs ls-files {SHA}~ {SHA}
will only list for the commit SHA
. Or is that already too heavy?
note: git lfs ls-files {SHA}
will list all LFS objects accessible at this commit. as in if you checkout this commit, all these LFS objects need to be checked out too.
Feature Description
We had an issue where a client hooks manager made the
lfs pre-push
not work correctly, thus resulting in commits being push without their LFS objects.While reproducing and testing things around, I encountered the following fail on a push to a Github repository:
It would be great if Gitea did the same sanity check preventing users from pushing commits without their LFS objects.
Screenshots
No response