From ae0af8ea5b2de99a49add2b7f7b76dde62a8a617 Mon Sep 17 00:00:00 2001 From: Lunny Xiao Date: Fri, 11 Apr 2025 06:41:29 -0700 Subject: [PATCH] Refactor Git Attribute & performance optimization (#34154) This PR moved git attributes related code to `modules/git/attribute` sub package and moved language stats related code to `modules/git/languagestats` sub package to make it easier to maintain. And it also introduced a performance improvement which use the `git check-attr --source` which can be run in a bare git repository so that we don't need to create a git index file. The new parameter need a git version >= 2.40 . If git version less than 2.40, it will fall back to previous implementation. --------- Co-authored-by: wxiaoguang Co-authored-by: yp05327 <576951401@qq.com> --- modules/git/attribute.go | 35 -- modules/git/attribute/attribute.go | 114 ++++++ modules/git/attribute/attribute_test.go | 37 ++ modules/git/attribute/batch.go | 216 +++++++++++ .../batch_test.go} | 87 ++++- modules/git/attribute/checker.go | 96 +++++ modules/git/attribute/checker_test.go | 74 ++++ modules/git/attribute/main_test.go | 41 +++ modules/git/command.go | 7 + modules/git/git.go | 2 + .../language_stats.go} | 30 +- .../language_stats_gogit.go} | 81 +++-- .../language_stats_nogogit.go} | 90 +++-- .../language_stats_test.go} | 11 +- modules/git/languagestats/main_test.go | 41 +++ modules/git/repo_attribute.go | 341 ------------------ .../tests/repos/language_stats_repo/config | 2 +- modules/git/tests/repos/repo3_notes/config | 2 +- .../tests/repos/repo4_commitsbetween/config | 2 +- modules/indexer/stats/db.go | 3 +- routers/web/repo/blame.go | 4 +- routers/web/repo/setting/lfs.go | 32 +- routers/web/repo/view_file.go | 37 +- services/gitdiff/gitdiff.go | 26 +- services/markup/renderhelper_codepreview.go | 4 +- services/repository/files/content.go | 25 -- services/repository/files/update.go | 8 +- services/repository/files/upload.go | 14 +- 28 files changed, 875 insertions(+), 587 deletions(-) delete mode 100644 modules/git/attribute.go create mode 100644 modules/git/attribute/attribute.go create mode 100644 modules/git/attribute/attribute_test.go create mode 100644 modules/git/attribute/batch.go rename modules/git/{repo_attribute_test.go => attribute/batch_test.go} (50%) create mode 100644 modules/git/attribute/checker.go create mode 100644 modules/git/attribute/checker_test.go create mode 100644 modules/git/attribute/main_test.go rename modules/git/{repo_language_stats.go => languagestats/language_stats.go} (59%) rename modules/git/{repo_language_stats_gogit.go => languagestats/language_stats_gogit.go} (73%) rename modules/git/{repo_language_stats_nogogit.go => languagestats/language_stats_nogogit.go} (73%) rename modules/git/{repo_language_stats_test.go => languagestats/language_stats_test.go} (75%) create mode 100644 modules/git/languagestats/main_test.go delete mode 100644 modules/git/repo_attribute.go diff --git a/modules/git/attribute.go b/modules/git/attribute.go deleted file mode 100644 index 4dfa510369..0000000000 --- a/modules/git/attribute.go +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2024 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -package git - -import ( - "code.gitea.io/gitea/modules/optional" -) - -const ( - AttributeLinguistVendored = "linguist-vendored" - AttributeLinguistGenerated = "linguist-generated" - AttributeLinguistDocumentation = "linguist-documentation" - AttributeLinguistDetectable = "linguist-detectable" - AttributeLinguistLanguage = "linguist-language" - AttributeGitlabLanguage = "gitlab-language" -) - -// true if "set"/"true", false if "unset"/"false", none otherwise -func AttributeToBool(attr map[string]string, name string) optional.Option[bool] { - switch attr[name] { - case "set", "true": - return optional.Some(true) - case "unset", "false": - return optional.Some(false) - } - return optional.None[bool]() -} - -func AttributeToString(attr map[string]string, name string) optional.Option[string] { - if value, has := attr[name]; has && value != "unspecified" { - return optional.Some(value) - } - return optional.None[string]() -} diff --git a/modules/git/attribute/attribute.go b/modules/git/attribute/attribute.go new file mode 100644 index 0000000000..adf323ef41 --- /dev/null +++ b/modules/git/attribute/attribute.go @@ -0,0 +1,114 @@ +// Copyright 2025 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package attribute + +import ( + "strings" + + "code.gitea.io/gitea/modules/optional" +) + +type Attribute string + +const ( + LinguistVendored = "linguist-vendored" + LinguistGenerated = "linguist-generated" + LinguistDocumentation = "linguist-documentation" + LinguistDetectable = "linguist-detectable" + LinguistLanguage = "linguist-language" + GitlabLanguage = "gitlab-language" + Lockable = "lockable" + Filter = "filter" +) + +var LinguistAttributes = []string{ + LinguistVendored, + LinguistGenerated, + LinguistDocumentation, + LinguistDetectable, + LinguistLanguage, + GitlabLanguage, +} + +func (a Attribute) IsUnspecified() bool { + return a == "" || a == "unspecified" +} + +func (a Attribute) ToString() optional.Option[string] { + if !a.IsUnspecified() { + return optional.Some(string(a)) + } + return optional.None[string]() +} + +// ToBool converts the attribute value to optional boolean: true if "set"/"true", false if "unset"/"false", none otherwise +func (a Attribute) ToBool() optional.Option[bool] { + switch a { + case "set", "true": + return optional.Some(true) + case "unset", "false": + return optional.Some(false) + } + return optional.None[bool]() +} + +type Attributes struct { + m map[string]Attribute +} + +func NewAttributes() *Attributes { + return &Attributes{m: make(map[string]Attribute)} +} + +func (attrs *Attributes) Get(name string) Attribute { + if value, has := attrs.m[name]; has { + return value + } + return "" +} + +func (attrs *Attributes) GetVendored() optional.Option[bool] { + return attrs.Get(LinguistVendored).ToBool() +} + +func (attrs *Attributes) GetGenerated() optional.Option[bool] { + return attrs.Get(LinguistGenerated).ToBool() +} + +func (attrs *Attributes) GetDocumentation() optional.Option[bool] { + return attrs.Get(LinguistDocumentation).ToBool() +} + +func (attrs *Attributes) GetDetectable() optional.Option[bool] { + return attrs.Get(LinguistDetectable).ToBool() +} + +func (attrs *Attributes) GetLinguistLanguage() optional.Option[string] { + return attrs.Get(LinguistLanguage).ToString() +} + +func (attrs *Attributes) GetGitlabLanguage() optional.Option[string] { + attrStr := attrs.Get(GitlabLanguage).ToString() + if attrStr.Has() { + raw := attrStr.Value() + // gitlab-language may have additional parameters after the language + // ignore them and just use the main language + // https://docs.gitlab.com/ee/user/project/highlighting.html#override-syntax-highlighting-for-a-file-type + if idx := strings.IndexByte(raw, '?'); idx >= 0 { + return optional.Some(raw[:idx]) + } + } + return attrStr +} + +func (attrs *Attributes) GetLanguage() optional.Option[string] { + // prefer linguist-language over gitlab-language + // if linguist-language is not set, use gitlab-language + // if both are not set, return none + language := attrs.GetLinguistLanguage() + if language.Value() == "" { + language = attrs.GetGitlabLanguage() + } + return language +} diff --git a/modules/git/attribute/attribute_test.go b/modules/git/attribute/attribute_test.go new file mode 100644 index 0000000000..dadb5582a3 --- /dev/null +++ b/modules/git/attribute/attribute_test.go @@ -0,0 +1,37 @@ +// Copyright 2025 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package attribute + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func Test_Attribute(t *testing.T) { + assert.Empty(t, Attribute("").ToString().Value()) + assert.Empty(t, Attribute("unspecified").ToString().Value()) + assert.Equal(t, "python", Attribute("python").ToString().Value()) + assert.Equal(t, "Java", Attribute("Java").ToString().Value()) + + attributes := Attributes{ + m: map[string]Attribute{ + LinguistGenerated: "true", + LinguistDocumentation: "false", + LinguistDetectable: "set", + LinguistLanguage: "Python", + GitlabLanguage: "Java", + "filter": "unspecified", + "test": "", + }, + } + + assert.Empty(t, attributes.Get("test").ToString().Value()) + assert.Empty(t, attributes.Get("filter").ToString().Value()) + assert.Equal(t, "Python", attributes.Get(LinguistLanguage).ToString().Value()) + assert.Equal(t, "Java", attributes.Get(GitlabLanguage).ToString().Value()) + assert.True(t, attributes.Get(LinguistGenerated).ToBool().Value()) + assert.False(t, attributes.Get(LinguistDocumentation).ToBool().Value()) + assert.True(t, attributes.Get(LinguistDetectable).ToBool().Value()) +} diff --git a/modules/git/attribute/batch.go b/modules/git/attribute/batch.go new file mode 100644 index 0000000000..4e31fda575 --- /dev/null +++ b/modules/git/attribute/batch.go @@ -0,0 +1,216 @@ +// Copyright 2019 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package attribute + +import ( + "bytes" + "context" + "fmt" + "os" + "path/filepath" + "time" + + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/log" +) + +// BatchChecker provides a reader for check-attribute content that can be long running +type BatchChecker struct { + attributesNum int + repo *git.Repository + stdinWriter *os.File + stdOut *nulSeparatedAttributeWriter + ctx context.Context + cancel context.CancelFunc + cmd *git.Command +} + +// NewBatchChecker creates a check attribute reader for the current repository and provided commit ID +// If treeish is empty, then it will use current working directory, otherwise it will use the provided treeish on the bare repo +func NewBatchChecker(repo *git.Repository, treeish string, attributes []string) (checker *BatchChecker, returnedErr error) { + ctx, cancel := context.WithCancel(repo.Ctx) + defer func() { + if returnedErr != nil { + cancel() + } + }() + + cmd, envs, cleanup, err := checkAttrCommand(repo, treeish, nil, attributes) + if err != nil { + return nil, err + } + defer func() { + if returnedErr != nil { + cleanup() + } + }() + + cmd.AddArguments("--stdin") + + checker = &BatchChecker{ + attributesNum: len(attributes), + repo: repo, + ctx: ctx, + cmd: cmd, + cancel: func() { + cancel() + cleanup() + }, + } + + stdinReader, stdinWriter, err := os.Pipe() + if err != nil { + return nil, err + } + checker.stdinWriter = stdinWriter + + lw := new(nulSeparatedAttributeWriter) + lw.attributes = make(chan attributeTriple, len(attributes)) + lw.closed = make(chan struct{}) + checker.stdOut = lw + + go func() { + defer func() { + _ = stdinReader.Close() + _ = lw.Close() + }() + stdErr := new(bytes.Buffer) + err := cmd.Run(ctx, &git.RunOpts{ + Env: envs, + Dir: repo.Path, + Stdin: stdinReader, + Stdout: lw, + Stderr: stdErr, + }) + + if err != nil && !git.IsErrCanceledOrKilled(err) { + log.Error("Attribute checker for commit %s exits with error: %v", treeish, err) + } + checker.cancel() + }() + + return checker, nil +} + +// CheckPath check attr for given path +func (c *BatchChecker) CheckPath(path string) (rs *Attributes, err error) { + defer func() { + if err != nil && err != c.ctx.Err() { + log.Error("Unexpected error when checking path %s in %s, error: %v", path, filepath.Base(c.repo.Path), err) + } + }() + + select { + case <-c.ctx.Done(): + return nil, c.ctx.Err() + default: + } + + if _, err = c.stdinWriter.Write([]byte(path + "\x00")); err != nil { + defer c.Close() + return nil, err + } + + reportTimeout := func() error { + stdOutClosed := false + select { + case <-c.stdOut.closed: + stdOutClosed = true + default: + } + debugMsg := fmt.Sprintf("check path %q in repo %q", path, filepath.Base(c.repo.Path)) + debugMsg += fmt.Sprintf(", stdOut: tmp=%q, pos=%d, closed=%v", string(c.stdOut.tmp), c.stdOut.pos, stdOutClosed) + if c.cmd != nil { + debugMsg += fmt.Sprintf(", process state: %q", c.cmd.ProcessState()) + } + _ = c.Close() + return fmt.Errorf("CheckPath timeout: %s", debugMsg) + } + + rs = NewAttributes() + for i := 0; i < c.attributesNum; i++ { + select { + case <-time.After(5 * time.Second): + // there is no "hang" problem now. This code is just used to catch other potential problems. + return nil, reportTimeout() + case attr, ok := <-c.stdOut.ReadAttribute(): + if !ok { + return nil, c.ctx.Err() + } + rs.m[attr.Attribute] = Attribute(attr.Value) + case <-c.ctx.Done(): + return nil, c.ctx.Err() + } + } + return rs, nil +} + +func (c *BatchChecker) Close() error { + c.cancel() + err := c.stdinWriter.Close() + return err +} + +type attributeTriple struct { + Filename string + Attribute string + Value string +} + +type nulSeparatedAttributeWriter struct { + tmp []byte + attributes chan attributeTriple + closed chan struct{} + working attributeTriple + pos int +} + +func (wr *nulSeparatedAttributeWriter) Write(p []byte) (n int, err error) { + l, read := len(p), 0 + + nulIdx := bytes.IndexByte(p, '\x00') + for nulIdx >= 0 { + wr.tmp = append(wr.tmp, p[:nulIdx]...) + switch wr.pos { + case 0: + wr.working = attributeTriple{ + Filename: string(wr.tmp), + } + case 1: + wr.working.Attribute = string(wr.tmp) + case 2: + wr.working.Value = string(wr.tmp) + } + wr.tmp = wr.tmp[:0] + wr.pos++ + if wr.pos > 2 { + wr.attributes <- wr.working + wr.pos = 0 + } + read += nulIdx + 1 + if l > read { + p = p[nulIdx+1:] + nulIdx = bytes.IndexByte(p, '\x00') + } else { + return l, nil + } + } + wr.tmp = append(wr.tmp, p...) + return l, nil +} + +func (wr *nulSeparatedAttributeWriter) ReadAttribute() <-chan attributeTriple { + return wr.attributes +} + +func (wr *nulSeparatedAttributeWriter) Close() error { + select { + case <-wr.closed: + return nil + default: + } + close(wr.attributes) + close(wr.closed) + return nil +} diff --git a/modules/git/repo_attribute_test.go b/modules/git/attribute/batch_test.go similarity index 50% rename from modules/git/repo_attribute_test.go rename to modules/git/attribute/batch_test.go index d8fd9f0e8d..30a3d805fe 100644 --- a/modules/git/repo_attribute_test.go +++ b/modules/git/attribute/batch_test.go @@ -1,13 +1,19 @@ // Copyright 2021 The Gitea Authors. All rights reserved. // SPDX-License-Identifier: MIT -package git +package attribute import ( + "path/filepath" "testing" "time" + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/test" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func Test_nulSeparatedAttributeWriter_ReadAttribute(t *testing.T) { @@ -24,7 +30,7 @@ func Test_nulSeparatedAttributeWriter_ReadAttribute(t *testing.T) { select { case attr := <-wr.ReadAttribute(): assert.Equal(t, ".gitignore\"\n", attr.Filename) - assert.Equal(t, AttributeLinguistVendored, attr.Attribute) + assert.Equal(t, LinguistVendored, attr.Attribute) assert.Equal(t, "unspecified", attr.Value) case <-time.After(100 * time.Millisecond): assert.FailNow(t, "took too long to read an attribute from the list") @@ -38,7 +44,7 @@ func Test_nulSeparatedAttributeWriter_ReadAttribute(t *testing.T) { select { case attr := <-wr.ReadAttribute(): assert.Equal(t, ".gitignore\"\n", attr.Filename) - assert.Equal(t, AttributeLinguistVendored, attr.Attribute) + assert.Equal(t, LinguistVendored, attr.Attribute) assert.Equal(t, "unspecified", attr.Value) case <-time.After(100 * time.Millisecond): assert.FailNow(t, "took too long to read an attribute from the list") @@ -77,21 +83,90 @@ func Test_nulSeparatedAttributeWriter_ReadAttribute(t *testing.T) { assert.NoError(t, err) assert.Equal(t, attributeTriple{ Filename: "shouldbe.vendor", - Attribute: AttributeLinguistVendored, + Attribute: LinguistVendored, Value: "set", }, attr) attr = <-wr.ReadAttribute() assert.NoError(t, err) assert.Equal(t, attributeTriple{ Filename: "shouldbe.vendor", - Attribute: AttributeLinguistGenerated, + Attribute: LinguistGenerated, Value: "unspecified", }, attr) attr = <-wr.ReadAttribute() assert.NoError(t, err) assert.Equal(t, attributeTriple{ Filename: "shouldbe.vendor", - Attribute: AttributeLinguistLanguage, + Attribute: LinguistLanguage, Value: "unspecified", }, attr) } + +func expectedAttrs() *Attributes { + return &Attributes{ + m: map[string]Attribute{ + LinguistGenerated: "unspecified", + LinguistDetectable: "unspecified", + LinguistDocumentation: "unspecified", + LinguistVendored: "unspecified", + LinguistLanguage: "Python", + GitlabLanguage: "unspecified", + }, + } +} + +func Test_BatchChecker(t *testing.T) { + setting.AppDataPath = t.TempDir() + repoPath := "../tests/repos/language_stats_repo" + gitRepo, err := git.OpenRepository(t.Context(), repoPath) + require.NoError(t, err) + defer gitRepo.Close() + + commitID := "8fee858da5796dfb37704761701bb8e800ad9ef3" + + t.Run("Create index file to run git check-attr", func(t *testing.T) { + defer test.MockVariableValue(&git.DefaultFeatures().SupportCheckAttrOnBare, false)() + checker, err := NewBatchChecker(gitRepo, commitID, LinguistAttributes) + assert.NoError(t, err) + defer checker.Close() + attributes, err := checker.CheckPath("i-am-a-python.p") + assert.NoError(t, err) + assert.Equal(t, expectedAttrs(), attributes) + }) + + // run git check-attr on work tree + t.Run("Run git check-attr on git work tree", func(t *testing.T) { + dir := filepath.Join(t.TempDir(), "test-repo") + err := git.Clone(t.Context(), repoPath, dir, git.CloneRepoOptions{ + Shared: true, + Branch: "master", + }) + assert.NoError(t, err) + + tempRepo, err := git.OpenRepository(t.Context(), dir) + assert.NoError(t, err) + defer tempRepo.Close() + + checker, err := NewBatchChecker(tempRepo, "", LinguistAttributes) + assert.NoError(t, err) + defer checker.Close() + attributes, err := checker.CheckPath("i-am-a-python.p") + assert.NoError(t, err) + assert.Equal(t, expectedAttrs(), attributes) + }) + + if !git.DefaultFeatures().SupportCheckAttrOnBare { + t.Skip("git version 2.40 is required to support run check-attr on bare repo") + return + } + + t.Run("Run git check-attr in bare repository", func(t *testing.T) { + checker, err := NewBatchChecker(gitRepo, commitID, LinguistAttributes) + assert.NoError(t, err) + defer checker.Close() + + attributes, err := checker.CheckPath("i-am-a-python.p") + assert.NoError(t, err) + assert.Equal(t, expectedAttrs(), attributes) + }) +} diff --git a/modules/git/attribute/checker.go b/modules/git/attribute/checker.go new file mode 100644 index 0000000000..c17006a154 --- /dev/null +++ b/modules/git/attribute/checker.go @@ -0,0 +1,96 @@ +// Copyright 2025 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package attribute + +import ( + "bytes" + "context" + "errors" + "fmt" + "os" + + "code.gitea.io/gitea/modules/git" +) + +func checkAttrCommand(gitRepo *git.Repository, treeish string, filenames, attributes []string) (*git.Command, []string, func(), error) { + cancel := func() {} + envs := []string{"GIT_FLUSH=1"} + cmd := git.NewCommand("check-attr", "-z") + if len(attributes) == 0 { + cmd.AddArguments("--all") + } + + // there is treeish, read from bare repo or temp index created by "read-tree" + if treeish != "" { + if git.DefaultFeatures().SupportCheckAttrOnBare { + cmd.AddArguments("--source") + cmd.AddDynamicArguments(treeish) + } else { + indexFilename, worktree, deleteTemporaryFile, err := gitRepo.ReadTreeToTemporaryIndex(treeish) + if err != nil { + return nil, nil, nil, err + } + + cmd.AddArguments("--cached") + envs = append(envs, + "GIT_INDEX_FILE="+indexFilename, + "GIT_WORK_TREE="+worktree, + ) + cancel = deleteTemporaryFile + } + } // else: no treeish, assume it is a not a bare repo, read from working directory + + cmd.AddDynamicArguments(attributes...) + if len(filenames) > 0 { + cmd.AddDashesAndList(filenames...) + } + return cmd, envs, cancel, nil +} + +type CheckAttributeOpts struct { + Filenames []string + Attributes []string +} + +// CheckAttributes return the attributes of the given filenames and attributes in the given treeish. +// If treeish is empty, then it will use current working directory, otherwise it will use the provided treeish on the bare repo +func CheckAttributes(ctx context.Context, gitRepo *git.Repository, treeish string, opts CheckAttributeOpts) (map[string]*Attributes, error) { + cmd, envs, cancel, err := checkAttrCommand(gitRepo, treeish, opts.Filenames, opts.Attributes) + if err != nil { + return nil, err + } + defer cancel() + + stdOut := new(bytes.Buffer) + stdErr := new(bytes.Buffer) + + if err := cmd.Run(ctx, &git.RunOpts{ + Env: append(os.Environ(), envs...), + Dir: gitRepo.Path, + Stdout: stdOut, + Stderr: stdErr, + }); err != nil { + return nil, fmt.Errorf("failed to run check-attr: %w\n%s\n%s", err, stdOut.String(), stdErr.String()) + } + + fields := bytes.Split(stdOut.Bytes(), []byte{'\000'}) + if len(fields)%3 != 1 { + return nil, errors.New("wrong number of fields in return from check-attr") + } + + attributesMap := make(map[string]*Attributes) + for i := 0; i < (len(fields) / 3); i++ { + filename := string(fields[3*i]) + attribute := string(fields[3*i+1]) + info := string(fields[3*i+2]) + attribute2info, ok := attributesMap[filename] + if !ok { + attribute2info = NewAttributes() + attributesMap[filename] = attribute2info + } + attribute2info.m[attribute] = Attribute(info) + } + + return attributesMap, nil +} diff --git a/modules/git/attribute/checker_test.go b/modules/git/attribute/checker_test.go new file mode 100644 index 0000000000..97db43460b --- /dev/null +++ b/modules/git/attribute/checker_test.go @@ -0,0 +1,74 @@ +// Copyright 2025 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package attribute + +import ( + "path/filepath" + "testing" + + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/test" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func Test_Checker(t *testing.T) { + setting.AppDataPath = t.TempDir() + repoPath := "../tests/repos/language_stats_repo" + gitRepo, err := git.OpenRepository(t.Context(), repoPath) + require.NoError(t, err) + defer gitRepo.Close() + + commitID := "8fee858da5796dfb37704761701bb8e800ad9ef3" + + t.Run("Create index file to run git check-attr", func(t *testing.T) { + defer test.MockVariableValue(&git.DefaultFeatures().SupportCheckAttrOnBare, false)() + attrs, err := CheckAttributes(t.Context(), gitRepo, commitID, CheckAttributeOpts{ + Filenames: []string{"i-am-a-python.p"}, + Attributes: LinguistAttributes, + }) + assert.NoError(t, err) + assert.Len(t, attrs, 1) + assert.Equal(t, expectedAttrs(), attrs["i-am-a-python.p"]) + }) + + // run git check-attr on work tree + t.Run("Run git check-attr on git work tree", func(t *testing.T) { + dir := filepath.Join(t.TempDir(), "test-repo") + err := git.Clone(t.Context(), repoPath, dir, git.CloneRepoOptions{ + Shared: true, + Branch: "master", + }) + assert.NoError(t, err) + + tempRepo, err := git.OpenRepository(t.Context(), dir) + assert.NoError(t, err) + defer tempRepo.Close() + + attrs, err := CheckAttributes(t.Context(), tempRepo, "", CheckAttributeOpts{ + Filenames: []string{"i-am-a-python.p"}, + Attributes: LinguistAttributes, + }) + assert.NoError(t, err) + assert.Len(t, attrs, 1) + assert.Equal(t, expectedAttrs(), attrs["i-am-a-python.p"]) + }) + + if !git.DefaultFeatures().SupportCheckAttrOnBare { + t.Skip("git version 2.40 is required to support run check-attr on bare repo") + return + } + + t.Run("Run git check-attr in bare repository", func(t *testing.T) { + attrs, err := CheckAttributes(t.Context(), gitRepo, commitID, CheckAttributeOpts{ + Filenames: []string{"i-am-a-python.p"}, + Attributes: LinguistAttributes, + }) + assert.NoError(t, err) + assert.Len(t, attrs, 1) + assert.Equal(t, expectedAttrs(), attrs["i-am-a-python.p"]) + }) +} diff --git a/modules/git/attribute/main_test.go b/modules/git/attribute/main_test.go new file mode 100644 index 0000000000..df8241bfb0 --- /dev/null +++ b/modules/git/attribute/main_test.go @@ -0,0 +1,41 @@ +// Copyright 2025 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package attribute + +import ( + "context" + "fmt" + "os" + "testing" + + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/util" +) + +func testRun(m *testing.M) error { + gitHomePath, err := os.MkdirTemp(os.TempDir(), "git-home") + if err != nil { + return fmt.Errorf("unable to create temp dir: %w", err) + } + defer util.RemoveAll(gitHomePath) + setting.Git.HomePath = gitHomePath + + if err = git.InitFull(context.Background()); err != nil { + return fmt.Errorf("failed to call Init: %w", err) + } + + exitCode := m.Run() + if exitCode != 0 { + return fmt.Errorf("run test failed, ExitCode=%d", exitCode) + } + return nil +} + +func TestMain(m *testing.M) { + if err := testRun(m); err != nil { + _, _ = fmt.Fprintf(os.Stderr, "Test failed: %v", err) + os.Exit(1) + } +} diff --git a/modules/git/command.go b/modules/git/command.go index f449f1ff0e..eaaa4969d0 100644 --- a/modules/git/command.go +++ b/modules/git/command.go @@ -80,6 +80,13 @@ func (c *Command) LogString() string { return strings.Join(a, " ") } +func (c *Command) ProcessState() string { + if c.cmd == nil { + return "" + } + return c.cmd.ProcessState.String() +} + // NewCommand creates and returns a new Git Command based on given command and arguments. // Each argument should be safe to be trusted. User-provided arguments should be passed to AddDynamicArguments instead. func NewCommand(args ...internal.CmdArg) *Command { diff --git a/modules/git/git.go b/modules/git/git.go index 2b593910a2..a2ffd6d289 100644 --- a/modules/git/git.go +++ b/modules/git/git.go @@ -30,6 +30,7 @@ type Features struct { SupportProcReceive bool // >= 2.29 SupportHashSha256 bool // >= 2.42, SHA-256 repositories no longer an ‘experimental curiosity’ SupportedObjectFormats []ObjectFormat // sha1, sha256 + SupportCheckAttrOnBare bool // >= 2.40 } var ( @@ -77,6 +78,7 @@ func loadGitVersionFeatures() (*Features, error) { if features.SupportHashSha256 { features.SupportedObjectFormats = append(features.SupportedObjectFormats, Sha256ObjectFormat) } + features.SupportCheckAttrOnBare = features.CheckVersionAtLeast("2.40") return features, nil } diff --git a/modules/git/repo_language_stats.go b/modules/git/languagestats/language_stats.go similarity index 59% rename from modules/git/repo_language_stats.go rename to modules/git/languagestats/language_stats.go index 8551ea9d24..a71284c3e4 100644 --- a/modules/git/repo_language_stats.go +++ b/modules/git/languagestats/language_stats.go @@ -1,13 +1,15 @@ // Copyright 2020 The Gitea Authors. All rights reserved. // SPDX-License-Identifier: MIT -package git +package languagestats import ( + "context" "strings" "unicode" - "code.gitea.io/gitea/modules/optional" + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/git/attribute" ) const ( @@ -49,19 +51,15 @@ func mergeLanguageStats(stats map[string]int64) map[string]int64 { return res } -func TryReadLanguageAttribute(attrs map[string]string) optional.Option[string] { - language := AttributeToString(attrs, AttributeLinguistLanguage) - if language.Value() == "" { - language = AttributeToString(attrs, AttributeGitlabLanguage) - if language.Has() { - raw := language.Value() - // gitlab-language may have additional parameters after the language - // ignore them and just use the main language - // https://docs.gitlab.com/ee/user/project/highlighting.html#override-syntax-highlighting-for-a-file-type - if idx := strings.IndexByte(raw, '?'); idx >= 0 { - language = optional.Some(raw[:idx]) - } - } +// GetFileLanguage tries to get the (linguist) language of the file content +func GetFileLanguage(ctx context.Context, gitRepo *git.Repository, treeish, treePath string) (string, error) { + attributesMap, err := attribute.CheckAttributes(ctx, gitRepo, treeish, attribute.CheckAttributeOpts{ + Attributes: []string{attribute.LinguistLanguage, attribute.GitlabLanguage}, + Filenames: []string{treePath}, + }) + if err != nil { + return "", err } - return language + + return attributesMap[treePath].GetLanguage().Value(), nil } diff --git a/modules/git/repo_language_stats_gogit.go b/modules/git/languagestats/language_stats_gogit.go similarity index 73% rename from modules/git/repo_language_stats_gogit.go rename to modules/git/languagestats/language_stats_gogit.go index a34c03c781..418c05b157 100644 --- a/modules/git/repo_language_stats_gogit.go +++ b/modules/git/languagestats/language_stats_gogit.go @@ -3,13 +3,15 @@ //go:build gogit -package git +package languagestats import ( "bytes" "io" "code.gitea.io/gitea/modules/analyze" + git_module "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/git/attribute" "code.gitea.io/gitea/modules/optional" "github.com/go-enry/go-enry/v2" @@ -19,7 +21,7 @@ import ( ) // GetLanguageStats calculates language stats for git repository at specified commit -func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { +func GetLanguageStats(repo *git_module.Repository, commitID string) (map[string]int64, error) { r, err := git.PlainOpen(repo.Path) if err != nil { return nil, err @@ -40,8 +42,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return nil, err } - checker, deferable := repo.CheckAttributeReader(commitID) - defer deferable() + checker, err := attribute.NewBatchChecker(repo, commitID, attribute.LinguistAttributes) + if err != nil { + return nil, err + } + defer checker.Close() // sizes contains the current calculated size of all files by language sizes := make(map[string]int64) @@ -62,43 +67,41 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err isDocumentation := optional.None[bool]() isDetectable := optional.None[bool]() - if checker != nil { - attrs, err := checker.CheckPath(f.Name) - if err == nil { - isVendored = AttributeToBool(attrs, AttributeLinguistVendored) - if isVendored.ValueOrDefault(false) { - return nil + attrs, err := checker.CheckPath(f.Name) + if err == nil { + isVendored = attrs.GetVendored() + if isVendored.ValueOrDefault(false) { + return nil + } + + isGenerated = attrs.GetGenerated() + if isGenerated.ValueOrDefault(false) { + return nil + } + + isDocumentation = attrs.GetDocumentation() + if isDocumentation.ValueOrDefault(false) { + return nil + } + + isDetectable = attrs.GetDetectable() + if !isDetectable.ValueOrDefault(true) { + return nil + } + + hasLanguage := attrs.GetLanguage() + if hasLanguage.Value() != "" { + language := hasLanguage.Value() + + // group languages, such as Pug -> HTML; SCSS -> CSS + group := enry.GetLanguageGroup(language) + if len(group) != 0 { + language = group } - isGenerated = AttributeToBool(attrs, AttributeLinguistGenerated) - if isGenerated.ValueOrDefault(false) { - return nil - } - - isDocumentation = AttributeToBool(attrs, AttributeLinguistDocumentation) - if isDocumentation.ValueOrDefault(false) { - return nil - } - - isDetectable = AttributeToBool(attrs, AttributeLinguistDetectable) - if !isDetectable.ValueOrDefault(true) { - return nil - } - - hasLanguage := TryReadLanguageAttribute(attrs) - if hasLanguage.Value() != "" { - language := hasLanguage.Value() - - // group languages, such as Pug -> HTML; SCSS -> CSS - group := enry.GetLanguageGroup(language) - if len(group) != 0 { - language = group - } - - // this language will always be added to the size - sizes[language] += f.Size - return nil - } + // this language will always be added to the size + sizes[language] += f.Size + return nil } } diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/languagestats/language_stats_nogogit.go similarity index 73% rename from modules/git/repo_language_stats_nogogit.go rename to modules/git/languagestats/language_stats_nogogit.go index de7707bd6c..34797263a6 100644 --- a/modules/git/repo_language_stats_nogogit.go +++ b/modules/git/languagestats/language_stats_nogogit.go @@ -3,13 +3,15 @@ //go:build !gogit -package git +package languagestats import ( "bytes" "io" "code.gitea.io/gitea/modules/analyze" + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/git/attribute" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/optional" @@ -17,7 +19,7 @@ import ( ) // GetLanguageStats calculates language stats for git repository at specified commit -func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { +func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64, error) { // We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary. // so let's create a batch stdin and stdout batchStdinWriter, batchReader, cancel, err := repo.CatFileBatch(repo.Ctx) @@ -34,19 +36,19 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err if err := writeID(commitID); err != nil { return nil, err } - shaBytes, typ, size, err := ReadBatchLine(batchReader) + shaBytes, typ, size, err := git.ReadBatchLine(batchReader) if typ != "commit" { log.Debug("Unable to get commit for: %s. Err: %v", commitID, err) - return nil, ErrNotExist{commitID, ""} + return nil, git.ErrNotExist{ID: commitID} } - sha, err := NewIDFromString(string(shaBytes)) + sha, err := git.NewIDFromString(string(shaBytes)) if err != nil { log.Debug("Unable to get commit for: %s. Err: %v", commitID, err) - return nil, ErrNotExist{commitID, ""} + return nil, git.ErrNotExist{ID: commitID} } - commit, err := CommitFromReader(repo, sha, io.LimitReader(batchReader, size)) + commit, err := git.CommitFromReader(repo, sha, io.LimitReader(batchReader, size)) if err != nil { log.Debug("Unable to get commit for: %s. Err: %v", commitID, err) return nil, err @@ -62,8 +64,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return nil, err } - checker, deferable := repo.CheckAttributeReader(commitID) - defer deferable() + checker, err := attribute.NewBatchChecker(repo, commitID, attribute.LinguistAttributes) + if err != nil { + return nil, err + } + defer checker.Close() contentBuf := bytes.Buffer{} var content []byte @@ -96,43 +101,36 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err isDocumentation := optional.None[bool]() isDetectable := optional.None[bool]() - if checker != nil { - attrs, err := checker.CheckPath(f.Name()) - if err == nil { - isVendored = AttributeToBool(attrs, AttributeLinguistVendored) - if isVendored.ValueOrDefault(false) { - continue + attrs, err := checker.CheckPath(f.Name()) + if err == nil { + if isVendored = attrs.GetVendored(); isVendored.ValueOrDefault(false) { + continue + } + + if isGenerated = attrs.GetGenerated(); isGenerated.ValueOrDefault(false) { + continue + } + + if isDocumentation = attrs.GetDocumentation(); isDocumentation.ValueOrDefault(false) { + continue + } + + if isDetectable = attrs.GetDetectable(); !isDetectable.ValueOrDefault(true) { + continue + } + + if hasLanguage := attrs.GetLanguage(); hasLanguage.Value() != "" { + language := hasLanguage.Value() + + // group languages, such as Pug -> HTML; SCSS -> CSS + group := enry.GetLanguageGroup(language) + if len(group) != 0 { + language = group } - isGenerated = AttributeToBool(attrs, AttributeLinguistGenerated) - if isGenerated.ValueOrDefault(false) { - continue - } - - isDocumentation = AttributeToBool(attrs, AttributeLinguistDocumentation) - if isDocumentation.ValueOrDefault(false) { - continue - } - - isDetectable = AttributeToBool(attrs, AttributeLinguistDetectable) - if !isDetectable.ValueOrDefault(true) { - continue - } - - hasLanguage := TryReadLanguageAttribute(attrs) - if hasLanguage.Value() != "" { - language := hasLanguage.Value() - - // group languages, such as Pug -> HTML; SCSS -> CSS - group := enry.GetLanguageGroup(language) - if len(group) != 0 { - language = group - } - - // this language will always be added to the size - sizes[language] += f.Size() - continue - } + // this language will always be added to the size + sizes[language] += f.Size() + continue } } @@ -149,7 +147,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err if err := writeID(f.ID.String()); err != nil { return nil, err } - _, _, size, err := ReadBatchLine(batchReader) + _, _, size, err := git.ReadBatchLine(batchReader) if err != nil { log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err) return nil, err @@ -167,7 +165,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return nil, err } content = contentBuf.Bytes() - if err := DiscardFull(batchReader, discard); err != nil { + if err := git.DiscardFull(batchReader, discard); err != nil { return nil, err } } diff --git a/modules/git/repo_language_stats_test.go b/modules/git/languagestats/language_stats_test.go similarity index 75% rename from modules/git/repo_language_stats_test.go rename to modules/git/languagestats/language_stats_test.go index 12ce958c6e..b908ae6413 100644 --- a/modules/git/repo_language_stats_test.go +++ b/modules/git/languagestats/language_stats_test.go @@ -3,12 +3,12 @@ //go:build !gogit -package git +package languagestats import ( - "path/filepath" "testing" + "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/setting" "github.com/stretchr/testify/assert" @@ -17,13 +17,12 @@ import ( func TestRepository_GetLanguageStats(t *testing.T) { setting.AppDataPath = t.TempDir() - repoPath := filepath.Join(testReposDir, "language_stats_repo") - gitRepo, err := openRepositoryWithDefaultContext(repoPath) + repoPath := "../tests/repos/language_stats_repo" + gitRepo, err := git.OpenRepository(t.Context(), repoPath) require.NoError(t, err) - defer gitRepo.Close() - stats, err := gitRepo.GetLanguageStats("8fee858da5796dfb37704761701bb8e800ad9ef3") + stats, err := GetLanguageStats(gitRepo, "8fee858da5796dfb37704761701bb8e800ad9ef3") require.NoError(t, err) assert.Equal(t, map[string]int64{ diff --git a/modules/git/languagestats/main_test.go b/modules/git/languagestats/main_test.go new file mode 100644 index 0000000000..707d268c81 --- /dev/null +++ b/modules/git/languagestats/main_test.go @@ -0,0 +1,41 @@ +// Copyright 2025 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package languagestats + +import ( + "context" + "fmt" + "os" + "testing" + + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/util" +) + +func testRun(m *testing.M) error { + gitHomePath, err := os.MkdirTemp(os.TempDir(), "git-home") + if err != nil { + return fmt.Errorf("unable to create temp dir: %w", err) + } + defer util.RemoveAll(gitHomePath) + setting.Git.HomePath = gitHomePath + + if err = git.InitFull(context.Background()); err != nil { + return fmt.Errorf("failed to call Init: %w", err) + } + + exitCode := m.Run() + if exitCode != 0 { + return fmt.Errorf("run test failed, ExitCode=%d", exitCode) + } + return nil +} + +func TestMain(m *testing.M) { + if err := testRun(m); err != nil { + _, _ = fmt.Fprintf(os.Stderr, "Test failed: %v", err) + os.Exit(1) + } +} diff --git a/modules/git/repo_attribute.go b/modules/git/repo_attribute.go deleted file mode 100644 index fde42d4730..0000000000 --- a/modules/git/repo_attribute.go +++ /dev/null @@ -1,341 +0,0 @@ -// Copyright 2019 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -package git - -import ( - "bytes" - "context" - "errors" - "fmt" - "io" - "os" - "path/filepath" - "time" - - "code.gitea.io/gitea/modules/log" -) - -// CheckAttributeOpts represents the possible options to CheckAttribute -type CheckAttributeOpts struct { - CachedOnly bool - AllAttributes bool - Attributes []string - Filenames []string - IndexFile string - WorkTree string -} - -// CheckAttribute return the Blame object of file -func (repo *Repository) CheckAttribute(opts CheckAttributeOpts) (map[string]map[string]string, error) { - env := []string{} - - if len(opts.IndexFile) > 0 { - env = append(env, "GIT_INDEX_FILE="+opts.IndexFile) - } - if len(opts.WorkTree) > 0 { - env = append(env, "GIT_WORK_TREE="+opts.WorkTree) - } - - if len(env) > 0 { - env = append(os.Environ(), env...) - } - - stdOut := new(bytes.Buffer) - stdErr := new(bytes.Buffer) - - cmd := NewCommand("check-attr", "-z") - - if opts.AllAttributes { - cmd.AddArguments("-a") - } else { - for _, attribute := range opts.Attributes { - if attribute != "" { - cmd.AddDynamicArguments(attribute) - } - } - } - - if opts.CachedOnly { - cmd.AddArguments("--cached") - } - - cmd.AddDashesAndList(opts.Filenames...) - - if err := cmd.Run(repo.Ctx, &RunOpts{ - Env: env, - Dir: repo.Path, - Stdout: stdOut, - Stderr: stdErr, - }); err != nil { - return nil, fmt.Errorf("failed to run check-attr: %w\n%s\n%s", err, stdOut.String(), stdErr.String()) - } - - // FIXME: This is incorrect on versions < 1.8.5 - fields := bytes.Split(stdOut.Bytes(), []byte{'\000'}) - - if len(fields)%3 != 1 { - return nil, errors.New("wrong number of fields in return from check-attr") - } - - name2attribute2info := make(map[string]map[string]string) - - for i := 0; i < (len(fields) / 3); i++ { - filename := string(fields[3*i]) - attribute := string(fields[3*i+1]) - info := string(fields[3*i+2]) - attribute2info := name2attribute2info[filename] - if attribute2info == nil { - attribute2info = make(map[string]string) - } - attribute2info[attribute] = info - name2attribute2info[filename] = attribute2info - } - - return name2attribute2info, nil -} - -// CheckAttributeReader provides a reader for check-attribute content that can be long running -type CheckAttributeReader struct { - // params - Attributes []string - Repo *Repository - IndexFile string - WorkTree string - - stdinReader io.ReadCloser - stdinWriter *os.File - stdOut *nulSeparatedAttributeWriter - cmd *Command - env []string - ctx context.Context - cancel context.CancelFunc -} - -// Init initializes the CheckAttributeReader -func (c *CheckAttributeReader) Init(ctx context.Context) error { - if len(c.Attributes) == 0 { - lw := new(nulSeparatedAttributeWriter) - lw.attributes = make(chan attributeTriple) - lw.closed = make(chan struct{}) - - c.stdOut = lw - c.stdOut.Close() - return errors.New("no provided Attributes to check") - } - - c.ctx, c.cancel = context.WithCancel(ctx) - c.cmd = NewCommand("check-attr", "--stdin", "-z") - - if len(c.IndexFile) > 0 { - c.cmd.AddArguments("--cached") - c.env = append(c.env, "GIT_INDEX_FILE="+c.IndexFile) - } - - if len(c.WorkTree) > 0 { - c.env = append(c.env, "GIT_WORK_TREE="+c.WorkTree) - } - - c.env = append(c.env, "GIT_FLUSH=1") - - c.cmd.AddDynamicArguments(c.Attributes...) - - var err error - - c.stdinReader, c.stdinWriter, err = os.Pipe() - if err != nil { - c.cancel() - return err - } - - lw := new(nulSeparatedAttributeWriter) - lw.attributes = make(chan attributeTriple, 5) - lw.closed = make(chan struct{}) - c.stdOut = lw - return nil -} - -func (c *CheckAttributeReader) Run() error { - defer func() { - _ = c.stdinReader.Close() - _ = c.stdOut.Close() - }() - stdErr := new(bytes.Buffer) - err := c.cmd.Run(c.ctx, &RunOpts{ - Env: c.env, - Dir: c.Repo.Path, - Stdin: c.stdinReader, - Stdout: c.stdOut, - Stderr: stdErr, - }) - if err != nil && !IsErrCanceledOrKilled(err) { - return fmt.Errorf("failed to run attr-check. Error: %w\nStderr: %s", err, stdErr.String()) - } - return nil -} - -// CheckPath check attr for given path -func (c *CheckAttributeReader) CheckPath(path string) (rs map[string]string, err error) { - defer func() { - if err != nil && err != c.ctx.Err() { - log.Error("Unexpected error when checking path %s in %s, error: %v", path, filepath.Base(c.Repo.Path), err) - } - }() - - select { - case <-c.ctx.Done(): - return nil, c.ctx.Err() - default: - } - - if _, err = c.stdinWriter.Write([]byte(path + "\x00")); err != nil { - defer c.Close() - return nil, err - } - - reportTimeout := func() error { - stdOutClosed := false - select { - case <-c.stdOut.closed: - stdOutClosed = true - default: - } - debugMsg := fmt.Sprintf("check path %q in repo %q", path, filepath.Base(c.Repo.Path)) - debugMsg += fmt.Sprintf(", stdOut: tmp=%q, pos=%d, closed=%v", string(c.stdOut.tmp), c.stdOut.pos, stdOutClosed) - if c.cmd.cmd != nil { - debugMsg += fmt.Sprintf(", process state: %q", c.cmd.cmd.ProcessState.String()) - } - _ = c.Close() - return fmt.Errorf("CheckPath timeout: %s", debugMsg) - } - - rs = make(map[string]string) - for range c.Attributes { - select { - case <-time.After(5 * time.Second): - // There is a strange "hang" problem in gitdiff.GetDiff -> CheckPath - // So add a timeout here to mitigate the problem, and output more logs for debug purpose - // In real world, if CheckPath runs long than seconds, it blocks the end user's operation, - // and at the moment the CheckPath result is not so important, so we can just ignore it. - return nil, reportTimeout() - case attr, ok := <-c.stdOut.ReadAttribute(): - if !ok { - return nil, c.ctx.Err() - } - rs[attr.Attribute] = attr.Value - case <-c.ctx.Done(): - return nil, c.ctx.Err() - } - } - return rs, nil -} - -func (c *CheckAttributeReader) Close() error { - c.cancel() - err := c.stdinWriter.Close() - return err -} - -type attributeTriple struct { - Filename string - Attribute string - Value string -} - -type nulSeparatedAttributeWriter struct { - tmp []byte - attributes chan attributeTriple - closed chan struct{} - working attributeTriple - pos int -} - -func (wr *nulSeparatedAttributeWriter) Write(p []byte) (n int, err error) { - l, read := len(p), 0 - - nulIdx := bytes.IndexByte(p, '\x00') - for nulIdx >= 0 { - wr.tmp = append(wr.tmp, p[:nulIdx]...) - switch wr.pos { - case 0: - wr.working = attributeTriple{ - Filename: string(wr.tmp), - } - case 1: - wr.working.Attribute = string(wr.tmp) - case 2: - wr.working.Value = string(wr.tmp) - } - wr.tmp = wr.tmp[:0] - wr.pos++ - if wr.pos > 2 { - wr.attributes <- wr.working - wr.pos = 0 - } - read += nulIdx + 1 - if l > read { - p = p[nulIdx+1:] - nulIdx = bytes.IndexByte(p, '\x00') - } else { - return l, nil - } - } - wr.tmp = append(wr.tmp, p...) - return l, nil -} - -func (wr *nulSeparatedAttributeWriter) ReadAttribute() <-chan attributeTriple { - return wr.attributes -} - -func (wr *nulSeparatedAttributeWriter) Close() error { - select { - case <-wr.closed: - return nil - default: - } - close(wr.attributes) - close(wr.closed) - return nil -} - -// CheckAttributeReader creates a check attribute reader for the current repository and provided commit ID -func (repo *Repository) CheckAttributeReader(commitID string) (*CheckAttributeReader, context.CancelFunc) { - indexFilename, worktree, deleteTemporaryFile, err := repo.ReadTreeToTemporaryIndex(commitID) - if err != nil { - return nil, func() {} - } - - checker := &CheckAttributeReader{ - Attributes: []string{ - AttributeLinguistVendored, - AttributeLinguistGenerated, - AttributeLinguistDocumentation, - AttributeLinguistDetectable, - AttributeLinguistLanguage, - AttributeGitlabLanguage, - }, - Repo: repo, - IndexFile: indexFilename, - WorkTree: worktree, - } - ctx, cancel := context.WithCancel(repo.Ctx) - if err := checker.Init(ctx); err != nil { - log.Error("Unable to open attribute checker for commit %s, error: %v", commitID, err) - } else { - go func() { - err := checker.Run() - if err != nil && !IsErrCanceledOrKilled(err) { - log.Error("Attribute checker for commit %s exits with error: %v", commitID, err) - } - cancel() - }() - } - deferrable := func() { - _ = checker.Close() - cancel() - deleteTemporaryFile() - } - - return checker, deferrable -} diff --git a/modules/git/tests/repos/language_stats_repo/config b/modules/git/tests/repos/language_stats_repo/config index 515f483629..a4ef456cbc 100644 --- a/modules/git/tests/repos/language_stats_repo/config +++ b/modules/git/tests/repos/language_stats_repo/config @@ -1,5 +1,5 @@ [core] repositoryformatversion = 0 filemode = true - bare = false + bare = true logallrefupdates = true diff --git a/modules/git/tests/repos/repo3_notes/config b/modules/git/tests/repos/repo3_notes/config index d545cdabdb..5ed22e23d1 100644 --- a/modules/git/tests/repos/repo3_notes/config +++ b/modules/git/tests/repos/repo3_notes/config @@ -1,7 +1,7 @@ [core] repositoryformatversion = 0 filemode = false - bare = false + bare = true logallrefupdates = true symlinks = false ignorecase = true diff --git a/modules/git/tests/repos/repo4_commitsbetween/config b/modules/git/tests/repos/repo4_commitsbetween/config index d545cdabdb..5ed22e23d1 100644 --- a/modules/git/tests/repos/repo4_commitsbetween/config +++ b/modules/git/tests/repos/repo4_commitsbetween/config @@ -1,7 +1,7 @@ [core] repositoryformatversion = 0 filemode = false - bare = false + bare = true logallrefupdates = true symlinks = false ignorecase = true diff --git a/modules/indexer/stats/db.go b/modules/indexer/stats/db.go index 067a6f609b..199d493e97 100644 --- a/modules/indexer/stats/db.go +++ b/modules/indexer/stats/db.go @@ -8,6 +8,7 @@ import ( repo_model "code.gitea.io/gitea/models/repo" "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/git/languagestats" "code.gitea.io/gitea/modules/gitrepo" "code.gitea.io/gitea/modules/graceful" "code.gitea.io/gitea/modules/log" @@ -62,7 +63,7 @@ func (db *DBIndexer) Index(id int64) error { } // Calculate and save language statistics to database - stats, err := gitRepo.GetLanguageStats(commitID) + stats, err := languagestats.GetLanguageStats(gitRepo, commitID) if err != nil { if !setting.IsInTesting { log.Error("Unable to get language stats for ID %s for default branch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.FullName(), err) diff --git a/routers/web/repo/blame.go b/routers/web/repo/blame.go index efd85b9452..e125267524 100644 --- a/routers/web/repo/blame.go +++ b/routers/web/repo/blame.go @@ -15,13 +15,13 @@ import ( user_model "code.gitea.io/gitea/models/user" "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/git/languagestats" "code.gitea.io/gitea/modules/highlight" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/templates" "code.gitea.io/gitea/modules/util" "code.gitea.io/gitea/services/context" - files_service "code.gitea.io/gitea/services/repository/files" ) type blameRow struct { @@ -234,7 +234,7 @@ func processBlameParts(ctx *context.Context, blameParts []*git.BlamePart) map[st func renderBlame(ctx *context.Context, blameParts []*git.BlamePart, commitNames map[string]*user_model.UserCommit) { repoLink := ctx.Repo.RepoLink - language, err := files_service.TryGetContentLanguage(ctx.Repo.GitRepo, ctx.Repo.CommitID, ctx.Repo.TreePath) + language, err := languagestats.GetFileLanguage(ctx, ctx.Repo.GitRepo, ctx.Repo.CommitID, ctx.Repo.TreePath) if err != nil { log.Error("Unable to get file language for %-v:%s. Error: %v", ctx.Repo.Repository, ctx.Repo.TreePath, err) } diff --git a/routers/web/repo/setting/lfs.go b/routers/web/repo/setting/lfs.go index efda9bda58..a065620b2b 100644 --- a/routers/web/repo/setting/lfs.go +++ b/routers/web/repo/setting/lfs.go @@ -18,6 +18,7 @@ import ( "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/container" "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/git/attribute" "code.gitea.io/gitea/modules/git/pipeline" "code.gitea.io/gitea/modules/lfs" "code.gitea.io/gitea/modules/log" @@ -134,39 +135,24 @@ func LFSLocks(ctx *context.Context) { } defer gitRepo.Close() - filenames := make([]string, len(lfsLocks)) - - for i, lock := range lfsLocks { - filenames[i] = lock.Path - } - - if err := gitRepo.ReadTreeToIndex(ctx.Repo.Repository.DefaultBranch); err != nil { - log.Error("Unable to read the default branch to the index: %s (%v)", ctx.Repo.Repository.DefaultBranch, err) - ctx.ServerError("LFSLocks", fmt.Errorf("unable to read the default branch to the index: %s (%w)", ctx.Repo.Repository.DefaultBranch, err)) - return - } - - name2attribute2info, err := gitRepo.CheckAttribute(git.CheckAttributeOpts{ - Attributes: []string{"lockable"}, - Filenames: filenames, - CachedOnly: true, - }) + checker, err := attribute.NewBatchChecker(gitRepo, ctx.Repo.Repository.DefaultBranch, []string{attribute.Lockable}) if err != nil { log.Error("Unable to check attributes in %s (%v)", tmpBasePath, err) ctx.ServerError("LFSLocks", err) return } + defer checker.Close() lockables := make([]bool, len(lfsLocks)) + filenames := make([]string, len(lfsLocks)) for i, lock := range lfsLocks { - attribute2info, has := name2attribute2info[lock.Path] - if !has { + filenames[i] = lock.Path + attrs, err := checker.CheckPath(lock.Path) + if err != nil { + log.Error("Unable to check attributes in %s: %s (%v)", tmpBasePath, lock.Path, err) continue } - if attribute2info["lockable"] != "set" { - continue - } - lockables[i] = true + lockables[i] = attrs.Get(attribute.Lockable).ToBool().Value() } ctx.Data["Lockables"] = lockables diff --git a/routers/web/repo/view_file.go b/routers/web/repo/view_file.go index 12083a1ced..ff0e1b4d54 100644 --- a/routers/web/repo/view_file.go +++ b/routers/web/repo/view_file.go @@ -18,6 +18,7 @@ import ( "code.gitea.io/gitea/modules/actions" "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/git/attribute" "code.gitea.io/gitea/modules/highlight" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/markup" @@ -25,7 +26,6 @@ import ( "code.gitea.io/gitea/modules/util" "code.gitea.io/gitea/services/context" issue_service "code.gitea.io/gitea/services/issue" - files_service "code.gitea.io/gitea/services/repository/files" "github.com/nektos/act/pkg/model" ) @@ -147,6 +147,23 @@ func prepareToRenderFile(ctx *context.Context, entry *git.TreeEntry) { ctx.Data["EditFileTooltip"] = ctx.Tr("repo.editor.cannot_edit_non_text_files") } + // read all needed attributes which will be used later + // there should be no performance different between reading 2 or 4 here + attrsMap, err := attribute.CheckAttributes(ctx, ctx.Repo.GitRepo, ctx.Repo.CommitID, attribute.CheckAttributeOpts{ + Filenames: []string{ctx.Repo.TreePath}, + Attributes: []string{attribute.LinguistGenerated, attribute.LinguistVendored, attribute.LinguistLanguage, attribute.GitlabLanguage}, + }) + if err != nil { + ctx.ServerError("attribute.CheckAttributes", err) + return + } + attrs := attrsMap[ctx.Repo.TreePath] + if attrs == nil { + // this case shouldn't happen, just in case. + setting.PanicInDevOrTesting("no attributes found for %s", ctx.Repo.TreePath) + attrs = attribute.NewAttributes() + } + switch { case isRepresentableAsText: if fInfo.fileSize >= setting.UI.MaxDisplayFileSize { @@ -209,11 +226,7 @@ func prepareToRenderFile(ctx *context.Context, entry *git.TreeEntry) { ctx.Data["NumLines"] = bytes.Count(buf, []byte{'\n'}) + 1 } - language, err := files_service.TryGetContentLanguage(ctx.Repo.GitRepo, ctx.Repo.CommitID, ctx.Repo.TreePath) - if err != nil { - log.Error("Unable to get file language for %-v:%s. Error: %v", ctx.Repo.Repository, ctx.Repo.TreePath, err) - } - + language := attrs.GetLanguage().Value() fileContent, lexerName, err := highlight.File(blob.Name(), language, buf) ctx.Data["LexerName"] = lexerName if err != nil { @@ -283,17 +296,7 @@ func prepareToRenderFile(ctx *context.Context, entry *git.TreeEntry) { } } - if ctx.Repo.GitRepo != nil { - checker, deferable := ctx.Repo.GitRepo.CheckAttributeReader(ctx.Repo.CommitID) - if checker != nil { - defer deferable() - attrs, err := checker.CheckPath(ctx.Repo.TreePath) - if err == nil { - ctx.Data["IsVendored"] = git.AttributeToBool(attrs, git.AttributeLinguistVendored).Value() - ctx.Data["IsGenerated"] = git.AttributeToBool(attrs, git.AttributeLinguistGenerated).Value() - } - } - } + ctx.Data["IsVendored"], ctx.Data["IsGenerated"] = attrs.GetVendored().Value(), attrs.GetGenerated().Value() if fInfo.st.IsImage() && !fInfo.st.IsSvgImage() { img, _, err := image.DecodeConfig(bytes.NewReader(buf)) diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index b9781cf8d0..9ee86d9dfc 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -25,6 +25,7 @@ import ( "code.gitea.io/gitea/modules/analyze" "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/git/attribute" "code.gitea.io/gitea/modules/highlight" "code.gitea.io/gitea/modules/lfs" "code.gitea.io/gitea/modules/log" @@ -1237,24 +1238,21 @@ func GetDiffForRender(ctx context.Context, gitRepo *git.Repository, opts *DiffOp return nil, err } - checker, deferrable := gitRepo.CheckAttributeReader(opts.AfterCommitID) - defer deferrable() + checker, err := attribute.NewBatchChecker(gitRepo, opts.AfterCommitID, []string{attribute.LinguistVendored, attribute.LinguistGenerated, attribute.LinguistLanguage, attribute.GitlabLanguage}) + if err != nil { + return nil, err + } + defer checker.Close() for _, diffFile := range diff.Files { isVendored := optional.None[bool]() isGenerated := optional.None[bool]() - if checker != nil { - attrs, err := checker.CheckPath(diffFile.Name) - if err == nil { - isVendored = git.AttributeToBool(attrs, git.AttributeLinguistVendored) - isGenerated = git.AttributeToBool(attrs, git.AttributeLinguistGenerated) - - language := git.TryReadLanguageAttribute(attrs) - if language.Has() { - diffFile.Language = language.Value() - } - } else { - checker = nil // CheckPath fails, it's not impossible to "check" anymore + attrs, err := checker.CheckPath(diffFile.Name) + if err == nil { + isVendored, isGenerated = attrs.GetVendored(), attrs.GetGenerated() + language := attrs.GetLanguage() + if language.Has() { + diffFile.Language = language.Value() } } diff --git a/services/markup/renderhelper_codepreview.go b/services/markup/renderhelper_codepreview.go index 28d1120984..fa1eb824a2 100644 --- a/services/markup/renderhelper_codepreview.go +++ b/services/markup/renderhelper_codepreview.go @@ -14,13 +14,13 @@ import ( "code.gitea.io/gitea/models/repo" "code.gitea.io/gitea/models/unit" "code.gitea.io/gitea/modules/charset" + "code.gitea.io/gitea/modules/git/languagestats" "code.gitea.io/gitea/modules/gitrepo" "code.gitea.io/gitea/modules/indexer/code" "code.gitea.io/gitea/modules/markup" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/util" gitea_context "code.gitea.io/gitea/services/context" - "code.gitea.io/gitea/services/repository/files" ) func renderRepoFileCodePreview(ctx context.Context, opts markup.RenderCodePreviewOptions) (template.HTML, error) { @@ -61,7 +61,7 @@ func renderRepoFileCodePreview(ctx context.Context, opts markup.RenderCodePrevie return "", err } - language, _ := files.TryGetContentLanguage(gitRepo, opts.CommitID, opts.FilePath) + language, _ := languagestats.GetFileLanguage(ctx, gitRepo, opts.CommitID, opts.FilePath) blob, err := commit.GetBlobByPath(opts.FilePath) if err != nil { return "", err diff --git a/services/repository/files/content.go b/services/repository/files/content.go index e23cd1abce..0327e7f2ce 100644 --- a/services/repository/files/content.go +++ b/services/repository/files/content.go @@ -277,28 +277,3 @@ func GetBlobBySHA(ctx context.Context, repo *repo_model.Repository, gitRepo *git Content: content, }, nil } - -// TryGetContentLanguage tries to get the (linguist) language of the file content -func TryGetContentLanguage(gitRepo *git.Repository, commitID, treePath string) (string, error) { - indexFilename, worktree, deleteTemporaryFile, err := gitRepo.ReadTreeToTemporaryIndex(commitID) - if err != nil { - return "", err - } - - defer deleteTemporaryFile() - - filename2attribute2info, err := gitRepo.CheckAttribute(git.CheckAttributeOpts{ - CachedOnly: true, - Attributes: []string{git.AttributeLinguistLanguage, git.AttributeGitlabLanguage}, - Filenames: []string{treePath}, - IndexFile: indexFilename, - WorkTree: worktree, - }) - if err != nil { - return "", err - } - - language := git.TryReadLanguageAttribute(filename2attribute2info[treePath]) - - return language.Value(), nil -} diff --git a/services/repository/files/update.go b/services/repository/files/update.go index 3f6255e77a..75ede4976f 100644 --- a/services/repository/files/update.go +++ b/services/repository/files/update.go @@ -15,6 +15,7 @@ import ( repo_model "code.gitea.io/gitea/models/repo" user_model "code.gitea.io/gitea/models/user" "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/git/attribute" "code.gitea.io/gitea/modules/gitrepo" "code.gitea.io/gitea/modules/lfs" "code.gitea.io/gitea/modules/log" @@ -488,16 +489,15 @@ func CreateOrUpdateFile(ctx context.Context, t *TemporaryUploadRepository, file var lfsMetaObject *git_model.LFSMetaObject if setting.LFS.StartServer && hasOldBranch { // Check there is no way this can return multiple infos - filename2attribute2info, err := t.gitRepo.CheckAttribute(git.CheckAttributeOpts{ - Attributes: []string{"filter"}, + attributesMap, err := attribute.CheckAttributes(ctx, t.gitRepo, "" /* use temp repo's working dir */, attribute.CheckAttributeOpts{ + Attributes: []string{attribute.Filter}, Filenames: []string{file.Options.treePath}, - CachedOnly: true, }) if err != nil { return err } - if filename2attribute2info[file.Options.treePath] != nil && filename2attribute2info[file.Options.treePath]["filter"] == "lfs" { + if attributesMap[file.Options.treePath] != nil && attributesMap[file.Options.treePath].Get(attribute.Filter).ToString().Value() == "lfs" { // OK so we are supposed to LFS this data! pointer, err := lfs.GeneratePointer(treeObjectContentReader) if err != nil { diff --git a/services/repository/files/upload.go b/services/repository/files/upload.go index 2e4ed1744e..f348cb68ab 100644 --- a/services/repository/files/upload.go +++ b/services/repository/files/upload.go @@ -14,6 +14,7 @@ import ( repo_model "code.gitea.io/gitea/models/repo" user_model "code.gitea.io/gitea/models/user" "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/git/attribute" "code.gitea.io/gitea/modules/lfs" "code.gitea.io/gitea/modules/setting" ) @@ -105,12 +106,11 @@ func UploadRepoFiles(ctx context.Context, repo *repo_model.Repository, doer *use } } - var filename2attribute2info map[string]map[string]string + var attributesMap map[string]*attribute.Attributes if setting.LFS.StartServer { - filename2attribute2info, err = t.gitRepo.CheckAttribute(git.CheckAttributeOpts{ - Attributes: []string{"filter"}, + attributesMap, err = attribute.CheckAttributes(ctx, t.gitRepo, "" /* use temp repo's working dir */, attribute.CheckAttributeOpts{ + Attributes: []string{attribute.Filter}, Filenames: names, - CachedOnly: true, }) if err != nil { return err @@ -119,7 +119,7 @@ func UploadRepoFiles(ctx context.Context, repo *repo_model.Repository, doer *use // Copy uploaded files into repository. for i := range infos { - if err := copyUploadedLFSFileIntoRepository(ctx, &infos[i], filename2attribute2info, t, opts.TreePath); err != nil { + if err := copyUploadedLFSFileIntoRepository(ctx, &infos[i], attributesMap, t, opts.TreePath); err != nil { return err } } @@ -176,7 +176,7 @@ func UploadRepoFiles(ctx context.Context, repo *repo_model.Repository, doer *use return repo_model.DeleteUploads(ctx, uploads...) } -func copyUploadedLFSFileIntoRepository(ctx context.Context, info *uploadInfo, filename2attribute2info map[string]map[string]string, t *TemporaryUploadRepository, treePath string) error { +func copyUploadedLFSFileIntoRepository(ctx context.Context, info *uploadInfo, attributesMap map[string]*attribute.Attributes, t *TemporaryUploadRepository, treePath string) error { file, err := os.Open(info.upload.LocalPath()) if err != nil { return err @@ -184,7 +184,7 @@ func copyUploadedLFSFileIntoRepository(ctx context.Context, info *uploadInfo, fi defer file.Close() var objectHash string - if setting.LFS.StartServer && filename2attribute2info[info.upload.Name] != nil && filename2attribute2info[info.upload.Name]["filter"] == "lfs" { + if setting.LFS.StartServer && attributesMap[info.upload.Name] != nil && attributesMap[info.upload.Name].Get(attribute.Filter).ToString().Value() == "lfs" { // Handle LFS // FIXME: Inefficient! this should probably happen in models.Upload pointer, err := lfs.GeneratePointer(file)