Bruno Sofiato f64fbd9b74
Updated tokenizer for better matching when searching for code snippets
This PR improves the accuracy of Gitea's code search. 

Currently, Gitea does not consider statements such as
`console.log("hello")` as hits when the user searches for `log`. The
culprit is how both ES and Bleve are tokenizing the file contents (in
both cases, `console.log` is a whole token).

In ES' case, we changed the tokenizer to
[simple_pattern_split](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simplepatternsplit-tokenizer.html#:~:text=The%20simple_pattern_split%20tokenizer%20uses%20a,the%20tokenization%20is%20generally%20faster.).
In such a case, tokens are words formed by digits and letters. In
Bleve's case, it employs a
[letter](https://blevesearch.com/docs/Tokenizers/) tokenizer.

Resolves 

---------

Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
2024-11-06 20:51:20 +00:00

90 lines
2.6 KiB
Go

// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package bleve
import (
"errors"
"os"
"unicode"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/util"
"github.com/blevesearch/bleve/v2"
unicode_tokenizer "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/index/upsidedown"
"github.com/ethantkoenig/rupture"
)
const (
// maxFuzziness is the upper bound applied to every fuzziness value computed
// in this file. A comment in guessFuzzinessByKeyword cites
// https://github.com/blevesearch/bleve/issues/1563 for 2 being the largest
// fuzziness bleve supports.
maxFuzziness = 2
)
// openIndexer opens the bleve index stored at path and checks it against
// latestVersion.
//
// Return values, by situation:
//   - nothing exists at path: (nil, 0, nil);
//   - the stored metadata version is lower than latestVersion: the path is
//     removed and (nil, storedVersion, removalError) is returned;
//   - bleve reports an incompatible on-disk version: the path is removed and
//     (nil, 0, removalError) is returned;
//   - any other failure: (nil, 0, err);
//   - success: (index, 0, nil).
//
// A nil index with a nil error therefore means the index has to be created
// (or re-created).
func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
	if _, statErr := os.Stat(path); statErr != nil {
		if os.IsNotExist(statErr) {
			// Nothing on disk yet; this is not an error.
			return nil, 0, nil
		}
		return nil, 0, statErr
	}

	meta, metaErr := rupture.ReadIndexMetadata(path)
	if metaErr != nil {
		return nil, 0, metaErr
	}
	if meta.Version < latestVersion {
		// The index was written by an older indexer version: drop it and
		// report the old version so it can be re-populated.
		return nil, meta.Version, util.RemoveAll(path)
	}

	idx, openErr := bleve.Open(path)
	switch {
	case openErr == nil:
		return idx, 0, nil
	case errors.Is(openErr, upsidedown.IncompatibleVersion):
		log.Warn("Indexer was built with a previous version of bleve, deleting and rebuilding")
		return nil, 0, util.RemoveAll(path)
	default:
		return nil, 0, openErr
	}
}
// GuessFuzzinessByKeyword picks the fuzziness to use when searching for the
// phrase s. Fuzziness is a Levenshtein distance: it determines how many
// characters may differ between two strings that are still treated as
// equivalent.
//
// The phrase is split with bleve's unicode tokenizer and each token is scored
// by guessFuzzinessByKeyword; the lowest score among the tokens is returned,
// never exceeding maxFuzziness. A phrase that produces no tokens gets 0.
func GuessFuzzinessByKeyword(s string) int {
	terms := unicode_tokenizer.NewUnicodeTokenizer().Tokenize([]byte(s))
	if len(terms) == 0 {
		return 0
	}

	lowest := maxFuzziness
	for i := range terms {
		if score := guessFuzzinessByKeyword(string(terms[i].Term)); score < lowest {
			lowest = score
		}
	}
	return lowest
}
// guessFuzzinessByKeyword scores a single keyword s.
//
// It returns 0 as soon as s contains a rune that is outside the ASCII range or
// is not a letter. Otherwise it grants one unit of fuzziness per four bytes of
// s, capped at maxFuzziness.
func guessFuzzinessByKeyword(s string) int {
	// Per https://github.com/blevesearch/bleve/issues/1563 the largest
	// fuzziness bleve supports is 2. The divisor 4 is a chosen magic number:
	// one allowed edit for every four characters of the keyword.
	// Fuzzy matching on CJK input (eg: `갃갃갃` `啊啊啊`) mismatches a lot, and
	// terms containing non-letter characters should not be fuzzy either, so
	// both cases are forced to exact matching.
	for _, ch := range s {
		if asciiLetter := ch <= unicode.MaxASCII && unicode.IsLetter(ch); !asciiLetter {
			return 0
		}
	}

	if allowance := len(s) / 4; allowance < maxFuzziness {
		return allowance
	}
	return maxFuzziness
}