Fix bleve fuzziness (#30799)

Fix #30797
Fix #30317
This commit is contained in:
wxiaoguang 2024-05-01 20:32:52 +08:00 committed by GitHub
parent f135cb7c94
commit 6f7cd94a02
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 16 additions and 10 deletions

View File

@ -39,8 +39,6 @@ import (
const (
unicodeNormalizeName = "unicodeNormalize"
maxBatchSize = 16
// fuzzyDenominator determines the levenshtein distance per each character of a keyword
fuzzyDenominator = 4
)
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
@ -245,7 +243,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
if opts.IsKeywordFuzzy {
phraseQuery.Fuzziness = len(opts.Keyword) / fuzzyDenominator
phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
}
if len(opts.RepoIDs) > 0 {

View File

@ -47,3 +47,15 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
return index, 0, nil
}
func GuessFuzzinessByKeyword(s string) int {
// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
for _, r := range s {
if r >= 128 {
return 0
}
}
return min(2, len(s)/4)
}

View File

@ -35,11 +35,7 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
})
}
const (
maxBatchSize = 16
// fuzzyDenominator determines the levenshtein distance per each character of a keyword
fuzzyDenominator = 4
)
const maxBatchSize = 16
// IndexerData an update to the issue indexer
type IndexerData internal.IndexerData
@ -162,7 +158,7 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
if options.Keyword != "" {
fuzziness := 0
if options.IsFuzzyKeyword {
fuzziness = len(options.Keyword) / fuzzyDenominator
fuzziness = inner_bleve.GuessFuzzinessByKeyword(options.Keyword)
}
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{

View File

@ -28,6 +28,7 @@ func Search(ctx *context.Context) {
ctx.Data["Language"] = language
ctx.Data["IsFuzzy"] = isFuzzy
ctx.Data["PageIsViewCode"] = true
ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
if keyword == "" {
ctx.HTML(http.StatusOK, tplSearch)
@ -86,7 +87,6 @@ func Search(ctx *context.Context) {
}
}
ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
ctx.Data["Repo"] = ctx.Repo.Repository
ctx.Data["SearchResults"] = searchResults
ctx.Data["SearchResultLanguages"] = searchResultLanguages