summaryrefslogtreecommitdiff
path: root/src/cmd/vendor/golang.org/x/tools/internal/lsp/fuzzy/symbol.go
blob: 073a4cd101220b6c02a38b10ca1100d5ad3f3c3d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package fuzzy

import (
	"unicode"
)

// SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols
// of the form:
//
//	example.com/path/to/package.object.field
//
// Knowing that we are matching symbols like this allows us to make the
// following optimizations:
//   - We can incorporate right-to-left relevance directly into the score
//     calculation.
//   - We can match from right to left, discarding leading bytes if the input is
//     too long.
//   - We just take the right-most match without losing too much precision. This
//     allows us to use an O(n) algorithm.
//   - We can operate directly on chunked strings; in many cases we will
//     be storing the package path and/or package name separately from the
//     symbol or identifiers, so doing this avoids allocating strings.
//   - We can return the index of the right-most match, allowing us to trim
//     irrelevant qualification.
//
// This implementation is experimental, serving as a reference fast algorithm
// to compare to the fuzzy algorithm implemented by Matcher.
type SymbolMatcher struct {
	// Using buffers of length 256 is both a reasonable size for most qualified
	// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
	pattern     [256]rune
	patternLen  uint8
	inputBuffer [256]rune   // avoid allocating when considering chunks
	roles       [256]uint32 // which roles does a rune play (word start, etc.)
	segments    [256]uint8  // how many segments from the right is each rune
}

const (
	segmentStart uint32 = 1 << iota
	wordStart
	separator
)

// NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
// search pattern.
//
// Currently this matcher only accepts case-insensitive fuzzy patterns.
//
// An empty pattern matches no input.
func NewSymbolMatcher(pattern string) *SymbolMatcher {
	m := &SymbolMatcher{}
	for _, p := range pattern {
		m.pattern[m.patternLen] = unicode.ToLower(p)
		m.patternLen++
		if m.patternLen == 255 || int(m.patternLen) == len(pattern) {
			// break at 255 so that we can represent patternLen with a uint8.
			break
		}
	}
	return m
}

// Match looks for the right-most match of the search pattern within the symbol
// represented by concatenating the given chunks, returning its offset and
// score.
//
// If a match is found, the first return value will hold the absolute byte
// offset within all chunks for the start of the symbol. In other words, the
// index of the match within strings.Join(chunks, ""). If no match is found,
// the first return value will be -1.
//
// The second return value will be the score of the match, which is always
// between 0 and 1, inclusive. A score of 0 indicates no match.
func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
	// Explicit behavior for an empty pattern.
	//
	// As a minor optimization, this also avoids nilness checks later on, since
	// the compiler can prove that m != nil.
	if m.patternLen == 0 {
		return -1, 0
	}

	// First phase: populate the input buffer with lower-cased runes.
	//
	// We could also check for a forward match here, but since we'd have to write
	// the entire input anyway this has negligible impact on performance.

	var (
		inputLen  = uint8(0)
		modifiers = wordStart | segmentStart
	)

input:
	for _, chunk := range chunks {
		for _, r := range chunk {
			if r == '.' || r == '/' {
				modifiers |= separator
			}
			// optimization: avoid calls to unicode.ToLower, which can't be inlined.
			l := r
			if r <= unicode.MaxASCII {
				if 'A' <= r && r <= 'Z' {
					l = r + 'a' - 'A'
				}
			} else {
				l = unicode.ToLower(r)
			}
			if l != r {
				modifiers |= wordStart
			}
			m.inputBuffer[inputLen] = l
			m.roles[inputLen] = modifiers
			inputLen++
			if m.roles[inputLen-1]&separator != 0 {
				modifiers = wordStart | segmentStart
			} else {
				modifiers = 0
			}
			// TODO: we should prefer the right-most input if it overflows, rather
			//       than the left-most as we're doing here.
			if inputLen == 255 {
				break input
			}
		}
	}

	// Second phase: find the right-most match, and count segments from the
	// right.

	var (
		pi    = uint8(m.patternLen - 1) // pattern index
		p     = m.pattern[pi]           // pattern rune
		start = -1                      // start offset of match
		rseg  = uint8(0)
	)
	const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.

	for ii := inputLen - 1; ; ii-- {
		r := m.inputBuffer[ii]
		if rseg < maxSeg && m.roles[ii]&separator != 0 {
			rseg++
		}
		m.segments[ii] = rseg
		if p == r {
			if pi == 0 {
				start = int(ii)
				break
			}
			pi--
			p = m.pattern[pi]
		}
		// Don't check ii >= 0 in the loop condition: ii is a uint8.
		if ii == 0 {
			break
		}
	}

	if start < 0 {
		// no match: skip scoring
		return -1, 0
	}

	// Third phase: find the shortest match, and compute the score.

	// Score is the average score for each character.
	//
	// A character score is the multiple of:
	//   1. 1.0 if the character starts a segment, .8 if the character start a
	//      mid-segment word, otherwise 0.6. This carries over to immediately
	//      following characters.
	//   2. For the final character match, the multiplier from (1) is reduced to
	//     .8 if the next character in the input is a mid-segment word, or 0.6 if
	//      the next character in the input is not a word or segment start. This
	//      ensures that we favor whole-word or whole-segment matches over prefix
	//      matches.
	//   3. 1.0 if the character is part of the last segment, otherwise
	//      1.0-.2*<segments from the right>, with a max segment count of 3.
	//
	// This is a very naive algorithm, but it is fast. There's lots of prior art
	// here, and we should leverage it. For example, we could explicitly consider
	// character distance, and exact matches of words or segments.
	//
	// Also note that this might not actually find the highest scoring match, as
	// doing so could require a non-linear algorithm, depending on how the score
	// is calculated.

	pi = 0
	p = m.pattern[pi]

	const (
		segStreak  = 1.0
		wordStreak = 0.8
		noStreak   = 0.6
		perSegment = 0.2 // we count at most 3 segments above
	)

	streakBonus := noStreak
	totScore := 0.0
	for ii := uint8(start); ii < inputLen; ii++ {
		r := m.inputBuffer[ii]
		if r == p {
			pi++
			p = m.pattern[pi]
			// Note: this could be optimized with some bit operations.
			switch {
			case m.roles[ii]&segmentStart != 0 && segStreak > streakBonus:
				streakBonus = segStreak
			case m.roles[ii]&wordStart != 0 && wordStreak > streakBonus:
				streakBonus = wordStreak
			}
			finalChar := pi >= m.patternLen
			// finalCost := 1.0
			if finalChar && streakBonus > noStreak {
				switch {
				case ii == inputLen-1 || m.roles[ii+1]&segmentStart != 0:
					// Full segment: no reduction
				case m.roles[ii+1]&wordStart != 0:
					streakBonus = wordStreak
				default:
					streakBonus = noStreak
				}
			}
			totScore += streakBonus * (1.0 - float64(m.segments[ii])*perSegment)
			if finalChar {
				break
			}
		} else {
			streakBonus = noStreak
		}
	}

	return start, totScore / float64(m.patternLen)
}