summaryrefslogtreecommitdiff
path: root/libgo/go/exp/locale/collate/contract.go
blob: 0d9bc401bcfb452759d0ae2b7afee574bcfb9da9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package collate

import "unicode/utf8"

// For a description of contractTrieSet, see exp/locale/collate/build/contract.go.

type contractTrieSet []struct{ l, h, n, i uint8 }

// ctScanner is used to match a trie to an input sequence.
// A contraction may match a non-contiguous sequence of bytes in an input string.
// For example, if there is a contraction for <a, combining_ring>, it should match
// the sequence <a, combining_cedilla, combining_ring>, as combining_cedilla does
// not block combining_ring.
// ctScanner does not automatically skip over non-blocking non-starters, but rather
// retains the state of the last match and leaves it up to the user to continue
// the match at the appropriate points.
type ctScanner struct {
	states contractTrieSet
	s      []byte
	n      int
	index  int
	pindex int
	done   bool
}

func (t contractTrieSet) scanner(index, n int, b []byte) ctScanner {
	return ctScanner{states: t[index:], s: b, n: n}
}

// result returns the offset i and bytes consumed p so far.  If no suffix
// matched, i and p will be 0.
func (s *ctScanner) result() (i, p int) {
	return s.index, s.pindex
}

const (
	final   = 0
	noIndex = 0xFF
)

// scan matches the longest suffix at the current location in the input
// and returns the number of bytes consumed.
func (s *ctScanner) scan(p int) int {
	pr := p // the p at the rune start
	str := s.s
	states, n := s.states, s.n
	for i := 0; i < n && p < len(str); {
		e := states[i]
		c := str[p]
		// TODO: a significant number of contractions are of a form that
		// cannot match discontiguous UTF-8 in a normalized string. We could let
		// a negative value of e.n mean that we can set s.done = true and avoid
		// the need for additional matches.
		if c >= e.l {
			if e.l == c {
				p++
				if e.i != noIndex {
					s.index = int(e.i)
					s.pindex = p
				}
				if e.n != final {
					i, states, n = 0, states[int(e.h)+n:], int(e.n)
					if p >= len(str) || utf8.RuneStart(str[p]) {
						s.states, s.n, pr = states, n, p
					}
				} else {
					s.done = true
					return p
				}
				continue
			} else if e.n == final && c <= e.h {
				p++
				s.done = true
				s.index = int(c-e.l) + int(e.i)
				s.pindex = p
				return p
			}
		}
		i++
	}
	return pr
}