summaryrefslogtreecommitdiff
path: root/src/pkg/index/suffixarray/suffixarray.go
blob: acc9a785f0ae120d1a36e3e320ab47ca8b06180e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The suffixarray package implements substring search in logarithmic time
// using an in-memory suffix array.
//
// Example use:
//
//	// create index for some data
//	index := suffixarray.New(data)
//
//	// lookup byte slice s
//	offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
//	offsets2 := index.Lookup(s, 3)  // the list of at most 3 indices where s occurs in data
//
package suffixarray

import (
	"bytes"
	"container/vector"
	"sort"
)

// BUG(gri): For larger data (10MB) which contains very long (say 100000)
// contiguous sequences of identical bytes, index creation time will be extremely slow.

// TODO(gri): Use a more sophisticated algorithm to create the suffix array.


// Index implements a suffix array for fast substring search.
type Index struct {
	data []byte
	sa   []int // suffix array for data
}


// New creates a new Index for data.
// Index creation time is approximately O(N*log(N)) for N = len(data).
//
func New(data []byte) *Index {
	sa := make([]int, len(data))
	for i, _ := range sa {
		sa[i] = i
	}
	x := &index{data, sa}
	sort.Sort(x)
	return (*Index)(x)
}


func (x *Index) at(i int) []byte {
	return x.data[x.sa[i]:]
}


// Binary search according to "A Method of Programming", E.W. Dijkstra.
func (x *Index) search(s []byte) int {
	i, j := 0, len(x.sa)
	// i < j for non-empty x
	for i+1 < j {
		// 0 <= i < j <= len(x.sa) && (x.at(i) <= s < x.at(j) || (s is not in x))
		h := i + (j-i)/2 // i < h < j
		if bytes.Compare(x.at(h), s) <= 0 {
			i = h
		} else { // s < x.at(h)
			j = h
		}
	}
	// i+1 == j for non-empty x
	return i
}


// Lookup returns an unsorted list of at most n indices where the byte string s
// occurs in the indexed data. If n < 0, all occurrences are returned.
// The result is nil if s is empty, s is not found, or n == 0.
// Lookup time is O((log(N) + len(res))*len(s)) where N is the
// size of the indexed data.
//
func (x *Index) Lookup(s []byte, n int) []int {
	var res vector.IntVector

	if len(s) > 0 && n != 0 {
		// find matching suffix index i
		i := x.search(s)
		// x.at(i) <= s < x.at(i+1)

		// ignore the first suffix if it is < s
		if i < len(x.sa) && bytes.Compare(x.at(i), s) < 0 {
			i++
		}

		// collect the following suffixes with matching prefixes
		for (n < 0 || len(res) < n) && i < len(x.sa) && bytes.HasPrefix(x.at(i), s) {
			res.Push(x.sa[i])
			i++
		}
	}

	return res
}


// index is like Index; it is only used to hide the sort.Interface methods
type index struct {
	data []byte
	sa   []int
}


// index implements sort.Interface

func (x *index) Len() int           { return len(x.sa) }
func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 }
func (x *index) Swap(i, j int)      { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
func (a *index) at(i int) []byte    { return a.data[a.sa[i]:] }