diff options
author | Austin Clements <austin@google.com> | 2014-10-22 13:25:37 -0400 |
---|---|---|
committer | Austin Clements <austin@google.com> | 2014-10-22 13:25:37 -0400 |
commit | 800a826ae560191d4dbb6c529eea15cf4349a06f (patch) | |
tree | c4206bb02365c3b9c66e2946031b86f0ee527faf /src/strings | |
parent | 2c586384223e980fd3303625ea6b02ed5d9fb9c0 (diff) | |
parent | 1678ee65674b332e900a703de296eb66fbadcf45 (diff) | |
download | go-800a826ae560191d4dbb6c529eea15cf4349a06f.tar.gz |
build: merge the great pkg/ rename into dev.power64
This also removes pkg/runtime/traceback_lr.c, which was ported
to Go in an earlier commit and then moved to
runtime/traceback.go.
Reviewer: rsc@golang.org
rsc: LGTM
Diffstat (limited to 'src/strings')
-rw-r--r-- | src/strings/example_test.go | 225 | ||||
-rw-r--r-- | src/strings/export_test.go | 45 | ||||
-rw-r--r-- | src/strings/reader.go | 144 | ||||
-rw-r--r-- | src/strings/reader_test.go | 159 | ||||
-rw-r--r-- | src/strings/replace.go | 518 | ||||
-rw-r--r-- | src/strings/replace_test.go | 542 | ||||
-rw-r--r-- | src/strings/search.go | 124 | ||||
-rw-r--r-- | src/strings/search_test.go | 90 | ||||
-rw-r--r-- | src/strings/strings.go | 765 | ||||
-rw-r--r-- | src/strings/strings.s | 5 | ||||
-rw-r--r-- | src/strings/strings_decl.go | 8 | ||||
-rw-r--r-- | src/strings/strings_test.go | 1204 |
12 files changed, 3829 insertions, 0 deletions
diff --git a/src/strings/example_test.go b/src/strings/example_test.go new file mode 100644 index 000000000..7243e16b1 --- /dev/null +++ b/src/strings/example_test.go @@ -0,0 +1,225 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "fmt" + "strings" + "unicode" +) + +func ExampleFields() { + fmt.Printf("Fields are: %q", strings.Fields(" foo bar baz ")) + // Output: Fields are: ["foo" "bar" "baz"] +} + +func ExampleFieldsFunc() { + f := func(c rune) bool { + return !unicode.IsLetter(c) && !unicode.IsNumber(c) + } + fmt.Printf("Fields are: %q", strings.FieldsFunc(" foo1;bar2,baz3...", f)) + // Output: Fields are: ["foo1" "bar2" "baz3"] +} + +func ExampleContains() { + fmt.Println(strings.Contains("seafood", "foo")) + fmt.Println(strings.Contains("seafood", "bar")) + fmt.Println(strings.Contains("seafood", "")) + fmt.Println(strings.Contains("", "")) + // Output: + // true + // false + // true + // true +} + +func ExampleContainsAny() { + fmt.Println(strings.ContainsAny("team", "i")) + fmt.Println(strings.ContainsAny("failure", "u & i")) + fmt.Println(strings.ContainsAny("foo", "")) + fmt.Println(strings.ContainsAny("", "")) + // Output: + // false + // true + // false + // false +} + +func ExampleCount() { + fmt.Println(strings.Count("cheese", "e")) + fmt.Println(strings.Count("five", "")) // before & after each rune + // Output: + // 3 + // 5 +} + +func ExampleEqualFold() { + fmt.Println(strings.EqualFold("Go", "go")) + // Output: true +} + +func ExampleIndex() { + fmt.Println(strings.Index("chicken", "ken")) + fmt.Println(strings.Index("chicken", "dmr")) + // Output: + // 4 + // -1 +} + +func ExampleIndexFunc() { + f := func(c rune) bool { + return unicode.Is(unicode.Han, c) + } + fmt.Println(strings.IndexFunc("Hello, 世界", f)) + fmt.Println(strings.IndexFunc("Hello, world", f)) + // Output: + // 7 + // -1 +} + +func ExampleIndexAny() { + fmt.Println(strings.IndexAny("chicken", "aeiouy")) + fmt.Println(strings.IndexAny("crwth", "aeiouy")) + // Output: + // 2 + // -1 +} + +func ExampleIndexRune() { + fmt.Println(strings.IndexRune("chicken", 'k')) + fmt.Println(strings.IndexRune("chicken", 'd')) + // Output: + // 4 + // -1 +} + +func ExampleLastIndex() { + fmt.Println(strings.Index("go gopher", "go")) + fmt.Println(strings.LastIndex("go gopher", "go")) + fmt.Println(strings.LastIndex("go gopher", "rodent")) + // Output: + // 0 + // 3 + // -1 +} + +func ExampleJoin() { + s := []string{"foo", "bar", "baz"} + fmt.Println(strings.Join(s, ", ")) + // Output: foo, bar, baz +} + +func ExampleRepeat() { + fmt.Println("ba" + strings.Repeat("na", 2)) + // Output: banana +} + +func ExampleReplace() { + fmt.Println(strings.Replace("oink oink oink", "k", "ky", 2)) + fmt.Println(strings.Replace("oink oink oink", "oink", "moo", -1)) + // Output: + // oinky oinky oink + // moo moo moo +} + +func ExampleSplit() { + fmt.Printf("%q\n", strings.Split("a,b,c", ",")) + fmt.Printf("%q\n", strings.Split("a man a plan a canal panama", "a ")) + fmt.Printf("%q\n", strings.Split(" xyz ", "")) + fmt.Printf("%q\n", strings.Split("", "Bernardo O'Higgins")) + // Output: + // ["a" "b" "c"] + // ["" "man " "plan " "canal panama"] + // [" " "x" "y" "z" " "] + // [""] +} + +func ExampleSplitN() { + fmt.Printf("%q\n", strings.SplitN("a,b,c", ",", 2)) + z := strings.SplitN("a,b,c", ",", 0) + fmt.Printf("%q (nil = %v)\n", z, z == nil) + // Output: + // ["a" "b,c"] + // [] (nil = true) +} + +func ExampleSplitAfter() { + fmt.Printf("%q\n", strings.SplitAfter("a,b,c", ",")) + // Output: ["a," "b," "c"] +} + +func ExampleSplitAfterN() { + fmt.Printf("%q\n", strings.SplitAfterN("a,b,c", ",", 2)) + // Output: ["a," "b,c"] +} + +func ExampleTitle() { + fmt.Println(strings.Title("her royal highness")) + // Output: Her Royal Highness +} + +func ExampleToTitle() { + fmt.Println(strings.ToTitle("loud noises")) + fmt.Println(strings.ToTitle("хлеб")) + // Output: + // LOUD NOISES + // ХЛЕБ +} + +func ExampleTrim() { + fmt.Printf("[%q]", strings.Trim(" !!! Achtung! Achtung! !!! ", "! ")) + // Output: ["Achtung! Achtung"] +} + +func ExampleMap() { + rot13 := func(r rune) rune { + switch { + case r >= 'A' && r <= 'Z': + return 'A' + (r-'A'+13)%26 + case r >= 'a' && r <= 'z': + return 'a' + (r-'a'+13)%26 + } + return r + } + fmt.Println(strings.Map(rot13, "'Twas brillig and the slithy gopher...")) + // Output: 'Gjnf oevyyvt naq gur fyvgul tbcure... +} + +func ExampleTrimSpace() { + fmt.Println(strings.TrimSpace(" \t\n a lone gopher \n\t\r\n")) + // Output: a lone gopher +} + +func ExampleNewReplacer() { + r := strings.NewReplacer("<", "<", ">", ">") + fmt.Println(r.Replace("This is <b>HTML</b>!")) + // Output: This is <b>HTML</b>! +} + +func ExampleToUpper() { + fmt.Println(strings.ToUpper("Gopher")) + // Output: GOPHER +} + +func ExampleToLower() { + fmt.Println(strings.ToLower("Gopher")) + // Output: gopher +} + +func ExampleTrimSuffix() { + var s = "Hello, goodbye, etc!" + s = strings.TrimSuffix(s, "goodbye, etc!") + s = strings.TrimSuffix(s, "planet") + fmt.Print(s, "world!") + // Output: Hello, world! +} + +func ExampleTrimPrefix() { + var s = "Goodbye,, world!" + s = strings.TrimPrefix(s, "Goodbye,") + s = strings.TrimPrefix(s, "Howdy,") + fmt.Print("Hello" + s) + // Output: Hello, world! +} diff --git a/src/strings/export_test.go b/src/strings/export_test.go new file mode 100644 index 000000000..17c806aa5 --- /dev/null +++ b/src/strings/export_test.go @@ -0,0 +1,45 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +func (r *Replacer) Replacer() interface{} { + return r.r +} + +func (r *Replacer) PrintTrie() string { + gen := r.r.(*genericReplacer) + return gen.printNode(&gen.root, 0) +} + +func (r *genericReplacer) printNode(t *trieNode, depth int) (s string) { + if t.priority > 0 { + s += "+" + } else { + s += "-" + } + s += "\n" + + if t.prefix != "" { + s += Repeat(".", depth) + t.prefix + s += r.printNode(t.next, depth+len(t.prefix)) + } else if t.table != nil { + for b, m := range r.mapping { + if int(m) != r.tableSize && t.table[m] != nil { + s += Repeat(".", depth) + string([]byte{byte(b)}) + s += r.printNode(t.table[m], depth+1) + } + } + } + return +} + +func StringFind(pattern, text string) int { + return makeStringFinder(pattern).next(text) +} + +func DumpTables(pattern string) ([]int, []int) { + finder := makeStringFinder(pattern) + return finder.badCharSkip[:], finder.goodSuffixSkip +} diff --git a/src/strings/reader.go b/src/strings/reader.go new file mode 100644 index 000000000..82df97439 --- /dev/null +++ b/src/strings/reader.go @@ -0,0 +1,144 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +import ( + "errors" + "io" + "unicode/utf8" +) + +// A Reader implements the io.Reader, io.ReaderAt, io.Seeker, io.WriterTo, +// io.ByteScanner, and io.RuneScanner interfaces by reading +// from a string. +type Reader struct { + s string + i int64 // current reading index + prevRune int // index of previous rune; or < 0 +} + +// Len returns the number of bytes of the unread portion of the +// string. +func (r *Reader) Len() int { + if r.i >= int64(len(r.s)) { + return 0 + } + return int(int64(len(r.s)) - r.i) +} + +func (r *Reader) Read(b []byte) (n int, err error) { + if len(b) == 0 { + return 0, nil + } + if r.i >= int64(len(r.s)) { + return 0, io.EOF + } + r.prevRune = -1 + n = copy(b, r.s[r.i:]) + r.i += int64(n) + return +} + +func (r *Reader) ReadAt(b []byte, off int64) (n int, err error) { + // cannot modify state - see io.ReaderAt + if off < 0 { + return 0, errors.New("strings.Reader.ReadAt: negative offset") + } + if off >= int64(len(r.s)) { + return 0, io.EOF + } + n = copy(b, r.s[off:]) + if n < len(b) { + err = io.EOF + } + return +} + +func (r *Reader) ReadByte() (b byte, err error) { + r.prevRune = -1 + if r.i >= int64(len(r.s)) { + return 0, io.EOF + } + b = r.s[r.i] + r.i++ + return +} + +func (r *Reader) UnreadByte() error { + r.prevRune = -1 + if r.i <= 0 { + return errors.New("strings.Reader.UnreadByte: at beginning of string") + } + r.i-- + return nil +} + +func (r *Reader) ReadRune() (ch rune, size int, err error) { + if r.i >= int64(len(r.s)) { + r.prevRune = -1 + return 0, 0, io.EOF + } + r.prevRune = int(r.i) + if c := r.s[r.i]; c < utf8.RuneSelf { + r.i++ + return rune(c), 1, nil + } + ch, size = utf8.DecodeRuneInString(r.s[r.i:]) + r.i += int64(size) + return +} + +func (r *Reader) UnreadRune() error { + if r.prevRune < 0 { + return errors.New("strings.Reader.UnreadRune: previous operation was not ReadRune") + } + r.i = int64(r.prevRune) + r.prevRune = -1 + return nil +} + +// Seek implements the io.Seeker interface. +func (r *Reader) Seek(offset int64, whence int) (int64, error) { + r.prevRune = -1 + var abs int64 + switch whence { + case 0: + abs = offset + case 1: + abs = int64(r.i) + offset + case 2: + abs = int64(len(r.s)) + offset + default: + return 0, errors.New("strings.Reader.Seek: invalid whence") + } + if abs < 0 { + return 0, errors.New("strings.Reader.Seek: negative position") + } + r.i = abs + return abs, nil +} + +// WriteTo implements the io.WriterTo interface. +func (r *Reader) WriteTo(w io.Writer) (n int64, err error) { + r.prevRune = -1 + if r.i >= int64(len(r.s)) { + return 0, nil + } + s := r.s[r.i:] + m, err := io.WriteString(w, s) + if m > len(s) { + panic("strings.Reader.WriteTo: invalid WriteString count") + } + r.i += int64(m) + n = int64(m) + if m != len(s) && err == nil { + err = io.ErrShortWrite + } + return +} + +// NewReader returns a new Reader reading from s. +// It is similar to bytes.NewBufferString but more efficient and read-only. +func NewReader(s string) *Reader { return &Reader{s, 0, -1} } diff --git a/src/strings/reader_test.go b/src/strings/reader_test.go new file mode 100644 index 000000000..bee90eb25 --- /dev/null +++ b/src/strings/reader_test.go @@ -0,0 +1,159 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "bytes" + "fmt" + "io" + "os" + "strings" + "sync" + "testing" +) + +func TestReader(t *testing.T) { + r := strings.NewReader("0123456789") + tests := []struct { + off int64 + seek int + n int + want string + wantpos int64 + seekerr string + }{ + {seek: os.SEEK_SET, off: 0, n: 20, want: "0123456789"}, + {seek: os.SEEK_SET, off: 1, n: 1, want: "1"}, + {seek: os.SEEK_CUR, off: 1, wantpos: 3, n: 2, want: "34"}, + {seek: os.SEEK_SET, off: -1, seekerr: "strings.Reader.Seek: negative position"}, + {seek: os.SEEK_SET, off: 1 << 33, wantpos: 1 << 33}, + {seek: os.SEEK_CUR, off: 1, wantpos: 1<<33 + 1}, + {seek: os.SEEK_SET, n: 5, want: "01234"}, + {seek: os.SEEK_CUR, n: 5, want: "56789"}, + {seek: os.SEEK_END, off: -1, n: 1, wantpos: 9, want: "9"}, + } + + for i, tt := range tests { + pos, err := r.Seek(tt.off, tt.seek) + if err == nil && tt.seekerr != "" { + t.Errorf("%d. want seek error %q", i, tt.seekerr) + continue + } + if err != nil && err.Error() != tt.seekerr { + t.Errorf("%d. seek error = %q; want %q", i, err.Error(), tt.seekerr) + continue + } + if tt.wantpos != 0 && tt.wantpos != pos { + t.Errorf("%d. pos = %d, want %d", i, pos, tt.wantpos) + } + buf := make([]byte, tt.n) + n, err := r.Read(buf) + if err != nil { + t.Errorf("%d. read = %v", i, err) + continue + } + got := string(buf[:n]) + if got != tt.want { + t.Errorf("%d. got %q; want %q", i, got, tt.want) + } + } +} + +func TestReadAfterBigSeek(t *testing.T) { + r := strings.NewReader("0123456789") + if _, err := r.Seek(1<<31+5, os.SEEK_SET); err != nil { + t.Fatal(err) + } + if n, err := r.Read(make([]byte, 10)); n != 0 || err != io.EOF { + t.Errorf("Read = %d, %v; want 0, EOF", n, err) + } +} + +func TestReaderAt(t *testing.T) { + r := strings.NewReader("0123456789") + tests := []struct { + off int64 + n int + want string + wanterr interface{} + }{ + {0, 10, "0123456789", nil}, + {1, 10, "123456789", io.EOF}, + {1, 9, "123456789", nil}, + {11, 10, "", io.EOF}, + {0, 0, "", nil}, + {-1, 0, "", "strings.Reader.ReadAt: negative offset"}, + } + for i, tt := range tests { + b := make([]byte, tt.n) + rn, err := r.ReadAt(b, tt.off) + got := string(b[:rn]) + if got != tt.want { + t.Errorf("%d. got %q; want %q", i, got, tt.want) + } + if fmt.Sprintf("%v", err) != fmt.Sprintf("%v", tt.wanterr) { + t.Errorf("%d. got error = %v; want %v", i, err, tt.wanterr) + } + } +} + +func TestReaderAtConcurrent(t *testing.T) { + // Test for the race detector, to verify ReadAt doesn't mutate + // any state. + r := strings.NewReader("0123456789") + var wg sync.WaitGroup + for i := 0; i < 5; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + var buf [1]byte + r.ReadAt(buf[:], int64(i)) + }(i) + } + wg.Wait() +} + +func TestEmptyReaderConcurrent(t *testing.T) { + // Test for the race detector, to verify a Read that doesn't yield any bytes + // is okay to use from multiple goroutines. This was our historic behavior. + // See golang.org/issue/7856 + r := strings.NewReader("") + var wg sync.WaitGroup + for i := 0; i < 5; i++ { + wg.Add(2) + go func() { + defer wg.Done() + var buf [1]byte + r.Read(buf[:]) + }() + go func() { + defer wg.Done() + r.Read(nil) + }() + } + wg.Wait() +} + +func TestWriteTo(t *testing.T) { + const str = "0123456789" + for i := 0; i <= len(str); i++ { + s := str[i:] + r := strings.NewReader(s) + var b bytes.Buffer + n, err := r.WriteTo(&b) + if expect := int64(len(s)); n != expect { + t.Errorf("got %v; want %v", n, expect) + } + if err != nil { + t.Errorf("for length %d: got error = %v; want nil", len(s), err) + } + if b.String() != s { + t.Errorf("got string %q; want %q", b.String(), s) + } + if r.Len() != 0 { + t.Errorf("reader contains %v bytes; want 0", r.Len()) + } + } +} diff --git a/src/strings/replace.go b/src/strings/replace.go new file mode 100644 index 000000000..4752641be --- /dev/null +++ b/src/strings/replace.go @@ -0,0 +1,518 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +import "io" + +// Replacer replaces a list of strings with replacements. +// It is safe for concurrent use by multiple goroutines. +type Replacer struct { + r replacer +} + +// replacer is the interface that a replacement algorithm needs to implement. +type replacer interface { + Replace(s string) string + WriteString(w io.Writer, s string) (n int, err error) +} + +// NewReplacer returns a new Replacer from a list of old, new string pairs. +// Replacements are performed in order, without overlapping matches. +func NewReplacer(oldnew ...string) *Replacer { + if len(oldnew)%2 == 1 { + panic("strings.NewReplacer: odd argument count") + } + + if len(oldnew) == 2 && len(oldnew[0]) > 1 { + return &Replacer{r: makeSingleStringReplacer(oldnew[0], oldnew[1])} + } + + allNewBytes := true + for i := 0; i < len(oldnew); i += 2 { + if len(oldnew[i]) != 1 { + return &Replacer{r: makeGenericReplacer(oldnew)} + } + if len(oldnew[i+1]) != 1 { + allNewBytes = false + } + } + + if allNewBytes { + r := byteReplacer{} + for i := range r { + r[i] = byte(i) + } + // The first occurrence of old->new map takes precedence + // over the others with the same old string. + for i := len(oldnew) - 2; i >= 0; i -= 2 { + o := oldnew[i][0] + n := oldnew[i+1][0] + r[o] = n + } + return &Replacer{r: &r} + } + + r := byteStringReplacer{} + // The first occurrence of old->new map takes precedence + // over the others with the same old string. + for i := len(oldnew) - 2; i >= 0; i -= 2 { + o := oldnew[i][0] + n := oldnew[i+1] + r[o] = []byte(n) + } + return &Replacer{r: &r} +} + +// Replace returns a copy of s with all replacements performed. +func (r *Replacer) Replace(s string) string { + return r.r.Replace(s) +} + +// WriteString writes s to w with all replacements performed. +func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) { + return r.r.WriteString(w, s) +} + +// trieNode is a node in a lookup trie for prioritized key/value pairs. Keys +// and values may be empty. For example, the trie containing keys "ax", "ay", +// "bcbc", "x" and "xy" could have eight nodes: +// +// n0 - +// n1 a- +// n2 .x+ +// n3 .y+ +// n4 b- +// n5 .cbc+ +// n6 x+ +// n7 .y+ +// +// n0 is the root node, and its children are n1, n4 and n6; n1's children are +// n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked +// with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7 +// (marked with a trailing "+") are complete keys. +type trieNode struct { + // value is the value of the trie node's key/value pair. It is empty if + // this node is not a complete key. + value string + // priority is the priority (higher is more important) of the trie node's + // key/value pair; keys are not necessarily matched shortest- or longest- + // first. Priority is positive if this node is a complete key, and zero + // otherwise. In the example above, positive/zero priorities are marked + // with a trailing "+" or "-". + priority int + + // A trie node may have zero, one or more child nodes: + // * if the remaining fields are zero, there are no children. + // * if prefix and next are non-zero, there is one child in next. + // * if table is non-zero, it defines all the children. + // + // Prefixes are preferred over tables when there is one child, but the + // root node always uses a table for lookup efficiency. + + // prefix is the difference in keys between this trie node and the next. + // In the example above, node n4 has prefix "cbc" and n4's next node is n5. + // Node n5 has no children and so has zero prefix, next and table fields. + prefix string + next *trieNode + + // table is a lookup table indexed by the next byte in the key, after + // remapping that byte through genericReplacer.mapping to create a dense + // index. In the example above, the keys only use 'a', 'b', 'c', 'x' and + // 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and + // genericReplacer.tableSize will be 5. Node n0's table will be + // []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped + // 'a', 'b' and 'x'. + table []*trieNode +} + +func (t *trieNode) add(key, val string, priority int, r *genericReplacer) { + if key == "" { + if t.priority == 0 { + t.value = val + t.priority = priority + } + return + } + + if t.prefix != "" { + // Need to split the prefix among multiple nodes. + var n int // length of the longest common prefix + for ; n < len(t.prefix) && n < len(key); n++ { + if t.prefix[n] != key[n] { + break + } + } + if n == len(t.prefix) { + t.next.add(key[n:], val, priority, r) + } else if n == 0 { + // First byte differs, start a new lookup table here. Looking up + // what is currently t.prefix[0] will lead to prefixNode, and + // looking up key[0] will lead to keyNode. + var prefixNode *trieNode + if len(t.prefix) == 1 { + prefixNode = t.next + } else { + prefixNode = &trieNode{ + prefix: t.prefix[1:], + next: t.next, + } + } + keyNode := new(trieNode) + t.table = make([]*trieNode, r.tableSize) + t.table[r.mapping[t.prefix[0]]] = prefixNode + t.table[r.mapping[key[0]]] = keyNode + t.prefix = "" + t.next = nil + keyNode.add(key[1:], val, priority, r) + } else { + // Insert new node after the common section of the prefix. + next := &trieNode{ + prefix: t.prefix[n:], + next: t.next, + } + t.prefix = t.prefix[:n] + t.next = next + next.add(key[n:], val, priority, r) + } + } else if t.table != nil { + // Insert into existing table. + m := r.mapping[key[0]] + if t.table[m] == nil { + t.table[m] = new(trieNode) + } + t.table[m].add(key[1:], val, priority, r) + } else { + t.prefix = key + t.next = new(trieNode) + t.next.add("", val, priority, r) + } +} + +func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) { + // Iterate down the trie to the end, and grab the value and keylen with + // the highest priority. + bestPriority := 0 + node := &r.root + n := 0 + for node != nil { + if node.priority > bestPriority && !(ignoreRoot && node == &r.root) { + bestPriority = node.priority + val = node.value + keylen = n + found = true + } + + if s == "" { + break + } + if node.table != nil { + index := r.mapping[s[0]] + if int(index) == r.tableSize { + break + } + node = node.table[index] + s = s[1:] + n++ + } else if node.prefix != "" && HasPrefix(s, node.prefix) { + n += len(node.prefix) + s = s[len(node.prefix):] + node = node.next + } else { + break + } + } + return +} + +// genericReplacer is the fully generic algorithm. +// It's used as a fallback when nothing faster can be used. +type genericReplacer struct { + root trieNode + // tableSize is the size of a trie node's lookup table. It is the number + // of unique key bytes. + tableSize int + // mapping maps from key bytes to a dense index for trieNode.table. + mapping [256]byte +} + +func makeGenericReplacer(oldnew []string) *genericReplacer { + r := new(genericReplacer) + // Find each byte used, then assign them each an index. + for i := 0; i < len(oldnew); i += 2 { + key := oldnew[i] + for j := 0; j < len(key); j++ { + r.mapping[key[j]] = 1 + } + } + + for _, b := range r.mapping { + r.tableSize += int(b) + } + + var index byte + for i, b := range r.mapping { + if b == 0 { + r.mapping[i] = byte(r.tableSize) + } else { + r.mapping[i] = index + index++ + } + } + // Ensure root node uses a lookup table (for performance). + r.root.table = make([]*trieNode, r.tableSize) + + for i := 0; i < len(oldnew); i += 2 { + r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r) + } + return r +} + +type appendSliceWriter []byte + +// Write writes to the buffer to satisfy io.Writer. +func (w *appendSliceWriter) Write(p []byte) (int, error) { + *w = append(*w, p...) + return len(p), nil +} + +// WriteString writes to the buffer without string->[]byte->string allocations. +func (w *appendSliceWriter) WriteString(s string) (int, error) { + *w = append(*w, s...) + return len(s), nil +} + +type stringWriterIface interface { + WriteString(string) (int, error) +} + +type stringWriter struct { + w io.Writer +} + +func (w stringWriter) WriteString(s string) (int, error) { + return w.w.Write([]byte(s)) +} + +func getStringWriter(w io.Writer) stringWriterIface { + sw, ok := w.(stringWriterIface) + if !ok { + sw = stringWriter{w} + } + return sw +} + +func (r *genericReplacer) Replace(s string) string { + buf := make(appendSliceWriter, 0, len(s)) + r.WriteString(&buf, s) + return string(buf) +} + +func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) { + sw := getStringWriter(w) + var last, wn int + var prevMatchEmpty bool + for i := 0; i <= len(s); { + // Fast path: s[i] is not a prefix of any pattern. + if i != len(s) && r.root.priority == 0 { + index := int(r.mapping[s[i]]) + if index == r.tableSize || r.root.table[index] == nil { + i++ + continue + } + } + + // Ignore the empty match iff the previous loop found the empty match. + val, keylen, match := r.lookup(s[i:], prevMatchEmpty) + prevMatchEmpty = match && keylen == 0 + if match { + wn, err = sw.WriteString(s[last:i]) + n += wn + if err != nil { + return + } + wn, err = sw.WriteString(val) + n += wn + if err != nil { + return + } + i += keylen + last = i + continue + } + i++ + } + if last != len(s) { + wn, err = sw.WriteString(s[last:]) + n += wn + } + return +} + +// singleStringReplacer is the implementation that's used when there is only +// one string to replace (and that string has more than one byte). +type singleStringReplacer struct { + finder *stringFinder + // value is the new string that replaces that pattern when it's found. + value string +} + +func makeSingleStringReplacer(pattern string, value string) *singleStringReplacer { + return &singleStringReplacer{finder: makeStringFinder(pattern), value: value} +} + +func (r *singleStringReplacer) Replace(s string) string { + var buf []byte + i, matched := 0, false + for { + match := r.finder.next(s[i:]) + if match == -1 { + break + } + matched = true + buf = append(buf, s[i:i+match]...) + buf = append(buf, r.value...) + i += match + len(r.finder.pattern) + } + if !matched { + return s + } + buf = append(buf, s[i:]...) + return string(buf) +} + +func (r *singleStringReplacer) WriteString(w io.Writer, s string) (n int, err error) { + sw := getStringWriter(w) + var i, wn int + for { + match := r.finder.next(s[i:]) + if match == -1 { + break + } + wn, err = sw.WriteString(s[i : i+match]) + n += wn + if err != nil { + return + } + wn, err = sw.WriteString(r.value) + n += wn + if err != nil { + return + } + i += match + len(r.finder.pattern) + } + wn, err = sw.WriteString(s[i:]) + n += wn + return +} + +// byteReplacer is the implementation that's used when all the "old" +// and "new" values are single ASCII bytes. +// The array contains replacement bytes indexed by old byte. +type byteReplacer [256]byte + +func (r *byteReplacer) Replace(s string) string { + var buf []byte // lazily allocated + for i := 0; i < len(s); i++ { + b := s[i] + if r[b] != b { + if buf == nil { + buf = []byte(s) + } + buf[i] = r[b] + } + } + if buf == nil { + return s + } + return string(buf) +} + +func (r *byteReplacer) WriteString(w io.Writer, s string) (n int, err error) { + // TODO(bradfitz): use io.WriteString with slices of s, avoiding allocation. + bufsize := 32 << 10 + if len(s) < bufsize { + bufsize = len(s) + } + buf := make([]byte, bufsize) + + for len(s) > 0 { + ncopy := copy(buf, s[:]) + s = s[ncopy:] + for i, b := range buf[:ncopy] { + buf[i] = r[b] + } + wn, err := w.Write(buf[:ncopy]) + n += wn + if err != nil { + return n, err + } + } + return n, nil +} + +// byteStringReplacer is the implementation that's used when all the +// "old" values are single ASCII bytes but the "new" values vary in size. +// The array contains replacement byte slices indexed by old byte. +// A nil []byte means that the old byte should not be replaced. +type byteStringReplacer [256][]byte + +func (r *byteStringReplacer) Replace(s string) string { + newSize := len(s) + anyChanges := false + for i := 0; i < len(s); i++ { + b := s[i] + if r[b] != nil { + anyChanges = true + // The -1 is because we are replacing 1 byte with len(r[b]) bytes. + newSize += len(r[b]) - 1 + } + } + if !anyChanges { + return s + } + buf := make([]byte, newSize) + bi := buf + for i := 0; i < len(s); i++ { + b := s[i] + if r[b] != nil { + n := copy(bi, r[b]) + bi = bi[n:] + } else { + bi[0] = b + bi = bi[1:] + } + } + return string(buf) +} + +func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err error) { + sw := getStringWriter(w) + last := 0 + for i := 0; i < len(s); i++ { + b := s[i] + if r[b] == nil { + continue + } + if last != i { + nw, err := sw.WriteString(s[last:i]) + n += nw + if err != nil { + return n, err + } + } + last = i + 1 + nw, err := w.Write(r[b]) + n += nw + if err != nil { + return n, err + } + } + if last != len(s) { + var nw int + nw, err = sw.WriteString(s[last:]) + n += nw + } + return +} diff --git a/src/strings/replace_test.go b/src/strings/replace_test.go new file mode 100644 index 000000000..77e48b988 --- /dev/null +++ b/src/strings/replace_test.go @@ -0,0 +1,542 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "bytes" + "fmt" + . "strings" + "testing" +) + +var htmlEscaper = NewReplacer( + "&", "&", + "<", "<", + ">", ">", + `"`, """, + "'", "'", +) + +var htmlUnescaper = NewReplacer( + "&", "&", + "<", "<", + ">", ">", + """, `"`, + "'", "'", +) + +// The http package's old HTML escaping function. +func oldHTMLEscape(s string) string { + s = Replace(s, "&", "&", -1) + s = Replace(s, "<", "<", -1) + s = Replace(s, ">", ">", -1) + s = Replace(s, `"`, """, -1) + s = Replace(s, "'", "'", -1) + return s +} + +var capitalLetters = NewReplacer("a", "A", "b", "B") + +// TestReplacer tests the replacer implementations. +func TestReplacer(t *testing.T) { + type testCase struct { + r *Replacer + in, out string + } + var testCases []testCase + + // str converts 0xff to "\xff". This isn't just string(b) since that converts to UTF-8. + str := func(b byte) string { + return string([]byte{b}) + } + var s []string + + // inc maps "\x00"->"\x01", ..., "a"->"b", "b"->"c", ..., "\xff"->"\x00". + s = nil + for i := 0; i < 256; i++ { + s = append(s, str(byte(i)), str(byte(i+1))) + } + inc := NewReplacer(s...) + + // Test cases with 1-byte old strings, 1-byte new strings. + testCases = append(testCases, + testCase{capitalLetters, "brad", "BrAd"}, + testCase{capitalLetters, Repeat("a", (32<<10)+123), Repeat("A", (32<<10)+123)}, + testCase{capitalLetters, "", ""}, + + testCase{inc, "brad", "csbe"}, + testCase{inc, "\x00\xff", "\x01\x00"}, + testCase{inc, "", ""}, + + testCase{NewReplacer("a", "1", "a", "2"), "brad", "br1d"}, + ) + + // repeat maps "a"->"a", "b"->"bb", "c"->"ccc", ... + s = nil + for i := 0; i < 256; i++ { + n := i + 1 - 'a' + if n < 1 { + n = 1 + } + s = append(s, str(byte(i)), Repeat(str(byte(i)), n)) + } + repeat := NewReplacer(s...) + + // Test cases with 1-byte old strings, variable length new strings. + testCases = append(testCases, + testCase{htmlEscaper, "No changes", "No changes"}, + testCase{htmlEscaper, "I <3 escaping & stuff", "I <3 escaping & stuff"}, + testCase{htmlEscaper, "&&&", "&&&"}, + testCase{htmlEscaper, "", ""}, + + testCase{repeat, "brad", "bbrrrrrrrrrrrrrrrrrradddd"}, + testCase{repeat, "abba", "abbbba"}, + testCase{repeat, "", ""}, + + testCase{NewReplacer("a", "11", "a", "22"), "brad", "br11d"}, + ) + + // The remaining test cases have variable length old strings. + + testCases = append(testCases, + testCase{htmlUnescaper, "&amp;", "&"}, + testCase{htmlUnescaper, "<b>HTML's neat</b>", "<b>HTML's neat</b>"}, + testCase{htmlUnescaper, "", ""}, + + testCase{NewReplacer("a", "1", "a", "2", "xxx", "xxx"), "brad", "br1d"}, + + testCase{NewReplacer("a", "1", "aa", "2", "aaa", "3"), "aaaa", "1111"}, + + testCase{NewReplacer("aaa", "3", "aa", "2", "a", "1"), "aaaa", "31"}, + ) + + // gen1 has multiple old strings of variable length. There is no + // overall non-empty common prefix, but some pairwise common prefixes. + gen1 := NewReplacer( + "aaa", "3[aaa]", + "aa", "2[aa]", + "a", "1[a]", + "i", "i", + "longerst", "most long", + "longer", "medium", + "long", "short", + "xx", "xx", + "x", "X", + "X", "Y", + "Y", "Z", + ) + testCases = append(testCases, + testCase{gen1, "fooaaabar", "foo3[aaa]b1[a]r"}, + testCase{gen1, "long, longerst, longer", "short, most long, medium"}, + testCase{gen1, "xxxxx", "xxxxX"}, + testCase{gen1, "XiX", "YiY"}, + testCase{gen1, "", ""}, + ) + + // gen2 has multiple old strings with no pairwise common prefix. + gen2 := NewReplacer( + "roses", "red", + "violets", "blue", + "sugar", "sweet", + ) + testCases = append(testCases, + testCase{gen2, "roses are red, violets are blue...", "red are red, blue are blue..."}, + testCase{gen2, "", ""}, + ) + + // gen3 has multiple old strings with an overall common prefix. + gen3 := NewReplacer( + "abracadabra", "poof", + "abracadabrakazam", "splat", + "abraham", "lincoln", + "abrasion", "scrape", + "abraham", "isaac", + ) + testCases = append(testCases, + testCase{gen3, "abracadabrakazam abraham", "poofkazam lincoln"}, + testCase{gen3, "abrasion abracad", "scrape abracad"}, + testCase{gen3, "abba abram abrasive", "abba abram abrasive"}, + testCase{gen3, "", ""}, + ) + + // foo{1,2,3,4} have multiple old strings with an overall common prefix + // and 1- or 2- byte extensions from the common prefix. + foo1 := NewReplacer( + "foo1", "A", + "foo2", "B", + "foo3", "C", + ) + foo2 := NewReplacer( + "foo1", "A", + "foo2", "B", + "foo31", "C", + "foo32", "D", + ) + foo3 := NewReplacer( + "foo11", "A", + "foo12", "B", + "foo31", "C", + "foo32", "D", + ) + foo4 := NewReplacer( + "foo12", "B", + "foo32", "D", + ) + testCases = append(testCases, + testCase{foo1, "fofoofoo12foo32oo", "fofooA2C2oo"}, + testCase{foo1, "", ""}, + + testCase{foo2, "fofoofoo12foo32oo", "fofooA2Doo"}, + testCase{foo2, "", ""}, + + testCase{foo3, "fofoofoo12foo32oo", "fofooBDoo"}, + testCase{foo3, "", ""}, + + testCase{foo4, "fofoofoo12foo32oo", "fofooBDoo"}, + testCase{foo4, "", ""}, + ) + + // genAll maps "\x00\x01\x02...\xfe\xff" to "[all]", amongst other things. + allBytes := make([]byte, 256) + for i := range allBytes { + allBytes[i] = byte(i) + } + allString := string(allBytes) + genAll := NewReplacer( + allString, "[all]", + "\xff", "[ff]", + "\x00", "[00]", + ) + testCases = append(testCases, + testCase{genAll, allString, "[all]"}, + testCase{genAll, "a\xff" + allString + "\x00", "a[ff][all][00]"}, + testCase{genAll, "", ""}, + ) + + // Test cases with empty old strings. + + blankToX1 := NewReplacer("", "X") + blankToX2 := NewReplacer("", "X", "", "") + blankHighPriority := NewReplacer("", "X", "o", "O") + blankLowPriority := NewReplacer("o", "O", "", "X") + blankNoOp1 := NewReplacer("", "") + blankNoOp2 := NewReplacer("", "", "", "A") + blankFoo := NewReplacer("", "X", "foobar", "R", "foobaz", "Z") + testCases = append(testCases, + testCase{blankToX1, "foo", "XfXoXoX"}, + testCase{blankToX1, "", "X"}, + + testCase{blankToX2, "foo", "XfXoXoX"}, + testCase{blankToX2, "", "X"}, + + testCase{blankHighPriority, "oo", "XOXOX"}, + testCase{blankHighPriority, "ii", "XiXiX"}, + testCase{blankHighPriority, "oiio", "XOXiXiXOX"}, + testCase{blankHighPriority, "iooi", "XiXOXOXiX"}, + testCase{blankHighPriority, "", "X"}, + + testCase{blankLowPriority, "oo", "OOX"}, + testCase{blankLowPriority, "ii", "XiXiX"}, + testCase{blankLowPriority, "oiio", "OXiXiOX"}, + testCase{blankLowPriority, "iooi", "XiOOXiX"}, + testCase{blankLowPriority, "", "X"}, + + testCase{blankNoOp1, "foo", "foo"}, + testCase{blankNoOp1, "", ""}, + + testCase{blankNoOp2, "foo", "foo"}, + testCase{blankNoOp2, "", ""}, + + testCase{blankFoo, "foobarfoobaz", "XRXZX"}, + testCase{blankFoo, "foobar-foobaz", "XRX-XZX"}, + testCase{blankFoo, "", "X"}, + ) + + // single string replacer + + abcMatcher := NewReplacer("abc", "[match]") + + testCases = append(testCases, + testCase{abcMatcher, "", ""}, + testCase{abcMatcher, "ab", "ab"}, + testCase{abcMatcher, "abc", "[match]"}, + testCase{abcMatcher, "abcd", "[match]d"}, + testCase{abcMatcher, "cabcabcdabca", "c[match][match]d[match]a"}, + ) + + // Issue 6659 cases (more single string replacer) + + noHello := NewReplacer("Hello", "") + testCases = append(testCases, + testCase{noHello, "Hello", ""}, + testCase{noHello, "Hellox", "x"}, + testCase{noHello, "xHello", "x"}, + testCase{noHello, "xHellox", "xx"}, + ) + + // No-arg test cases. + + nop := NewReplacer() + testCases = append(testCases, + testCase{nop, "abc", "abc"}, + testCase{nop, "", ""}, + ) + + // Run the test cases. + + for i, tc := range testCases { + if s := tc.r.Replace(tc.in); s != tc.out { + t.Errorf("%d. Replace(%q) = %q, want %q", i, tc.in, s, tc.out) + } + var buf bytes.Buffer + n, err := tc.r.WriteString(&buf, tc.in) + if err != nil { + t.Errorf("%d. WriteString: %v", i, err) + continue + } + got := buf.String() + if got != tc.out { + t.Errorf("%d. WriteString(%q) wrote %q, want %q", i, tc.in, got, tc.out) + continue + } + if n != len(tc.out) { + t.Errorf("%d. WriteString(%q) wrote correct string but reported %d bytes; want %d (%q)", + i, tc.in, n, len(tc.out), tc.out) + } + } +} + +var algorithmTestCases = []struct { + r *Replacer + want string +}{ + {capitalLetters, "*strings.byteReplacer"}, + {htmlEscaper, "*strings.byteStringReplacer"}, + {NewReplacer("12", "123"), "*strings.singleStringReplacer"}, + {NewReplacer("1", "12"), "*strings.byteStringReplacer"}, + {NewReplacer("", "X"), "*strings.genericReplacer"}, + {NewReplacer("a", "1", "b", "12", "cde", "123"), "*strings.genericReplacer"}, +} + +// TestPickAlgorithm tests that NewReplacer picks the correct algorithm. +func TestPickAlgorithm(t *testing.T) { + for i, tc := range algorithmTestCases { + got := fmt.Sprintf("%T", tc.r.Replacer()) + if got != tc.want { + t.Errorf("%d. algorithm = %s, want %s", i, got, tc.want) + } + } +} + +type errWriter struct{} + +func (errWriter) Write(p []byte) (n int, err error) { + return 0, fmt.Errorf("unwritable") +} + +// TestWriteStringError tests that WriteString returns an error +// received from the underlying io.Writer. +func TestWriteStringError(t *testing.T) { + for i, tc := range algorithmTestCases { + n, err := tc.r.WriteString(errWriter{}, "abc") + if n != 0 || err == nil || err.Error() != "unwritable" { + t.Errorf("%d. WriteStringError = %d, %v, want 0, unwritable", i, n, err) + } + } +} + +// TestGenericTrieBuilding verifies the structure of the generated trie. There +// is one node per line, and the key ending with the current line is in the +// trie if it ends with a "+". +func TestGenericTrieBuilding(t *testing.T) { + testCases := []struct{ in, out string }{ + {"abc;abdef;abdefgh;xx;xy;z", `- + a- + .b- + ..c+ + ..d- + ...ef+ + .....gh+ + x- + .x+ + .y+ + z+ + `}, + {"abracadabra;abracadabrakazam;abraham;abrasion", `- + a- + .bra- + ....c- + .....adabra+ + ...........kazam+ + ....h- + .....am+ + ....s- + .....ion+ + `}, + {"aaa;aa;a;i;longerst;longer;long;xx;x;X;Y", `- + X+ + Y+ + a+ + .a+ + ..a+ + i+ + l- + .ong+ + ....er+ + ......st+ + x+ + .x+ + `}, + {"foo;;foo;foo1", `+ + f- + .oo+ + ...1+ + `}, + } + + for _, tc := range testCases { + keys := Split(tc.in, ";") + args := make([]string, len(keys)*2) + for i, key := range keys { + args[i*2] = key + } + + got := NewReplacer(args...).PrintTrie() + // Remove tabs from tc.out + wantbuf := make([]byte, 0, len(tc.out)) + for i := 0; i < len(tc.out); i++ { + if tc.out[i] != '\t' { + wantbuf = append(wantbuf, tc.out[i]) + } + } + want := string(wantbuf) + + if got != want { + t.Errorf("PrintTrie(%q)\ngot\n%swant\n%s", tc.in, got, want) + } + } +} + +func BenchmarkGenericNoMatch(b *testing.B) { + str := Repeat("A", 100) + Repeat("B", 100) + generic := NewReplacer("a", "A", "b", "B", "12", "123") // varying lengths forces generic + for i := 0; i < b.N; i++ { + generic.Replace(str) + } +} + +func BenchmarkGenericMatch1(b *testing.B) { + str := Repeat("a", 100) + Repeat("b", 100) + generic := NewReplacer("a", "A", "b", "B", "12", "123") + for i := 0; i < b.N; i++ { + generic.Replace(str) + } +} + +func BenchmarkGenericMatch2(b *testing.B) { + str := Repeat("It's <b>HTML</b>!", 100) + for i := 0; i < b.N; i++ { + htmlUnescaper.Replace(str) + } +} + +func benchmarkSingleString(b *testing.B, pattern, text string) { + r := NewReplacer(pattern, "[match]") + b.SetBytes(int64(len(text))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + r.Replace(text) + } +} + +func BenchmarkSingleMaxSkipping(b *testing.B) { + benchmarkSingleString(b, Repeat("b", 25), Repeat("a", 10000)) +} + +func BenchmarkSingleLongSuffixFail(b *testing.B) { + benchmarkSingleString(b, "b"+Repeat("a", 500), Repeat("a", 1002)) +} + +func BenchmarkSingleMatch(b *testing.B) { + benchmarkSingleString(b, "abcdef", Repeat("abcdefghijklmno", 1000)) +} + +func BenchmarkByteByteNoMatch(b *testing.B) { + str := Repeat("A", 100) + Repeat("B", 100) + for i := 0; i < b.N; i++ { + capitalLetters.Replace(str) + } +} + +func BenchmarkByteByteMatch(b *testing.B) { + str := Repeat("a", 100) + Repeat("b", 100) + for i := 0; i < b.N; i++ { + capitalLetters.Replace(str) + } +} + +func BenchmarkByteStringMatch(b *testing.B) { + str := "<" + Repeat("a", 99) + Repeat("b", 99) + ">" + for i := 0; i < b.N; i++ { + htmlEscaper.Replace(str) + } +} + +func BenchmarkHTMLEscapeNew(b *testing.B) { + str := "I <3 to escape HTML & other text too." + for i := 0; i < b.N; i++ { + htmlEscaper.Replace(str) + } +} + +func BenchmarkHTMLEscapeOld(b *testing.B) { + str := "I <3 to escape HTML & other text too." + for i := 0; i < b.N; i++ { + oldHTMLEscape(str) + } +} + +func BenchmarkByteStringReplacerWriteString(b *testing.B) { + str := Repeat("I <3 to escape HTML & other text too.", 100) + buf := new(bytes.Buffer) + for i := 0; i < b.N; i++ { + htmlEscaper.WriteString(buf, str) + buf.Reset() + } +} + +func BenchmarkByteReplacerWriteString(b *testing.B) { + str := Repeat("abcdefghijklmnopqrstuvwxyz", 100) + buf := new(bytes.Buffer) + for i := 0; i < b.N; i++ { + capitalLetters.WriteString(buf, str) + buf.Reset() + } +} + +// BenchmarkByteByteReplaces compares byteByteImpl against multiple Replaces. +func BenchmarkByteByteReplaces(b *testing.B) { + str := Repeat("a", 100) + Repeat("b", 100) + for i := 0; i < b.N; i++ { + Replace(Replace(str, "a", "A", -1), "b", "B", -1) + } +} + +// BenchmarkByteByteMap compares byteByteImpl against Map. +func BenchmarkByteByteMap(b *testing.B) { + str := Repeat("a", 100) + Repeat("b", 100) + fn := func(r rune) rune { + switch r { + case 'a': + return 'A' + case 'b': + return 'B' + } + return r + } + for i := 0; i < b.N; i++ { + Map(fn, str) + } +} diff --git a/src/strings/search.go b/src/strings/search.go new file mode 100644 index 000000000..f77c879c5 --- /dev/null +++ b/src/strings/search.go @@ -0,0 +1,124 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +// stringFinder efficiently finds strings in a source text. It's implemented +// using the Boyer-Moore string search algorithm: +// http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm +// http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged +// document uses 1-based indexing) +type stringFinder struct { + // pattern is the string that we are searching for in the text. + pattern string + + // badCharSkip[b] contains the distance between the last byte of pattern + // and the rightmost occurrence of b in pattern. If b is not in pattern, + // badCharSkip[b] is len(pattern). + // + // Whenever a mismatch is found with byte b in the text, we can safely + // shift the matching frame at least badCharSkip[b] until the next time + // the matching char could be in alignment. + badCharSkip [256]int + + // goodSuffixSkip[i] defines how far we can shift the matching frame given + // that the suffix pattern[i+1:] matches, but the byte pattern[i] does + // not. There are two cases to consider: + // + // 1. The matched suffix occurs elsewhere in pattern (with a different + // byte preceding it that we might possibly match). In this case, we can + // shift the matching frame to align with the next suffix chunk. For + // example, the pattern "mississi" has the suffix "issi" next occurring + // (in right-to-left order) at index 1, so goodSuffixSkip[3] == + // shift+len(suffix) == 3+4 == 7. + // + // 2. If the matched suffix does not occur elsewhere in pattern, then the + // matching frame may share part of its prefix with the end of the + // matching suffix. In this case, goodSuffixSkip[i] will contain how far + // to shift the frame to align this portion of the prefix to the + // suffix. For example, in the pattern "abcxxxabc", when the first + // mismatch from the back is found to be in position 3, the matching + // suffix "xxabc" is not found elsewhere in the pattern. However, its + // rightmost "abc" (at position 6) is a prefix of the whole pattern, so + // goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11. + goodSuffixSkip []int +} + +func makeStringFinder(pattern string) *stringFinder { + f := &stringFinder{ + pattern: pattern, + goodSuffixSkip: make([]int, len(pattern)), + } + // last is the index of the last character in the pattern. + last := len(pattern) - 1 + + // Build bad character table. + // Bytes not in the pattern can skip one pattern's length. + for i := range f.badCharSkip { + f.badCharSkip[i] = len(pattern) + } + // The loop condition is < instead of <= so that the last byte does not + // have a zero distance to itself. Finding this byte out of place implies + // that it is not in the last position. + for i := 0; i < last; i++ { + f.badCharSkip[pattern[i]] = last - i + } + + // Build good suffix table. + // First pass: set each value to the next index which starts a prefix of + // pattern. + lastPrefix := last + for i := last; i >= 0; i-- { + if HasPrefix(pattern, pattern[i+1:]) { + lastPrefix = i + 1 + } + // lastPrefix is the shift, and (last-i) is len(suffix). + f.goodSuffixSkip[i] = lastPrefix + last - i + } + // Second pass: find repeats of pattern's suffix starting from the front. + for i := 0; i < last; i++ { + lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1]) + if pattern[i-lenSuffix] != pattern[last-lenSuffix] { + // (last-i) is the shift, and lenSuffix is len(suffix). + f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i + } + } + + return f +} + +func longestCommonSuffix(a, b string) (i int) { + for ; i < len(a) && i < len(b); i++ { + if a[len(a)-1-i] != b[len(b)-1-i] { + break + } + } + return +} + +// next returns the index in text of the first occurrence of the pattern. If +// the pattern is not found, it returns -1. +func (f *stringFinder) next(text string) int { + i := len(f.pattern) - 1 + for i < len(text) { + // Compare backwards from the end until the first unmatching character. + j := len(f.pattern) - 1 + for j >= 0 && text[i] == f.pattern[j] { + i-- + j-- + } + if j < 0 { + return i + 1 // match + } + i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j]) + } + return -1 +} + +func max(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/src/strings/search_test.go b/src/strings/search_test.go new file mode 100644 index 000000000..966c05e65 --- /dev/null +++ b/src/strings/search_test.go @@ -0,0 +1,90 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "reflect" + . "strings" + "testing" +) + +func TestFinderNext(t *testing.T) { + testCases := []struct { + pat, text string + index int + }{ + {"", "", 0}, + {"", "abc", 0}, + {"abc", "", -1}, + {"abc", "abc", 0}, + {"d", "abcdefg", 3}, + {"nan", "banana", 2}, + {"pan", "anpanman", 2}, + {"nnaaman", "anpanmanam", -1}, + {"abcd", "abc", -1}, + {"abcd", "bcd", -1}, + {"bcd", "abcd", 1}, + {"abc", "acca", -1}, + {"aa", "aaa", 0}, + {"baa", "aaaaa", -1}, + {"at that", "which finally halts. at that point", 22}, + } + + for _, tc := range testCases { + got := StringFind(tc.pat, tc.text) + want := tc.index + if got != want { + t.Errorf("stringFind(%q, %q) got %d, want %d\n", tc.pat, tc.text, got, want) + } + } +} + +func TestFinderCreation(t *testing.T) { + testCases := []struct { + pattern string + bad [256]int + suf []int + }{ + { + "abc", + [256]int{'a': 2, 'b': 1, 'c': 3}, + []int{5, 4, 1}, + }, + { + "mississi", + [256]int{'i': 3, 'm': 7, 's': 1}, + []int{15, 14, 13, 7, 11, 10, 7, 1}, + }, + // From http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf + { + "abcxxxabc", + [256]int{'a': 2, 'b': 1, 'c': 6, 'x': 3}, + []int{14, 13, 12, 11, 10, 9, 11, 10, 1}, + }, + { + "abyxcdeyx", + [256]int{'a': 8, 'b': 7, 'c': 4, 'd': 3, 'e': 2, 'y': 1, 'x': 5}, + []int{17, 16, 15, 14, 13, 12, 7, 10, 1}, + }, + } + + for _, tc := range testCases { + bad, good := DumpTables(tc.pattern) + + for i, got := range bad { + want := tc.bad[i] + if want == 0 { + want = len(tc.pattern) + } + if got != want { + t.Errorf("boyerMoore(%q) bad['%c']: got %d want %d", tc.pattern, i, got, want) + } + } + + if !reflect.DeepEqual(good, tc.suf) { + t.Errorf("boyerMoore(%q) got %v want %v", tc.pattern, good, tc.suf) + } + } +} diff --git a/src/strings/strings.go b/src/strings/strings.go new file mode 100644 index 000000000..761f32a06 --- /dev/null +++ b/src/strings/strings.go @@ -0,0 +1,765 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package strings implements simple functions to manipulate strings. +package strings + +import ( + "unicode" + "unicode/utf8" +) + +// explode splits s into an array of UTF-8 sequences, one per Unicode character (still strings) up to a maximum of n (n < 0 means no limit). +// Invalid UTF-8 sequences become correct encodings of U+FFF8. +func explode(s string, n int) []string { + if n == 0 { + return nil + } + l := utf8.RuneCountInString(s) + if n <= 0 || n > l { + n = l + } + a := make([]string, n) + var size int + var ch rune + i, cur := 0, 0 + for ; i+1 < n; i++ { + ch, size = utf8.DecodeRuneInString(s[cur:]) + if ch == utf8.RuneError { + a[i] = string(utf8.RuneError) + } else { + a[i] = s[cur : cur+size] + } + cur += size + } + // add the rest, if there is any + if cur < len(s) { + a[i] = s[cur:] + } + return a +} + +// primeRK is the prime base used in Rabin-Karp algorithm. +const primeRK = 16777619 + +// hashStr returns the hash and the appropriate multiplicative +// factor for use in Rabin-Karp algorithm. +func hashStr(sep string) (uint32, uint32) { + hash := uint32(0) + for i := 0; i < len(sep); i++ { + hash = hash*primeRK + uint32(sep[i]) + } + var pow, sq uint32 = 1, primeRK + for i := len(sep); i > 0; i >>= 1 { + if i&1 != 0 { + pow *= sq + } + sq *= sq + } + return hash, pow +} + +// hashStrRev returns the hash of the reverse of sep and the +// appropriate multiplicative factor for use in Rabin-Karp algorithm. +func hashStrRev(sep string) (uint32, uint32) { + hash := uint32(0) + for i := len(sep) - 1; i >= 0; i-- { + hash = hash*primeRK + uint32(sep[i]) + } + var pow, sq uint32 = 1, primeRK + for i := len(sep); i > 0; i >>= 1 { + if i&1 != 0 { + pow *= sq + } + sq *= sq + } + return hash, pow +} + +// Count counts the number of non-overlapping instances of sep in s. +func Count(s, sep string) int { + n := 0 + // special cases + switch { + case len(sep) == 0: + return utf8.RuneCountInString(s) + 1 + case len(sep) == 1: + // special case worth making fast + c := sep[0] + for i := 0; i < len(s); i++ { + if s[i] == c { + n++ + } + } + return n + case len(sep) > len(s): + return 0 + case len(sep) == len(s): + if sep == s { + return 1 + } + return 0 + } + // Rabin-Karp search + hashsep, pow := hashStr(sep) + h := uint32(0) + for i := 0; i < len(sep); i++ { + h = h*primeRK + uint32(s[i]) + } + lastmatch := 0 + if h == hashsep && s[:len(sep)] == sep { + n++ + lastmatch = len(sep) + } + for i := len(sep); i < len(s); { + h *= primeRK + h += uint32(s[i]) + h -= pow * uint32(s[i-len(sep)]) + i++ + if h == hashsep && lastmatch <= i-len(sep) && s[i-len(sep):i] == sep { + n++ + lastmatch = i + } + } + return n +} + +// Contains returns true if substr is within s. +func Contains(s, substr string) bool { + return Index(s, substr) >= 0 +} + +// ContainsAny returns true if any Unicode code points in chars are within s. +func ContainsAny(s, chars string) bool { + return IndexAny(s, chars) >= 0 +} + +// ContainsRune returns true if the Unicode code point r is within s. +func ContainsRune(s string, r rune) bool { + return IndexRune(s, r) >= 0 +} + +// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. +func Index(s, sep string) int { + n := len(sep) + switch { + case n == 0: + return 0 + case n == 1: + return IndexByte(s, sep[0]) + case n == len(s): + if sep == s { + return 0 + } + return -1 + case n > len(s): + return -1 + } + // Rabin-Karp search + hashsep, pow := hashStr(sep) + var h uint32 + for i := 0; i < n; i++ { + h = h*primeRK + uint32(s[i]) + } + if h == hashsep && s[:n] == sep { + return 0 + } + for i := n; i < len(s); { + h *= primeRK + h += uint32(s[i]) + h -= pow * uint32(s[i-n]) + i++ + if h == hashsep && s[i-n:i] == sep { + return i - n + } + } + return -1 +} + +// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s. +func LastIndex(s, sep string) int { + n := len(sep) + switch { + case n == 0: + return len(s) + case n == 1: + // special case worth making fast + c := sep[0] + for i := len(s) - 1; i >= 0; i-- { + if s[i] == c { + return i + } + } + return -1 + case n == len(s): + if sep == s { + return 0 + } + return -1 + case n > len(s): + return -1 + } + // Rabin-Karp search from the end of the string + hashsep, pow := hashStrRev(sep) + last := len(s) - n + var h uint32 + for i := len(s) - 1; i >= last; i-- { + h = h*primeRK + uint32(s[i]) + } + if h == hashsep && s[last:] == sep { + return last + } + for i := last - 1; i >= 0; i-- { + h *= primeRK + h += uint32(s[i]) + h -= pow * uint32(s[i+n]) + if h == hashsep && s[i:i+n] == sep { + return i + } + } + return -1 +} + +// IndexRune returns the index of the first instance of the Unicode code point +// r, or -1 if rune is not present in s. +func IndexRune(s string, r rune) int { + switch { + case r < 0x80: + b := byte(r) + for i := 0; i < len(s); i++ { + if s[i] == b { + return i + } + } + default: + for i, c := range s { + if c == r { + return i + } + } + } + return -1 +} + +// IndexAny returns the index of the first instance of any Unicode code point +// from chars in s, or -1 if no Unicode code point from chars is present in s. +func IndexAny(s, chars string) int { + if len(chars) > 0 { + for i, c := range s { + for _, m := range chars { + if c == m { + return i + } + } + } + } + return -1 +} + +// LastIndexAny returns the index of the last instance of any Unicode code +// point from chars in s, or -1 if no Unicode code point from chars is +// present in s. +func LastIndexAny(s, chars string) int { + if len(chars) > 0 { + for i := len(s); i > 0; { + rune, size := utf8.DecodeLastRuneInString(s[0:i]) + i -= size + for _, m := range chars { + if rune == m { + return i + } + } + } + } + return -1 +} + +// Generic split: splits after each instance of sep, +// including sepSave bytes of sep in the subarrays. +func genSplit(s, sep string, sepSave, n int) []string { + if n == 0 { + return nil + } + if sep == "" { + return explode(s, n) + } + if n < 0 { + n = Count(s, sep) + 1 + } + c := sep[0] + start := 0 + a := make([]string, n) + na := 0 + for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ { + if s[i] == c && (len(sep) == 1 || s[i:i+len(sep)] == sep) { + a[na] = s[start : i+sepSave] + na++ + start = i + len(sep) + i += len(sep) - 1 + } + } + a[na] = s[start:] + return a[0 : na+1] +} + +// SplitN slices s into substrings separated by sep and returns a slice of +// the substrings between those separators. +// If sep is empty, SplitN splits after each UTF-8 sequence. +// The count determines the number of substrings to return: +// n > 0: at most n substrings; the last substring will be the unsplit remainder. +// n == 0: the result is nil (zero substrings) +// n < 0: all substrings +func SplitN(s, sep string, n int) []string { return genSplit(s, sep, 0, n) } + +// SplitAfterN slices s into substrings after each instance of sep and +// returns a slice of those substrings. +// If sep is empty, SplitAfterN splits after each UTF-8 sequence. +// The count determines the number of substrings to return: +// n > 0: at most n substrings; the last substring will be the unsplit remainder. +// n == 0: the result is nil (zero substrings) +// n < 0: all substrings +func SplitAfterN(s, sep string, n int) []string { + return genSplit(s, sep, len(sep), n) +} + +// Split slices s into all substrings separated by sep and returns a slice of +// the substrings between those separators. +// If sep is empty, Split splits after each UTF-8 sequence. +// It is equivalent to SplitN with a count of -1. +func Split(s, sep string) []string { return genSplit(s, sep, 0, -1) } + +// SplitAfter slices s into all substrings after each instance of sep and +// returns a slice of those substrings. +// If sep is empty, SplitAfter splits after each UTF-8 sequence. +// It is equivalent to SplitAfterN with a count of -1. +func SplitAfter(s, sep string) []string { + return genSplit(s, sep, len(sep), -1) +} + +// Fields splits the string s around each instance of one or more consecutive white space +// characters, as defined by unicode.IsSpace, returning an array of substrings of s or an +// empty list if s contains only white space. +func Fields(s string) []string { + return FieldsFunc(s, unicode.IsSpace) +} + +// FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c) +// and returns an array of slices of s. If all code points in s satisfy f(c) or the +// string is empty, an empty slice is returned. +func FieldsFunc(s string, f func(rune) bool) []string { + // First count the fields. + n := 0 + inField := false + for _, rune := range s { + wasInField := inField + inField = !f(rune) + if inField && !wasInField { + n++ + } + } + + // Now create them. + a := make([]string, n) + na := 0 + fieldStart := -1 // Set to -1 when looking for start of field. + for i, rune := range s { + if f(rune) { + if fieldStart >= 0 { + a[na] = s[fieldStart:i] + na++ + fieldStart = -1 + } + } else if fieldStart == -1 { + fieldStart = i + } + } + if fieldStart >= 0 { // Last field might end at EOF. + a[na] = s[fieldStart:] + } + return a +} + +// Join concatenates the elements of a to create a single string. The separator string +// sep is placed between elements in the resulting string. +func Join(a []string, sep string) string { + if len(a) == 0 { + return "" + } + if len(a) == 1 { + return a[0] + } + n := len(sep) * (len(a) - 1) + for i := 0; i < len(a); i++ { + n += len(a[i]) + } + + b := make([]byte, n) + bp := copy(b, a[0]) + for _, s := range a[1:] { + bp += copy(b[bp:], sep) + bp += copy(b[bp:], s) + } + return string(b) +} + +// HasPrefix tests whether the string s begins with prefix. +func HasPrefix(s, prefix string) bool { + return len(s) >= len(prefix) && s[0:len(prefix)] == prefix +} + +// HasSuffix tests whether the string s ends with suffix. +func HasSuffix(s, suffix string) bool { + return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix +} + +// Map returns a copy of the string s with all its characters modified +// according to the mapping function. If mapping returns a negative value, the character is +// dropped from the string with no replacement. +func Map(mapping func(rune) rune, s string) string { + // In the worst case, the string can grow when mapped, making + // things unpleasant. But it's so rare we barge in assuming it's + // fine. It could also shrink but that falls out naturally. + maxbytes := len(s) // length of b + nbytes := 0 // number of bytes encoded in b + // The output buffer b is initialized on demand, the first + // time a character differs. + var b []byte + + for i, c := range s { + r := mapping(c) + if b == nil { + if r == c { + continue + } + b = make([]byte, maxbytes) + nbytes = copy(b, s[:i]) + } + if r >= 0 { + wid := 1 + if r >= utf8.RuneSelf { + wid = utf8.RuneLen(r) + } + if nbytes+wid > maxbytes { + // Grow the buffer. + maxbytes = maxbytes*2 + utf8.UTFMax + nb := make([]byte, maxbytes) + copy(nb, b[0:nbytes]) + b = nb + } + nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r) + } + } + if b == nil { + return s + } + return string(b[0:nbytes]) +} + +// Repeat returns a new string consisting of count copies of the string s. +func Repeat(s string, count int) string { + b := make([]byte, len(s)*count) + bp := copy(b, s) + for bp < len(b) { + copy(b[bp:], b[:bp]) + bp *= 2 + } + return string(b) +} + +// ToUpper returns a copy of the string s with all Unicode letters mapped to their upper case. +func ToUpper(s string) string { return Map(unicode.ToUpper, s) } + +// ToLower returns a copy of the string s with all Unicode letters mapped to their lower case. +func ToLower(s string) string { return Map(unicode.ToLower, s) } + +// ToTitle returns a copy of the string s with all Unicode letters mapped to their title case. +func ToTitle(s string) string { return Map(unicode.ToTitle, s) } + +// ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their +// upper case, giving priority to the special casing rules. +func ToUpperSpecial(_case unicode.SpecialCase, s string) string { + return Map(func(r rune) rune { return _case.ToUpper(r) }, s) +} + +// ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their +// lower case, giving priority to the special casing rules. +func ToLowerSpecial(_case unicode.SpecialCase, s string) string { + return Map(func(r rune) rune { return _case.ToLower(r) }, s) +} + +// ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their +// title case, giving priority to the special casing rules. +func ToTitleSpecial(_case unicode.SpecialCase, s string) string { + return Map(func(r rune) rune { return _case.ToTitle(r) }, s) +} + +// isSeparator reports whether the rune could mark a word boundary. +// TODO: update when package unicode captures more of the properties. +func isSeparator(r rune) bool { + // ASCII alphanumerics and underscore are not separators + if r <= 0x7F { + switch { + case '0' <= r && r <= '9': + return false + case 'a' <= r && r <= 'z': + return false + case 'A' <= r && r <= 'Z': + return false + case r == '_': + return false + } + return true + } + // Letters and digits are not separators + if unicode.IsLetter(r) || unicode.IsDigit(r) { + return false + } + // Otherwise, all we can do for now is treat spaces as separators. + return unicode.IsSpace(r) +} + +// Title returns a copy of the string s with all Unicode letters that begin words +// mapped to their title case. +// +// BUG: The rule Title uses for word boundaries does not handle Unicode punctuation properly. +func Title(s string) string { + // Use a closure here to remember state. + // Hackish but effective. Depends on Map scanning in order and calling + // the closure once per rune. + prev := ' ' + return Map( + func(r rune) rune { + if isSeparator(prev) { + prev = r + return unicode.ToTitle(r) + } + prev = r + return r + }, + s) +} + +// TrimLeftFunc returns a slice of the string s with all leading +// Unicode code points c satisfying f(c) removed. +func TrimLeftFunc(s string, f func(rune) bool) string { + i := indexFunc(s, f, false) + if i == -1 { + return "" + } + return s[i:] +} + +// TrimRightFunc returns a slice of the string s with all trailing +// Unicode code points c satisfying f(c) removed. +func TrimRightFunc(s string, f func(rune) bool) string { + i := lastIndexFunc(s, f, false) + if i >= 0 && s[i] >= utf8.RuneSelf { + _, wid := utf8.DecodeRuneInString(s[i:]) + i += wid + } else { + i++ + } + return s[0:i] +} + +// TrimFunc returns a slice of the string s with all leading +// and trailing Unicode code points c satisfying f(c) removed. +func TrimFunc(s string, f func(rune) bool) string { + return TrimRightFunc(TrimLeftFunc(s, f), f) +} + +// IndexFunc returns the index into s of the first Unicode +// code point satisfying f(c), or -1 if none do. +func IndexFunc(s string, f func(rune) bool) int { + return indexFunc(s, f, true) +} + +// LastIndexFunc returns the index into s of the last +// Unicode code point satisfying f(c), or -1 if none do. +func LastIndexFunc(s string, f func(rune) bool) int { + return lastIndexFunc(s, f, true) +} + +// indexFunc is the same as IndexFunc except that if +// truth==false, the sense of the predicate function is +// inverted. +func indexFunc(s string, f func(rune) bool, truth bool) int { + start := 0 + for start < len(s) { + wid := 1 + r := rune(s[start]) + if r >= utf8.RuneSelf { + r, wid = utf8.DecodeRuneInString(s[start:]) + } + if f(r) == truth { + return start + } + start += wid + } + return -1 +} + +// lastIndexFunc is the same as LastIndexFunc except that if +// truth==false, the sense of the predicate function is +// inverted. +func lastIndexFunc(s string, f func(rune) bool, truth bool) int { + for i := len(s); i > 0; { + r, size := utf8.DecodeLastRuneInString(s[0:i]) + i -= size + if f(r) == truth { + return i + } + } + return -1 +} + +func makeCutsetFunc(cutset string) func(rune) bool { + return func(r rune) bool { return IndexRune(cutset, r) >= 0 } +} + +// Trim returns a slice of the string s with all leading and +// trailing Unicode code points contained in cutset removed. +func Trim(s string, cutset string) string { + if s == "" || cutset == "" { + return s + } + return TrimFunc(s, makeCutsetFunc(cutset)) +} + +// TrimLeft returns a slice of the string s with all leading +// Unicode code points contained in cutset removed. +func TrimLeft(s string, cutset string) string { + if s == "" || cutset == "" { + return s + } + return TrimLeftFunc(s, makeCutsetFunc(cutset)) +} + +// TrimRight returns a slice of the string s, with all trailing +// Unicode code points contained in cutset removed. +func TrimRight(s string, cutset string) string { + if s == "" || cutset == "" { + return s + } + return TrimRightFunc(s, makeCutsetFunc(cutset)) +} + +// TrimSpace returns a slice of the string s, with all leading +// and trailing white space removed, as defined by Unicode. +func TrimSpace(s string) string { + return TrimFunc(s, unicode.IsSpace) +} + +// TrimPrefix returns s without the provided leading prefix string. +// If s doesn't start with prefix, s is returned unchanged. +func TrimPrefix(s, prefix string) string { + if HasPrefix(s, prefix) { + return s[len(prefix):] + } + return s +} + +// TrimSuffix returns s without the provided trailing suffix string. +// If s doesn't end with suffix, s is returned unchanged. +func TrimSuffix(s, suffix string) string { + if HasSuffix(s, suffix) { + return s[:len(s)-len(suffix)] + } + return s +} + +// Replace returns a copy of the string s with the first n +// non-overlapping instances of old replaced by new. +// If old is empty, it matches at the beginning of the string +// and after each UTF-8 sequence, yielding up to k+1 replacements +// for a k-rune string. +// If n < 0, there is no limit on the number of replacements. +func Replace(s, old, new string, n int) string { + if old == new || n == 0 { + return s // avoid allocation + } + + // Compute number of replacements. + if m := Count(s, old); m == 0 { + return s // avoid allocation + } else if n < 0 || m < n { + n = m + } + + // Apply replacements to buffer. + t := make([]byte, len(s)+n*(len(new)-len(old))) + w := 0 + start := 0 + for i := 0; i < n; i++ { + j := start + if len(old) == 0 { + if i > 0 { + _, wid := utf8.DecodeRuneInString(s[start:]) + j += wid + } + } else { + j += Index(s[start:], old) + } + w += copy(t[w:], s[start:j]) + w += copy(t[w:], new) + start = j + len(old) + } + w += copy(t[w:], s[start:]) + return string(t[0:w]) +} + +// EqualFold reports whether s and t, interpreted as UTF-8 strings, +// are equal under Unicode case-folding. +func EqualFold(s, t string) bool { + for s != "" && t != "" { + // Extract first rune from each string. + var sr, tr rune + if s[0] < utf8.RuneSelf { + sr, s = rune(s[0]), s[1:] + } else { + r, size := utf8.DecodeRuneInString(s) + sr, s = r, s[size:] + } + if t[0] < utf8.RuneSelf { + tr, t = rune(t[0]), t[1:] + } else { + r, size := utf8.DecodeRuneInString(t) + tr, t = r, t[size:] + } + + // If they match, keep going; if not, return false. + + // Easy case. + if tr == sr { + continue + } + + // Make sr < tr to simplify what follows. + if tr < sr { + tr, sr = sr, tr + } + // Fast check for ASCII. + if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' { + // ASCII, and sr is upper case. tr must be lower case. + if tr == sr+'a'-'A' { + continue + } + return false + } + + // General case. SimpleFold(x) returns the next equivalent rune > x + // or wraps around to smaller values. + r := unicode.SimpleFold(sr) + for r != sr && r < tr { + r = unicode.SimpleFold(r) + } + if r == tr { + continue + } + return false + } + + // One string is empty. Are both? + return s == t +} diff --git a/src/strings/strings.s b/src/strings/strings.s new file mode 100644 index 000000000..55103bae0 --- /dev/null +++ b/src/strings/strings.s @@ -0,0 +1,5 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file is here just to make the go tool happy. diff --git a/src/strings/strings_decl.go b/src/strings/strings_decl.go new file mode 100644 index 000000000..810a696af --- /dev/null +++ b/src/strings/strings_decl.go @@ -0,0 +1,8 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +// IndexByte returns the index of the first instance of c in s, or -1 if c is not present in s. +func IndexByte(s string, c byte) int // ../runtime/asm_$GOARCH.s diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go new file mode 100644 index 000000000..7bb81ef3c --- /dev/null +++ b/src/strings/strings_test.go @@ -0,0 +1,1204 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings_test + +import ( + "bytes" + "io" + "math/rand" + "reflect" + . "strings" + "testing" + "unicode" + "unicode/utf8" + "unsafe" +) + +func eq(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := 0; i < len(a); i++ { + if a[i] != b[i] { + return false + } + } + return true +} + +var abcd = "abcd" +var faces = "☺☻☹" +var commas = "1,2,3,4" +var dots = "1....2....3....4" + +type IndexTest struct { + s string + sep string + out int +} + +var indexTests = []IndexTest{ + {"", "", 0}, + {"", "a", -1}, + {"", "foo", -1}, + {"fo", "foo", -1}, + {"foo", "foo", 0}, + {"oofofoofooo", "f", 2}, + {"oofofoofooo", "foo", 4}, + {"barfoobarfoo", "foo", 3}, + {"foo", "", 0}, + {"foo", "o", 1}, + {"abcABCabc", "A", 3}, + // cases with one byte strings - test special case in Index() + {"", "a", -1}, + {"x", "a", -1}, + {"x", "x", 0}, + {"abc", "a", 0}, + {"abc", "b", 1}, + {"abc", "c", 2}, + {"abc", "x", -1}, +} + +var lastIndexTests = []IndexTest{ + {"", "", 0}, + {"", "a", -1}, + {"", "foo", -1}, + {"fo", "foo", -1}, + {"foo", "foo", 0}, + {"foo", "f", 0}, + {"oofofoofooo", "f", 7}, + {"oofofoofooo", "foo", 7}, + {"barfoobarfoo", "foo", 9}, + {"foo", "", 3}, + {"foo", "o", 2}, + {"abcABCabc", "A", 3}, + {"abcABCabc", "a", 6}, +} + +var indexAnyTests = []IndexTest{ + {"", "", -1}, + {"", "a", -1}, + {"", "abc", -1}, + {"a", "", -1}, + {"a", "a", 0}, + {"aaa", "a", 0}, + {"abc", "xyz", -1}, + {"abc", "xcz", 2}, + {"a☺b☻c☹d", "uvw☻xyz", 2 + len("☺")}, + {"aRegExp*", ".(|)*+?^$[]", 7}, + {dots + dots + dots, " ", -1}, +} +var lastIndexAnyTests = []IndexTest{ + {"", "", -1}, + {"", "a", -1}, + {"", "abc", -1}, + {"a", "", -1}, + {"a", "a", 0}, + {"aaa", "a", 2}, + {"abc", "xyz", -1}, + {"abc", "ab", 1}, + {"a☺b☻c☹d", "uvw☻xyz", 2 + len("☺")}, + {"a.RegExp*", ".(|)*+?^$[]", 8}, + {dots + dots + dots, " ", -1}, +} + +// Execute f on each test case. funcName should be the name of f; it's used +// in failure reports. +func runIndexTests(t *testing.T, f func(s, sep string) int, funcName string, testCases []IndexTest) { + for _, test := range testCases { + actual := f(test.s, test.sep) + if actual != test.out { + t.Errorf("%s(%q,%q) = %v; want %v", funcName, test.s, test.sep, actual, test.out) + } + } +} + +func TestIndex(t *testing.T) { runIndexTests(t, Index, "Index", indexTests) } +func TestLastIndex(t *testing.T) { runIndexTests(t, LastIndex, "LastIndex", lastIndexTests) } +func TestIndexAny(t *testing.T) { runIndexTests(t, IndexAny, "IndexAny", indexAnyTests) } +func TestLastIndexAny(t *testing.T) { runIndexTests(t, LastIndexAny, "LastIndexAny", lastIndexAnyTests) } + +var indexRuneTests = []struct { + s string + rune rune + out int +}{ + {"a A x", 'A', 2}, + {"some_text=some_value", '=', 9}, + {"☺a", 'a', 3}, + {"a☻☺b", '☺', 4}, +} + +func TestIndexRune(t *testing.T) { + for _, test := range indexRuneTests { + if actual := IndexRune(test.s, test.rune); actual != test.out { + t.Errorf("IndexRune(%q,%d)= %v; want %v", test.s, test.rune, actual, test.out) + } + } +} + +const benchmarkString = "some_text=some☺value" + +func BenchmarkIndexRune(b *testing.B) { + if got := IndexRune(benchmarkString, '☺'); got != 14 { + b.Fatalf("wrong index: expected 14, got=%d", got) + } + for i := 0; i < b.N; i++ { + IndexRune(benchmarkString, '☺') + } +} + +func BenchmarkIndexRuneFastPath(b *testing.B) { + if got := IndexRune(benchmarkString, 'v'); got != 17 { + b.Fatalf("wrong index: expected 17, got=%d", got) + } + for i := 0; i < b.N; i++ { + IndexRune(benchmarkString, 'v') + } +} + +func BenchmarkIndex(b *testing.B) { + if got := Index(benchmarkString, "v"); got != 17 { + b.Fatalf("wrong index: expected 17, got=%d", got) + } + for i := 0; i < b.N; i++ { + Index(benchmarkString, "v") + } +} + +func BenchmarkLastIndex(b *testing.B) { + if got := Index(benchmarkString, "v"); got != 17 { + b.Fatalf("wrong index: expected 17, got=%d", got) + } + for i := 0; i < b.N; i++ { + LastIndex(benchmarkString, "v") + } +} + +func BenchmarkIndexByte(b *testing.B) { + if got := IndexByte(benchmarkString, 'v'); got != 17 { + b.Fatalf("wrong index: expected 17, got=%d", got) + } + for i := 0; i < b.N; i++ { + IndexByte(benchmarkString, 'v') + } +} + +var explodetests = []struct { + s string + n int + a []string +}{ + {"", -1, []string{}}, + {abcd, 4, []string{"a", "b", "c", "d"}}, + {faces, 3, []string{"☺", "☻", "☹"}}, + {abcd, 2, []string{"a", "bcd"}}, +} + +func TestExplode(t *testing.T) { + for _, tt := range explodetests { + a := SplitN(tt.s, "", tt.n) + if !eq(a, tt.a) { + t.Errorf("explode(%q, %d) = %v; want %v", tt.s, tt.n, a, tt.a) + continue + } + s := Join(a, "") + if s != tt.s { + t.Errorf(`Join(explode(%q, %d), "") = %q`, tt.s, tt.n, s) + } + } +} + +type SplitTest struct { + s string + sep string + n int + a []string +} + +var splittests = []SplitTest{ + {abcd, "a", 0, nil}, + {abcd, "a", -1, []string{"", "bcd"}}, + {abcd, "z", -1, []string{"abcd"}}, + {abcd, "", -1, []string{"a", "b", "c", "d"}}, + {commas, ",", -1, []string{"1", "2", "3", "4"}}, + {dots, "...", -1, []string{"1", ".2", ".3", ".4"}}, + {faces, "☹", -1, []string{"☺☻", ""}}, + {faces, "~", -1, []string{faces}}, + {faces, "", -1, []string{"☺", "☻", "☹"}}, + {"1 2 3 4", " ", 3, []string{"1", "2", "3 4"}}, + {"1 2", " ", 3, []string{"1", "2"}}, + {"123", "", 2, []string{"1", "23"}}, + {"123", "", 17, []string{"1", "2", "3"}}, +} + +func TestSplit(t *testing.T) { + for _, tt := range splittests { + a := SplitN(tt.s, tt.sep, tt.n) + if !eq(a, tt.a) { + t.Errorf("Split(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, a, tt.a) + continue + } + if tt.n == 0 { + continue + } + s := Join(a, tt.sep) + if s != tt.s { + t.Errorf("Join(Split(%q, %q, %d), %q) = %q", tt.s, tt.sep, tt.n, tt.sep, s) + } + if tt.n < 0 { + b := Split(tt.s, tt.sep) + if !reflect.DeepEqual(a, b) { + t.Errorf("Split disagrees with SplitN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a) + } + } + } +} + +var splitaftertests = []SplitTest{ + {abcd, "a", -1, []string{"a", "bcd"}}, + {abcd, "z", -1, []string{"abcd"}}, + {abcd, "", -1, []string{"a", "b", "c", "d"}}, + {commas, ",", -1, []string{"1,", "2,", "3,", "4"}}, + {dots, "...", -1, []string{"1...", ".2...", ".3...", ".4"}}, + {faces, "☹", -1, []string{"☺☻☹", ""}}, + {faces, "~", -1, []string{faces}}, + {faces, "", -1, []string{"☺", "☻", "☹"}}, + {"1 2 3 4", " ", 3, []string{"1 ", "2 ", "3 4"}}, + {"1 2 3", " ", 3, []string{"1 ", "2 ", "3"}}, + {"1 2", " ", 3, []string{"1 ", "2"}}, + {"123", "", 2, []string{"1", "23"}}, + {"123", "", 17, []string{"1", "2", "3"}}, +} + +func TestSplitAfter(t *testing.T) { + for _, tt := range splitaftertests { + a := SplitAfterN(tt.s, tt.sep, tt.n) + if !eq(a, tt.a) { + t.Errorf(`Split(%q, %q, %d) = %v; want %v`, tt.s, tt.sep, tt.n, a, tt.a) + continue + } + s := Join(a, "") + if s != tt.s { + t.Errorf(`Join(Split(%q, %q, %d), %q) = %q`, tt.s, tt.sep, tt.n, tt.sep, s) + } + if tt.n < 0 { + b := SplitAfter(tt.s, tt.sep) + if !reflect.DeepEqual(a, b) { + t.Errorf("SplitAfter disagrees with SplitAfterN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a) + } + } + } +} + +type FieldsTest struct { + s string + a []string +} + +var fieldstests = []FieldsTest{ + {"", []string{}}, + {" ", []string{}}, + {" \t ", []string{}}, + {" abc ", []string{"abc"}}, + {"1 2 3 4", []string{"1", "2", "3", "4"}}, + {"1 2 3 4", []string{"1", "2", "3", "4"}}, + {"1\t\t2\t\t3\t4", []string{"1", "2", "3", "4"}}, + {"1\u20002\u20013\u20024", []string{"1", "2", "3", "4"}}, + {"\u2000\u2001\u2002", []string{}}, + {"\n™\t™\n", []string{"™", "™"}}, + {faces, []string{faces}}, +} + +func TestFields(t *testing.T) { + for _, tt := range fieldstests { + a := Fields(tt.s) + if !eq(a, tt.a) { + t.Errorf("Fields(%q) = %v; want %v", tt.s, a, tt.a) + continue + } + } +} + +var FieldsFuncTests = []FieldsTest{ + {"", []string{}}, + {"XX", []string{}}, + {"XXhiXXX", []string{"hi"}}, + {"aXXbXXXcX", []string{"a", "b", "c"}}, +} + +func TestFieldsFunc(t *testing.T) { + for _, tt := range fieldstests { + a := FieldsFunc(tt.s, unicode.IsSpace) + if !eq(a, tt.a) { + t.Errorf("FieldsFunc(%q, unicode.IsSpace) = %v; want %v", tt.s, a, tt.a) + continue + } + } + pred := func(c rune) bool { return c == 'X' } + for _, tt := range FieldsFuncTests { + a := FieldsFunc(tt.s, pred) + if !eq(a, tt.a) { + t.Errorf("FieldsFunc(%q) = %v, want %v", tt.s, a, tt.a) + } + } +} + +// Test case for any function which accepts and returns a single string. +type StringTest struct { + in, out string +} + +// Execute f on each test case. funcName should be the name of f; it's used +// in failure reports. +func runStringTests(t *testing.T, f func(string) string, funcName string, testCases []StringTest) { + for _, tc := range testCases { + actual := f(tc.in) + if actual != tc.out { + t.Errorf("%s(%q) = %q; want %q", funcName, tc.in, actual, tc.out) + } + } +} + +var upperTests = []StringTest{ + {"", ""}, + {"abc", "ABC"}, + {"AbC123", "ABC123"}, + {"azAZ09_", "AZAZ09_"}, + {"\u0250\u0250\u0250\u0250\u0250", "\u2C6F\u2C6F\u2C6F\u2C6F\u2C6F"}, // grows one byte per char +} + +var lowerTests = []StringTest{ + {"", ""}, + {"abc", "abc"}, + {"AbC123", "abc123"}, + {"azAZ09_", "azaz09_"}, + {"\u2C6D\u2C6D\u2C6D\u2C6D\u2C6D", "\u0251\u0251\u0251\u0251\u0251"}, // shrinks one byte per char +} + +const space = "\t\v\r\f\n\u0085\u00a0\u2000\u3000" + +var trimSpaceTests = []StringTest{ + {"", ""}, + {"abc", "abc"}, + {space + "abc" + space, "abc"}, + {" ", ""}, + {" \t\r\n \t\t\r\r\n\n ", ""}, + {" \t\r\n x\t\t\r\r\n\n ", "x"}, + {" \u2000\t\r\n x\t\t\r\r\ny\n \u3000", "x\t\t\r\r\ny"}, + {"1 \t\r\n2", "1 \t\r\n2"}, + {" x\x80", "x\x80"}, + {" x\xc0", "x\xc0"}, + {"x \xc0\xc0 ", "x \xc0\xc0"}, + {"x \xc0", "x \xc0"}, + {"x \xc0 ", "x \xc0"}, + {"x \xc0\xc0 ", "x \xc0\xc0"}, + {"x ☺\xc0\xc0 ", "x ☺\xc0\xc0"}, + {"x ☺ ", "x ☺"}, +} + +func tenRunes(ch rune) string { + r := make([]rune, 10) + for i := range r { + r[i] = ch + } + return string(r) +} + +// User-defined self-inverse mapping function +func rot13(r rune) rune { + step := rune(13) + if r >= 'a' && r <= 'z' { + return ((r - 'a' + step) % 26) + 'a' + } + if r >= 'A' && r <= 'Z' { + return ((r - 'A' + step) % 26) + 'A' + } + return r +} + +func TestMap(t *testing.T) { + // Run a couple of awful growth/shrinkage tests + a := tenRunes('a') + // 1. Grow. This triggers two reallocations in Map. + maxRune := func(rune) rune { return unicode.MaxRune } + m := Map(maxRune, a) + expect := tenRunes(unicode.MaxRune) + if m != expect { + t.Errorf("growing: expected %q got %q", expect, m) + } + + // 2. Shrink + minRune := func(rune) rune { return 'a' } + m = Map(minRune, tenRunes(unicode.MaxRune)) + expect = a + if m != expect { + t.Errorf("shrinking: expected %q got %q", expect, m) + } + + // 3. Rot13 + m = Map(rot13, "a to zed") + expect = "n gb mrq" + if m != expect { + t.Errorf("rot13: expected %q got %q", expect, m) + } + + // 4. Rot13^2 + m = Map(rot13, Map(rot13, "a to zed")) + expect = "a to zed" + if m != expect { + t.Errorf("rot13: expected %q got %q", expect, m) + } + + // 5. Drop + dropNotLatin := func(r rune) rune { + if unicode.Is(unicode.Latin, r) { + return r + } + return -1 + } + m = Map(dropNotLatin, "Hello, 세계") + expect = "Hello" + if m != expect { + t.Errorf("drop: expected %q got %q", expect, m) + } + + // 6. Identity + identity := func(r rune) rune { + return r + } + orig := "Input string that we expect not to be copied." + m = Map(identity, orig) + if (*reflect.StringHeader)(unsafe.Pointer(&orig)).Data != + (*reflect.StringHeader)(unsafe.Pointer(&m)).Data { + t.Error("unexpected copy during identity map") + } +} + +func TestToUpper(t *testing.T) { runStringTests(t, ToUpper, "ToUpper", upperTests) } + +func TestToLower(t *testing.T) { runStringTests(t, ToLower, "ToLower", lowerTests) } + +func BenchmarkMapNoChanges(b *testing.B) { + identity := func(r rune) rune { + return r + } + for i := 0; i < b.N; i++ { + Map(identity, "Some string that won't be modified.") + } +} + +func TestSpecialCase(t *testing.T) { + lower := "abcçdefgğhıijklmnoöprsştuüvyz" + upper := "ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ" + u := ToUpperSpecial(unicode.TurkishCase, upper) + if u != upper { + t.Errorf("Upper(upper) is %s not %s", u, upper) + } + u = ToUpperSpecial(unicode.TurkishCase, lower) + if u != upper { + t.Errorf("Upper(lower) is %s not %s", u, upper) + } + l := ToLowerSpecial(unicode.TurkishCase, lower) + if l != lower { + t.Errorf("Lower(lower) is %s not %s", l, lower) + } + l = ToLowerSpecial(unicode.TurkishCase, upper) + if l != lower { + t.Errorf("Lower(upper) is %s not %s", l, lower) + } +} + +func TestTrimSpace(t *testing.T) { runStringTests(t, TrimSpace, "TrimSpace", trimSpaceTests) } + +var trimTests = []struct { + f string + in, arg, out string +}{ + {"Trim", "abba", "a", "bb"}, + {"Trim", "abba", "ab", ""}, + {"TrimLeft", "abba", "ab", ""}, + {"TrimRight", "abba", "ab", ""}, + {"TrimLeft", "abba", "a", "bba"}, + {"TrimRight", "abba", "a", "abb"}, + {"Trim", "<tag>", "<>", "tag"}, + {"Trim", "* listitem", " *", "listitem"}, + {"Trim", `"quote"`, `"`, "quote"}, + {"Trim", "\u2C6F\u2C6F\u0250\u0250\u2C6F\u2C6F", "\u2C6F", "\u0250\u0250"}, + //empty string tests + {"Trim", "abba", "", "abba"}, + {"Trim", "", "123", ""}, + {"Trim", "", "", ""}, + {"TrimLeft", "abba", "", "abba"}, + {"TrimLeft", "", "123", ""}, + {"TrimLeft", "", "", ""}, + {"TrimRight", "abba", "", "abba"}, + {"TrimRight", "", "123", ""}, + {"TrimRight", "", "", ""}, + {"TrimRight", "☺\xc0", "☺", "☺\xc0"}, + {"TrimPrefix", "aabb", "a", "abb"}, + {"TrimPrefix", "aabb", "b", "aabb"}, + {"TrimSuffix", "aabb", "a", "aabb"}, + {"TrimSuffix", "aabb", "b", "aab"}, +} + +func TestTrim(t *testing.T) { + for _, tc := range trimTests { + name := tc.f + var f func(string, string) string + switch name { + case "Trim": + f = Trim + case "TrimLeft": + f = TrimLeft + case "TrimRight": + f = TrimRight + case "TrimPrefix": + f = TrimPrefix + case "TrimSuffix": + f = TrimSuffix + default: + t.Errorf("Undefined trim function %s", name) + } + actual := f(tc.in, tc.arg) + if actual != tc.out { + t.Errorf("%s(%q, %q) = %q; want %q", name, tc.in, tc.arg, actual, tc.out) + } + } +} + +type predicate struct { + f func(rune) bool + name string +} + +var isSpace = predicate{unicode.IsSpace, "IsSpace"} +var isDigit = predicate{unicode.IsDigit, "IsDigit"} +var isUpper = predicate{unicode.IsUpper, "IsUpper"} +var isValidRune = predicate{ + func(r rune) bool { + return r != utf8.RuneError + }, + "IsValidRune", +} + +func not(p predicate) predicate { + return predicate{ + func(r rune) bool { + return !p.f(r) + }, + "not " + p.name, + } +} + +var trimFuncTests = []struct { + f predicate + in, out string +}{ + {isSpace, space + " hello " + space, "hello"}, + {isDigit, "\u0e50\u0e5212hello34\u0e50\u0e51", "hello"}, + {isUpper, "\u2C6F\u2C6F\u2C6F\u2C6FABCDhelloEF\u2C6F\u2C6FGH\u2C6F\u2C6F", "hello"}, + {not(isSpace), "hello" + space + "hello", space}, + {not(isDigit), "hello\u0e50\u0e521234\u0e50\u0e51helo", "\u0e50\u0e521234\u0e50\u0e51"}, + {isValidRune, "ab\xc0a\xc0cd", "\xc0a\xc0"}, + {not(isValidRune), "\xc0a\xc0", "a"}, +} + +func TestTrimFunc(t *testing.T) { + for _, tc := range trimFuncTests { + actual := TrimFunc(tc.in, tc.f.f) + if actual != tc.out { + t.Errorf("TrimFunc(%q, %q) = %q; want %q", tc.in, tc.f.name, actual, tc.out) + } + } +} + +var indexFuncTests = []struct { + in string + f predicate + first, last int +}{ + {"", isValidRune, -1, -1}, + {"abc", isDigit, -1, -1}, + {"0123", isDigit, 0, 3}, + {"a1b", isDigit, 1, 1}, + {space, isSpace, 0, len(space) - 3}, // last rune in space is 3 bytes + {"\u0e50\u0e5212hello34\u0e50\u0e51", isDigit, 0, 18}, + {"\u2C6F\u2C6F\u2C6F\u2C6FABCDhelloEF\u2C6F\u2C6FGH\u2C6F\u2C6F", isUpper, 0, 34}, + {"12\u0e50\u0e52hello34\u0e50\u0e51", not(isDigit), 8, 12}, + + // tests of invalid UTF-8 + {"\x801", isDigit, 1, 1}, + {"\x80abc", isDigit, -1, -1}, + {"\xc0a\xc0", isValidRune, 1, 1}, + {"\xc0a\xc0", not(isValidRune), 0, 2}, + {"\xc0☺\xc0", not(isValidRune), 0, 4}, + {"\xc0☺\xc0\xc0", not(isValidRune), 0, 5}, + {"ab\xc0a\xc0cd", not(isValidRune), 2, 4}, + {"a\xe0\x80cd", not(isValidRune), 1, 2}, + {"\x80\x80\x80\x80", not(isValidRune), 0, 3}, +} + +func TestIndexFunc(t *testing.T) { + for _, tc := range indexFuncTests { + first := IndexFunc(tc.in, tc.f.f) + if first != tc.first { + t.Errorf("IndexFunc(%q, %s) = %d; want %d", tc.in, tc.f.name, first, tc.first) + } + last := LastIndexFunc(tc.in, tc.f.f) + if last != tc.last { + t.Errorf("LastIndexFunc(%q, %s) = %d; want %d", tc.in, tc.f.name, last, tc.last) + } + } +} + +func equal(m string, s1, s2 string, t *testing.T) bool { + if s1 == s2 { + return true + } + e1 := Split(s1, "") + e2 := Split(s2, "") + for i, c1 := range e1 { + if i >= len(e2) { + break + } + r1, _ := utf8.DecodeRuneInString(c1) + r2, _ := utf8.DecodeRuneInString(e2[i]) + if r1 != r2 { + t.Errorf("%s diff at %d: U+%04X U+%04X", m, i, r1, r2) + } + } + return false +} + +func TestCaseConsistency(t *testing.T) { + // Make a string of all the runes. + numRunes := int(unicode.MaxRune + 1) + if testing.Short() { + numRunes = 1000 + } + a := make([]rune, numRunes) + for i := range a { + a[i] = rune(i) + } + s := string(a) + // convert the cases. + upper := ToUpper(s) + lower := ToLower(s) + + // Consistency checks + if n := utf8.RuneCountInString(upper); n != numRunes { + t.Error("rune count wrong in upper:", n) + } + if n := utf8.RuneCountInString(lower); n != numRunes { + t.Error("rune count wrong in lower:", n) + } + if !equal("ToUpper(upper)", ToUpper(upper), upper, t) { + t.Error("ToUpper(upper) consistency fail") + } + if !equal("ToLower(lower)", ToLower(lower), lower, t) { + t.Error("ToLower(lower) consistency fail") + } + /* + These fail because of non-one-to-oneness of the data, such as multiple + upper case 'I' mapping to 'i'. We comment them out but keep them for + interest. + For instance: CAPITAL LETTER I WITH DOT ABOVE: + unicode.ToUpper(unicode.ToLower('\u0130')) != '\u0130' + + if !equal("ToUpper(lower)", ToUpper(lower), upper, t) { + t.Error("ToUpper(lower) consistency fail"); + } + if !equal("ToLower(upper)", ToLower(upper), lower, t) { + t.Error("ToLower(upper) consistency fail"); + } + */ +} + +var RepeatTests = []struct { + in, out string + count int +}{ + {"", "", 0}, + {"", "", 1}, + {"", "", 2}, + {"-", "", 0}, + {"-", "-", 1}, + {"-", "----------", 10}, + {"abc ", "abc abc abc ", 3}, +} + +func TestRepeat(t *testing.T) { + for _, tt := range RepeatTests { + a := Repeat(tt.in, tt.count) + if !equal("Repeat(s)", a, tt.out, t) { + t.Errorf("Repeat(%v, %d) = %v; want %v", tt.in, tt.count, a, tt.out) + continue + } + } +} + +func runesEqual(a, b []rune) bool { + if len(a) != len(b) { + return false + } + for i, r := range a { + if r != b[i] { + return false + } + } + return true +} + +var RunesTests = []struct { + in string + out []rune + lossy bool +}{ + {"", []rune{}, false}, + {" ", []rune{32}, false}, + {"ABC", []rune{65, 66, 67}, false}, + {"abc", []rune{97, 98, 99}, false}, + {"\u65e5\u672c\u8a9e", []rune{26085, 26412, 35486}, false}, + {"ab\x80c", []rune{97, 98, 0xFFFD, 99}, true}, + {"ab\xc0c", []rune{97, 98, 0xFFFD, 99}, true}, +} + +func TestRunes(t *testing.T) { + for _, tt := range RunesTests { + a := []rune(tt.in) + if !runesEqual(a, tt.out) { + t.Errorf("[]rune(%q) = %v; want %v", tt.in, a, tt.out) + continue + } + if !tt.lossy { + // can only test reassembly if we didn't lose information + s := string(a) + if s != tt.in { + t.Errorf("string([]rune(%q)) = %x; want %x", tt.in, s, tt.in) + } + } + } +} + +func TestReadByte(t *testing.T) { + testStrings := []string{"", abcd, faces, commas} + for _, s := range testStrings { + reader := NewReader(s) + if e := reader.UnreadByte(); e == nil { + t.Errorf("Unreading %q at beginning: expected error", s) + } + var res bytes.Buffer + for { + b, e := reader.ReadByte() + if e == io.EOF { + break + } + if e != nil { + t.Errorf("Reading %q: %s", s, e) + break + } + res.WriteByte(b) + // unread and read again + e = reader.UnreadByte() + if e != nil { + t.Errorf("Unreading %q: %s", s, e) + break + } + b1, e := reader.ReadByte() + if e != nil { + t.Errorf("Reading %q after unreading: %s", s, e) + break + } + if b1 != b { + t.Errorf("Reading %q after unreading: want byte %q, got %q", s, b, b1) + break + } + } + if res.String() != s { + t.Errorf("Reader(%q).ReadByte() produced %q", s, res.String()) + } + } +} + +func TestReadRune(t *testing.T) { + testStrings := []string{"", abcd, faces, commas} + for _, s := range testStrings { + reader := NewReader(s) + if e := reader.UnreadRune(); e == nil { + t.Errorf("Unreading %q at beginning: expected error", s) + } + res := "" + for { + r, z, e := reader.ReadRune() + if e == io.EOF { + break + } + if e != nil { + t.Errorf("Reading %q: %s", s, e) + break + } + res += string(r) + // unread and read again + e = reader.UnreadRune() + if e != nil { + t.Errorf("Unreading %q: %s", s, e) + break + } + r1, z1, e := reader.ReadRune() + if e != nil { + t.Errorf("Reading %q after unreading: %s", s, e) + break + } + if r1 != r { + t.Errorf("Reading %q after unreading: want rune %q, got %q", s, r, r1) + break + } + if z1 != z { + t.Errorf("Reading %q after unreading: want size %d, got %d", s, z, z1) + break + } + } + if res != s { + t.Errorf("Reader(%q).ReadRune() produced %q", s, res) + } + } +} + +var UnreadRuneErrorTests = []struct { + name string + f func(*Reader) +}{ + {"Read", func(r *Reader) { r.Read([]byte{0}) }}, + {"ReadByte", func(r *Reader) { r.ReadByte() }}, + {"UnreadRune", func(r *Reader) { r.UnreadRune() }}, + {"Seek", func(r *Reader) { r.Seek(0, 1) }}, + {"WriteTo", func(r *Reader) { r.WriteTo(&bytes.Buffer{}) }}, +} + +func TestUnreadRuneError(t *testing.T) { + for _, tt := range UnreadRuneErrorTests { + reader := NewReader("0123456789") + if _, _, err := reader.ReadRune(); err != nil { + // should not happen + t.Fatal(err) + } + tt.f(reader) + err := reader.UnreadRune() + if err == nil { + t.Errorf("Unreading after %s: expected error", tt.name) + } + } +} + +var ReplaceTests = []struct { + in string + old, new string + n int + out string +}{ + {"hello", "l", "L", 0, "hello"}, + {"hello", "l", "L", -1, "heLLo"}, + {"hello", "x", "X", -1, "hello"}, + {"", "x", "X", -1, ""}, + {"radar", "r", "<r>", -1, "<r>ada<r>"}, + {"", "", "<>", -1, "<>"}, + {"banana", "a", "<>", -1, "b<>n<>n<>"}, + {"banana", "a", "<>", 1, "b<>nana"}, + {"banana", "a", "<>", 1000, "b<>n<>n<>"}, + {"banana", "an", "<>", -1, "b<><>a"}, + {"banana", "ana", "<>", -1, "b<>na"}, + {"banana", "", "<>", -1, "<>b<>a<>n<>a<>n<>a<>"}, + {"banana", "", "<>", 10, "<>b<>a<>n<>a<>n<>a<>"}, + {"banana", "", "<>", 6, "<>b<>a<>n<>a<>n<>a"}, + {"banana", "", "<>", 5, "<>b<>a<>n<>a<>na"}, + {"banana", "", "<>", 1, "<>banana"}, + {"banana", "a", "a", -1, "banana"}, + {"banana", "a", "a", 1, "banana"}, + {"☺☻☹", "", "<>", -1, "<>☺<>☻<>☹<>"}, +} + +func TestReplace(t *testing.T) { + for _, tt := range ReplaceTests { + if s := Replace(tt.in, tt.old, tt.new, tt.n); s != tt.out { + t.Errorf("Replace(%q, %q, %q, %d) = %q, want %q", tt.in, tt.old, tt.new, tt.n, s, tt.out) + } + } +} + +var TitleTests = []struct { + in, out string +}{ + {"", ""}, + {"a", "A"}, + {" aaa aaa aaa ", " Aaa Aaa Aaa "}, + {" Aaa Aaa Aaa ", " Aaa Aaa Aaa "}, + {"123a456", "123a456"}, + {"double-blind", "Double-Blind"}, + {"ÿøû", "Ÿøû"}, + {"with_underscore", "With_underscore"}, + {"unicode \xe2\x80\xa8 line separator", "Unicode \xe2\x80\xa8 Line Separator"}, +} + +func TestTitle(t *testing.T) { + for _, tt := range TitleTests { + if s := Title(tt.in); s != tt.out { + t.Errorf("Title(%q) = %q, want %q", tt.in, s, tt.out) + } + } +} + +var ContainsTests = []struct { + str, substr string + expected bool +}{ + {"abc", "bc", true}, + {"abc", "bcd", false}, + {"abc", "", true}, + {"", "a", false}, +} + +func TestContains(t *testing.T) { + for _, ct := range ContainsTests { + if Contains(ct.str, ct.substr) != ct.expected { + t.Errorf("Contains(%s, %s) = %v, want %v", + ct.str, ct.substr, !ct.expected, ct.expected) + } + } +} + +var ContainsAnyTests = []struct { + str, substr string + expected bool +}{ + {"", "", false}, + {"", "a", false}, + {"", "abc", false}, + {"a", "", false}, + {"a", "a", true}, + {"aaa", "a", true}, + {"abc", "xyz", false}, + {"abc", "xcz", true}, + {"a☺b☻c☹d", "uvw☻xyz", true}, + {"aRegExp*", ".(|)*+?^$[]", true}, + {dots + dots + dots, " ", false}, +} + +func TestContainsAny(t *testing.T) { + for _, ct := range ContainsAnyTests { + if ContainsAny(ct.str, ct.substr) != ct.expected { + t.Errorf("ContainsAny(%s, %s) = %v, want %v", + ct.str, ct.substr, !ct.expected, ct.expected) + } + } +} + +var ContainsRuneTests = []struct { + str string + r rune + expected bool +}{ + {"", 'a', false}, + {"a", 'a', true}, + {"aaa", 'a', true}, + {"abc", 'y', false}, + {"abc", 'c', true}, + {"a☺b☻c☹d", 'x', false}, + {"a☺b☻c☹d", '☻', true}, + {"aRegExp*", '*', true}, +} + +func TestContainsRune(t *testing.T) { + for _, ct := range ContainsRuneTests { + if ContainsRune(ct.str, ct.r) != ct.expected { + t.Errorf("ContainsRune(%q, %q) = %v, want %v", + ct.str, ct.r, !ct.expected, ct.expected) + } + } +} + +var EqualFoldTests = []struct { + s, t string + out bool +}{ + {"abc", "abc", true}, + {"ABcd", "ABcd", true}, + {"123abc", "123ABC", true}, + {"αβδ", "ΑΒΔ", true}, + {"abc", "xyz", false}, + {"abc", "XYZ", false}, + {"abcdefghijk", "abcdefghijX", false}, + {"abcdefghijk", "abcdefghij\u212A", true}, + {"abcdefghijK", "abcdefghij\u212A", true}, + {"abcdefghijkz", "abcdefghij\u212Ay", false}, + {"abcdefghijKz", "abcdefghij\u212Ay", false}, +} + +func TestEqualFold(t *testing.T) { + for _, tt := range EqualFoldTests { + if out := EqualFold(tt.s, tt.t); out != tt.out { + t.Errorf("EqualFold(%#q, %#q) = %v, want %v", tt.s, tt.t, out, tt.out) + } + if out := EqualFold(tt.t, tt.s); out != tt.out { + t.Errorf("EqualFold(%#q, %#q) = %v, want %v", tt.t, tt.s, out, tt.out) + } + } +} + +var CountTests = []struct { + s, sep string + num int +}{ + {"", "", 1}, + {"", "notempty", 0}, + {"notempty", "", 9}, + {"smaller", "not smaller", 0}, + {"12345678987654321", "6", 2}, + {"611161116", "6", 3}, + {"notequal", "NotEqual", 0}, + {"equal", "equal", 1}, + {"abc1231231123q", "123", 3}, + {"11111", "11", 2}, +} + +func TestCount(t *testing.T) { + for _, tt := range CountTests { + if num := Count(tt.s, tt.sep); num != tt.num { + t.Errorf("Count(\"%s\", \"%s\") = %d, want %d", tt.s, tt.sep, num, tt.num) + } + } +} + +func makeBenchInputHard() string { + tokens := [...]string{ + "<a>", "<p>", "<b>", "<strong>", + "</a>", "</p>", "</b>", "</strong>", + "hello", "world", + } + x := make([]byte, 0, 1<<20) + for { + i := rand.Intn(len(tokens)) + if len(x)+len(tokens[i]) >= 1<<20 { + break + } + x = append(x, tokens[i]...) + } + return string(x) +} + +var benchInputHard = makeBenchInputHard() + +func benchmarkIndexHard(b *testing.B, sep string) { + for i := 0; i < b.N; i++ { + Index(benchInputHard, sep) + } +} + +func benchmarkLastIndexHard(b *testing.B, sep string) { + for i := 0; i < b.N; i++ { + LastIndex(benchInputHard, sep) + } +} + +func benchmarkCountHard(b *testing.B, sep string) { + for i := 0; i < b.N; i++ { + Count(benchInputHard, sep) + } +} + +func BenchmarkIndexHard1(b *testing.B) { benchmarkIndexHard(b, "<>") } +func BenchmarkIndexHard2(b *testing.B) { benchmarkIndexHard(b, "</pre>") } +func BenchmarkIndexHard3(b *testing.B) { benchmarkIndexHard(b, "<b>hello world</b>") } + +func BenchmarkLastIndexHard1(b *testing.B) { benchmarkLastIndexHard(b, "<>") } +func BenchmarkLastIndexHard2(b *testing.B) { benchmarkLastIndexHard(b, "</pre>") } +func BenchmarkLastIndexHard3(b *testing.B) { benchmarkLastIndexHard(b, "<b>hello world</b>") } + +func BenchmarkCountHard1(b *testing.B) { benchmarkCountHard(b, "<>") } +func BenchmarkCountHard2(b *testing.B) { benchmarkCountHard(b, "</pre>") } +func BenchmarkCountHard3(b *testing.B) { benchmarkCountHard(b, "<b>hello world</b>") } + +var benchInputTorture = Repeat("ABC", 1<<10) + "123" + Repeat("ABC", 1<<10) +var benchNeedleTorture = Repeat("ABC", 1<<10+1) + +func BenchmarkIndexTorture(b *testing.B) { + for i := 0; i < b.N; i++ { + Index(benchInputTorture, benchNeedleTorture) + } +} + +func BenchmarkCountTorture(b *testing.B) { + for i := 0; i < b.N; i++ { + Count(benchInputTorture, benchNeedleTorture) + } +} + +func BenchmarkCountTortureOverlapping(b *testing.B) { + A := Repeat("ABC", 1<<20) + B := Repeat("ABC", 1<<10) + for i := 0; i < b.N; i++ { + Count(A, B) + } +} + +var makeFieldsInput = func() string { + x := make([]byte, 1<<20) + // Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space. + for i := range x { + switch rand.Intn(10) { + case 0: + x[i] = ' ' + case 1: + if i > 0 && x[i-1] == 'x' { + copy(x[i-1:], "χ") + break + } + fallthrough + default: + x[i] = 'x' + } + } + return string(x) +} + +var fieldsInput = makeFieldsInput() + +func BenchmarkFields(b *testing.B) { + b.SetBytes(int64(len(fieldsInput))) + for i := 0; i < b.N; i++ { + Fields(fieldsInput) + } +} + +func BenchmarkFieldsFunc(b *testing.B) { + b.SetBytes(int64(len(fieldsInput))) + for i := 0; i < b.N; i++ { + FieldsFunc(fieldsInput, unicode.IsSpace) + } +} + +func BenchmarkSplit1(b *testing.B) { + for i := 0; i < b.N; i++ { + Split(benchInputHard, "") + } +} + +func BenchmarkSplit2(b *testing.B) { + for i := 0; i < b.N; i++ { + Split(benchInputHard, "/") + } +} + +func BenchmarkSplit3(b *testing.B) { + for i := 0; i < b.N; i++ { + Split(benchInputHard, "hello") + } +} + +func BenchmarkRepeat(b *testing.B) { + for i := 0; i < b.N; i++ { + Repeat("-", 80) + } +} |