358 lines
9.2 KiB
Go
358 lines
9.2 KiB
Go
|
// Copyright 2015 The Go Authors. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style
|
||
|
// license that can be found in the LICENSE file.
|
||
|
|
||
|
package search
|
||
|
|
||
|
import (
|
||
|
"reflect"
|
||
|
"strings"
|
||
|
"testing"
|
||
|
|
||
|
"golang.org/x/text/language"
|
||
|
)
|
||
|
|
||
|
func TestCompile(t *testing.T) {
|
||
|
for i, tc := range []struct {
|
||
|
desc string
|
||
|
pattern string
|
||
|
options []Option
|
||
|
n int
|
||
|
}{{
|
||
|
desc: "empty",
|
||
|
pattern: "",
|
||
|
n: 0,
|
||
|
}, {
|
||
|
desc: "single",
|
||
|
pattern: "a",
|
||
|
n: 1,
|
||
|
}, {
|
||
|
desc: "keep modifier",
|
||
|
pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
|
||
|
n: 2,
|
||
|
}, {
|
||
|
desc: "remove modifier",
|
||
|
pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
|
||
|
options: []Option{IgnoreDiacritics},
|
||
|
n: 1,
|
||
|
}, {
|
||
|
desc: "single with double collation element",
|
||
|
pattern: "ä",
|
||
|
n: 2,
|
||
|
}, {
|
||
|
desc: "leading variable",
|
||
|
pattern: " a",
|
||
|
n: 2,
|
||
|
}, {
|
||
|
desc: "trailing variable",
|
||
|
pattern: "aa ",
|
||
|
n: 3,
|
||
|
}, {
|
||
|
desc: "leading and trailing variable",
|
||
|
pattern: " äb ",
|
||
|
n: 5,
|
||
|
}, {
|
||
|
desc: "keep interior variable",
|
||
|
pattern: " ä b ",
|
||
|
n: 6,
|
||
|
}, {
|
||
|
desc: "keep interior variables",
|
||
|
pattern: " b ä ",
|
||
|
n: 7,
|
||
|
}, {
|
||
|
desc: "remove ignoreables (zero-weights across the board)",
|
||
|
pattern: "\u009Db\u009Dä\u009D", // U+009D: OPERATING SYSTEM COMMAND
|
||
|
n: 3,
|
||
|
}} {
|
||
|
m := New(language.Und, tc.options...)
|
||
|
p := m.CompileString(tc.pattern)
|
||
|
if len(p.ce) != tc.n {
|
||
|
t.Errorf("%d:%s: Compile(%+q): got %d; want %d", i, tc.desc, tc.pattern, len(p.ce), tc.n)
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func TestNorm(t *testing.T) {
|
||
|
// U+0300: COMBINING GRAVE ACCENT (CCC=230)
|
||
|
// U+031B: COMBINING HORN (CCC=216)
|
||
|
for _, tc := range []struct {
|
||
|
desc string
|
||
|
a string
|
||
|
b string
|
||
|
want bool // a and b compile into the same pattern?
|
||
|
}{{
|
||
|
"simple",
|
||
|
"eee\u0300\u031b",
|
||
|
"eee\u031b\u0300",
|
||
|
true,
|
||
|
}, {
|
||
|
"large number of modifiers in pattern",
|
||
|
strings.Repeat("\u0300", 29) + "\u0318",
|
||
|
"\u0318" + strings.Repeat("\u0300", 29),
|
||
|
true,
|
||
|
}, {
|
||
|
"modifier overflow in pattern",
|
||
|
strings.Repeat("\u0300", 30) + "\u0318",
|
||
|
"\u0318" + strings.Repeat("\u0300", 30),
|
||
|
false,
|
||
|
}} {
|
||
|
m := New(language.Und)
|
||
|
a := m.CompileString(tc.a)
|
||
|
b := m.CompileString(tc.b)
|
||
|
if got := reflect.DeepEqual(a, b); got != tc.want {
|
||
|
t.Errorf("Compile(a) == Compile(b) == %v; want %v", got, tc.want)
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func TestForwardSearch(t *testing.T) {
|
||
|
for i, tc := range []struct {
|
||
|
desc string
|
||
|
tag string
|
||
|
options []Option
|
||
|
pattern string
|
||
|
text string
|
||
|
want []int
|
||
|
}{{
|
||
|
// The semantics of an empty search is to match nothing.
|
||
|
// TODO: change this to be in line with strings.Index? It is quite a
|
||
|
// different beast, so not sure yet.
|
||
|
|
||
|
desc: "empty pattern and text",
|
||
|
tag: "und",
|
||
|
pattern: "",
|
||
|
text: "",
|
||
|
want: nil, // TODO: consider: []int{0, 0},
|
||
|
}, {
|
||
|
desc: "non-empty pattern and empty text",
|
||
|
tag: "und",
|
||
|
pattern: " ",
|
||
|
text: "",
|
||
|
want: nil,
|
||
|
}, {
|
||
|
desc: "empty pattern and non-empty text",
|
||
|
tag: "und",
|
||
|
pattern: "",
|
||
|
text: "abc",
|
||
|
want: nil, // TODO: consider: []int{0, 0, 1, 1, 2, 2, 3, 3},
|
||
|
}, {
|
||
|
// Variable-only patterns. We don't support variables at the moment,
|
||
|
// but verify that, given this, the behavior is indeed as expected.
|
||
|
|
||
|
desc: "exact match of variable",
|
||
|
tag: "und",
|
||
|
pattern: " ",
|
||
|
text: " ",
|
||
|
want: []int{0, 1},
|
||
|
}, {
|
||
|
desc: "variables not handled by default",
|
||
|
tag: "und",
|
||
|
pattern: "- ",
|
||
|
text: " -",
|
||
|
want: nil, // Would be (1, 2) for a median match with variable}.
|
||
|
}, {
|
||
|
desc: "multiple subsequent identical variables",
|
||
|
tag: "und",
|
||
|
pattern: " ",
|
||
|
text: " ",
|
||
|
want: []int{0, 1, 1, 2, 2, 3, 3, 4},
|
||
|
}, {
|
||
|
desc: "text with variables",
|
||
|
tag: "und",
|
||
|
options: []Option{IgnoreDiacritics},
|
||
|
pattern: "abc",
|
||
|
text: "3 abc 3",
|
||
|
want: []int{2, 5},
|
||
|
}, {
|
||
|
desc: "pattern with interior variables",
|
||
|
tag: "und",
|
||
|
options: []Option{IgnoreDiacritics},
|
||
|
pattern: "a b c",
|
||
|
text: "3 a b c abc a b c 3",
|
||
|
want: []int{2, 7}, // Would have 3 matches using variable.
|
||
|
|
||
|
// TODO: Different variable handling settings.
|
||
|
}, {
|
||
|
// Options.
|
||
|
|
||
|
desc: "match all levels",
|
||
|
tag: "und",
|
||
|
pattern: "Abc",
|
||
|
text: "abcAbcABCÁbcábc",
|
||
|
want: []int{3, 6},
|
||
|
}, {
|
||
|
desc: "ignore diacritics in text",
|
||
|
tag: "und",
|
||
|
options: []Option{IgnoreDiacritics},
|
||
|
pattern: "Abc",
|
||
|
text: "Ábc",
|
||
|
want: []int{0, 4},
|
||
|
}, {
|
||
|
desc: "ignore diacritics in pattern",
|
||
|
tag: "und",
|
||
|
options: []Option{IgnoreDiacritics},
|
||
|
pattern: "Ábc",
|
||
|
text: "Abc",
|
||
|
want: []int{0, 3},
|
||
|
}, {
|
||
|
desc: "ignore diacritics",
|
||
|
tag: "und",
|
||
|
options: []Option{IgnoreDiacritics},
|
||
|
pattern: "Abc",
|
||
|
text: "abcAbcABCÁbcábc",
|
||
|
want: []int{3, 6, 9, 13},
|
||
|
}, {
|
||
|
desc: "ignore case",
|
||
|
tag: "und",
|
||
|
options: []Option{IgnoreCase},
|
||
|
pattern: "Abc",
|
||
|
text: "abcAbcABCÁbcábc",
|
||
|
want: []int{0, 3, 3, 6, 6, 9},
|
||
|
}, {
|
||
|
desc: "ignore case and diacritics",
|
||
|
tag: "und",
|
||
|
options: []Option{IgnoreCase, IgnoreDiacritics},
|
||
|
pattern: "Abc",
|
||
|
text: "abcAbcABCÁbcábc",
|
||
|
want: []int{0, 3, 3, 6, 6, 9, 9, 13, 13, 17},
|
||
|
}, {
|
||
|
desc: "ignore width to fullwidth",
|
||
|
tag: "und",
|
||
|
options: []Option{IgnoreWidth},
|
||
|
pattern: "abc",
|
||
|
text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
|
||
|
want: []int{4, 13},
|
||
|
}, {
|
||
|
// TODO: distinguish between case and width.
|
||
|
desc: "don't ignore width to fullwidth, ignoring only case",
|
||
|
tag: "und",
|
||
|
options: []Option{IgnoreCase},
|
||
|
pattern: "abc",
|
||
|
text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
|
||
|
want: []int{4, 13},
|
||
|
}, {
|
||
|
desc: "ignore width to fullwidth and diacritics",
|
||
|
tag: "und",
|
||
|
options: []Option{IgnoreWidth, IgnoreDiacritics},
|
||
|
pattern: "abc",
|
||
|
text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
|
||
|
want: []int{4, 13},
|
||
|
}, {
|
||
|
desc: "whole grapheme, single rune",
|
||
|
tag: "und",
|
||
|
pattern: "eee",
|
||
|
text: "123 eeé 123",
|
||
|
want: nil,
|
||
|
}, {
|
||
|
// Note: rules on when to apply contractions may, for certain languages,
|
||
|
// differ between search and collation. For example, "ch" is not
|
||
|
// considered a contraction for the purpose of searching in Spanish.
|
||
|
// Therefore, be careful picking this test.
|
||
|
desc: "whole grapheme, contractions",
|
||
|
tag: "da",
|
||
|
pattern: "aba",
|
||
|
// Fails at the primary level, because "aa" is a contraction.
|
||
|
text: "123 abaa 123",
|
||
|
want: []int{},
|
||
|
}, {
|
||
|
desc: "whole grapheme, trailing modifier",
|
||
|
tag: "und",
|
||
|
pattern: "eee",
|
||
|
text: "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
|
||
|
want: nil,
|
||
|
}, {
|
||
|
// Language-specific matching.
|
||
|
|
||
|
desc: "",
|
||
|
tag: "da",
|
||
|
options: []Option{IgnoreCase},
|
||
|
pattern: "Århus",
|
||
|
text: "AarhusÅrhus Århus ",
|
||
|
want: []int{0, 6, 6, 12, 14, 20},
|
||
|
}, {
|
||
|
desc: "",
|
||
|
tag: "da",
|
||
|
options: []Option{IgnoreCase},
|
||
|
pattern: "Aarhus",
|
||
|
text: "Århus Aarhus",
|
||
|
want: []int{0, 6, 7, 13},
|
||
|
}, {
|
||
|
desc: "",
|
||
|
tag: "en", // Å does not match A for English.
|
||
|
options: []Option{IgnoreCase},
|
||
|
pattern: "Aarhus",
|
||
|
text: "Århus",
|
||
|
want: nil,
|
||
|
}, {
|
||
|
desc: "ignore modifier in text",
|
||
|
options: []Option{IgnoreDiacritics},
|
||
|
tag: "und",
|
||
|
pattern: "eee",
|
||
|
text: "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
|
||
|
want: []int{4, 9}, // Matches on grapheme boundary.
|
||
|
}, {
|
||
|
desc: "ignore multiple modifiers in text",
|
||
|
options: []Option{IgnoreDiacritics},
|
||
|
tag: "und",
|
||
|
pattern: "eee",
|
||
|
text: "123 eee\u0300\u0300 123", // U+0300: COMBINING GRAVE ACCENT
|
||
|
want: []int{4, 11}, // Matches on grapheme boundary.
|
||
|
}, {
|
||
|
desc: "ignore modifier in pattern",
|
||
|
options: []Option{IgnoreDiacritics},
|
||
|
tag: "und",
|
||
|
pattern: "eee\u0300", // U+0300: COMBINING GRAVE ACCENT
|
||
|
text: "123 eee 123",
|
||
|
want: []int{4, 7},
|
||
|
}, {
|
||
|
desc: "ignore multiple modifiers in pattern",
|
||
|
options: []Option{IgnoreDiacritics},
|
||
|
tag: "und",
|
||
|
pattern: "eee\u0300\u0300", // U+0300: COMBINING GRAVE ACCENT
|
||
|
text: "123 eee 123",
|
||
|
want: []int{4, 7},
|
||
|
}, {
|
||
|
desc: "match non-normalized pattern",
|
||
|
tag: "und",
|
||
|
// U+0300: COMBINING GRAVE ACCENT (CCC=230)
|
||
|
// U+031B: COMBINING HORN (CCC=216)
|
||
|
pattern: "eee\u0300\u031b",
|
||
|
text: "123 eee\u031b\u0300 123",
|
||
|
want: []int{4, 11},
|
||
|
}, {
|
||
|
desc: "match non-normalized text",
|
||
|
tag: "und",
|
||
|
// U+0300: COMBINING GRAVE ACCENT (CCC=230)
|
||
|
// U+031B: COMBINING HORN (CCC=216)
|
||
|
pattern: "eee\u031b\u0300",
|
||
|
text: "123 eee\u0300\u031b 123",
|
||
|
want: []int{4, 11},
|
||
|
}} {
|
||
|
m := New(language.MustParse(tc.tag), tc.options...)
|
||
|
p := m.CompileString(tc.pattern)
|
||
|
for j := 0; j < len(tc.text); {
|
||
|
start, end := p.IndexString(tc.text[j:])
|
||
|
if start == -1 && end == -1 {
|
||
|
j++
|
||
|
continue
|
||
|
}
|
||
|
start += j
|
||
|
end += j
|
||
|
j = end
|
||
|
if len(tc.want) == 0 {
|
||
|
t.Errorf("%d:%s: found unexpected result [%d %d]", i, tc.desc, start, end)
|
||
|
break
|
||
|
}
|
||
|
if tc.want[0] != start || tc.want[1] != end {
|
||
|
t.Errorf("%d:%s: got [%d %d]; want %v", i, tc.desc, start, end, tc.want[:2])
|
||
|
tc.want = tc.want[2:]
|
||
|
break
|
||
|
}
|
||
|
tc.want = tc.want[2:]
|
||
|
}
|
||
|
if len(tc.want) != 0 {
|
||
|
t.Errorf("%d:%s: %d extra results", i, tc.desc, len(tc.want)/2)
|
||
|
}
|
||
|
}
|
||
|
}
|