blob: 9eacfaddfea003587fd4a3ce29bb995f64040c39 [file] [log] [blame] [edit]
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Select test data comes from
// The Project Gutenberg eBook of The humour of Ireland, by D. J., (David James), (1866-1917) O'Donoghue
package stringclassifier
import (
"reflect"
"regexp"
"sort"
"testing"
"github.com/sergi/go-diff/diffmatchpatch"
)
// Shared fixtures for the classifier tests in this file.
var (
// gettysburg is one of the known texts that the tests register with a
// classifier under the name "gettysburg".
gettysburg = `Four score and seven years ago our fathers brought forth
on this continent, a new nation, conceived in Liberty, and dedicated to the
proposition that all men are created equal.`
// modifiedGettysburg is gettysburg with "a new nation" reworded to
// "a nation that was new and improved" (and re-wrapped); the tests expect
// it to still match "gettysburg", at reduced confidence.
modifiedGettysburg = `Four score and seven years ago our fathers brought forth
on this continent, a nation that was new and improved, conceived in Liberty, and
dedicated to the proposition that all men are created equal.`
// gettysburgExtraWord is gettysburg with "Foobar" appended directly after
// the final period, with no separating whitespace.
gettysburgExtraWord = `Four score and seven years ago our fathers brought forth
on this continent, a new nation, conceived in Liberty, and dedicated to the
proposition that all men are created equal.Foobar`
// declaration is a known text registered under the name "declaration".
declaration = `When in the Course of human events, it becomes necessary
for one people to dissolve the political bands which have connected them with
another, and to assume among the powers of the earth, the separate and equal
station to which the Laws of Nature and of Nature's God entitle them, a decent
respect to the opinions of mankind requires that they should declare the causes
which impel them to the separation.`
// loremipsum is a known text registered under the name "loremipsum".
loremipsum = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla
varius enim mattis, rhoncus lectus id, aliquet sem. Phasellus eget ex in dolor
feugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim
vulputate, tempus leo commodo, accumsan nulla.`
// modifiedLorem is loremipsum with three edits: "sit" is dropped from the
// first sentence, "rhoncus" is moved to after "aliquet", and an extra "sit"
// is inserted after "sit amet" in the third sentence.
modifiedLorem = `Lorem ipsum dolor amet, consectetur adipiscing elit. Nulla
varius enim mattis, lectus id, aliquet rhoncus sem. Phasellus eget ex in dolor
feugiat ultricies. Etiam interdum sit amet sit nisl in placerat. Sed vitae enim
vulputate, tempus leo commodo, accumsan nulla.`
// lessModifiedLorem is loremipsum with only the word "sem" removed, so it is
// closer to the registered text than modifiedLorem is.
lessModifiedLorem = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla
varius enim mattis, rhoncus lectus id, aliquet. Phasellus eget ex in dolor
feugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim
vulputate, tempus leo commodo, accumsan nulla.`
// humourOfIreland is never registered with a classifier; the tests
// concatenate it with other text as surrounding material.
humourOfIreland = `As a rule, Irish poets have not extracted a pessimistic
philosophy from liquor; they are “elevated,” not depressed, and do not deem
it essential to the production of a poem that its author should be a cynic or
an evil prophet. One of the best attributes of Irish poetry is its constant
expression of the natural emotions. Previous to the close of the
seventeenth[xvi] century, it is said, drunkenness was not suggested by the
poets as common in Ireland—the popularity of Bacchanalian songs since that
date seems to prove that the vice soon became a virtue. Maginn is the
noisiest of modern revellers, and easily roars the others down.
`
// fellowInTheGoatSkin is never registered with a classifier; the tests use
// it as the leading text that precedes the passage expected to match.
fellowInTheGoatSkin = `There was a poor widow living down there near the Iron
Forge when the country was all covered with forests, and you might walk on
the tops of trees from Carnew to the Lady’s Island, and she had one boy. She
was very poor, as I said before, and was not able to buy clothes for her son.
So when she was going out she fixed him snug and combustible in the ash-pit,
and piled the warm ashes about him. The boy knew no better, and was as happy
as the day was long; and he was happier still when a neighbour[10] gave his
mother a kid to keep him company when herself was abroad. The kid and the lad
played like two may-boys; and when she was old enough to give milk, wasn’t it
a godsend to the little family? You won’t prevent the boy from growing up
into a young man, but not a screed of clothes had he then no more than when
he was a gorsoon.
`
// oldCrowYoungCrow is never registered with a classifier; the tests append
// it as trailing text after the passages expected to match.
oldCrowYoungCrow = `There was an old crow teaching a young crow one day, and
he said to him, “Now, my son,” says he, “listen to the advice I’m going to
give you. If you see a person coming near you and stooping, mind yourself,
and be on your keeping; he’s stooping for a stone to throw at you.”
“But tell me,” says the young crow, “what should I do if he had a stone
already down in his pocket?”
“Musha, go ’long out of that,” says the old crow, “you’ve learned enough; the
devil another learning I’m able to give you.”
`
// nullifiable consists almost entirely of punctuation and whitespace. The
// "Nullifiable text" case in TestClassify_MultipleMatch expects no matches
// for it.
nullifiable = `[[ , _ , _ , _
? _ : _
? _ : _
? _ : _
]
}
`
// nonWords matches one or more consecutive POSIX punctuation characters.
nonWords = regexp.MustCompile("[[:punct:]]+")
)
// removeNonWords returns s with every run of characters matched by nonWords
// deleted (each run is replaced by the empty string). The tests pass it to New
// as an extra normalizer in order to exercise tokenization problems.
func removeNonWords(s string) string {
	const replacement = ""
	stripped := nonWords.ReplaceAllString(s, replacement)
	return stripped
}
// TestClassify_NearestMatch registers three known texts with a classifier and
// checks, for each input, that NearestMatch reports the expected name with a
// confidence inside the [minConf, maxConf] window recorded for that input.
func TestClassify_NearestMatch(t *testing.T) {
	c := New(DefaultConfidenceThreshold, FlattenWhitespace)
	c.AddValue("gettysburg", gettysburg)
	c.AddValue("declaration", declaration)
	c.AddValue("loremipsum", loremipsum)

	tests := []struct {
		description string
		input       string  // input string to match
		name        string  // name of expected nearest match
		minConf     float64 // the lowest confidence accepted for the match
		maxConf     float64 // the highest confidence we expect for this match
	}{
		{
			description: "Full Declaration",
			input:       declaration,
			name:        "declaration",
			minConf:     1.0,
			maxConf:     1.0,
		},
		{
			description: "Modified Lorem",
			input:       modifiedLorem,
			name:        "loremipsum",
			minConf:     0.90,
			maxConf:     0.91,
		},
		{
			description: "Modified Gettysburg",
			input:       modifiedGettysburg,
			name:        "gettysburg",
			minConf:     0.86,
			maxConf:     0.87,
		},
	}

	for _, tt := range tests {
		m := c.NearestMatch(tt.input)

		if got, want := m.Name, tt.name; got != want {
			t.Errorf("NearestMatch(%q) = %q, want %q", tt.description, got, want)
		}
		// The confidence is checked against both ends of the window so that
		// an unexpected drop and an unexpected rise are both reported.
		if got, want := m.Confidence, tt.minConf; got < want {
			t.Errorf("NearestMatch(%q) returned confidence %v, want minimum of %v", tt.description, got, want)
		}
		if got, want := m.Confidence, tt.maxConf; got > want {
			t.Errorf("NearestMatch(%q) returned confidence %v, want maximum of %v", tt.description, got, want)
		}
	}
}
// result describes one match that a MultipleMatch test case expects.
type result struct {
	key    string // key of the expected match; compared against Match.Name
	offset int    // offset of the match in the unknown string

	// minConf is the lowest confidence accepted for the match and maxConf
	// is the highest confidence expected for it.
	//
	// Both values were obtained by simply running the classifier and noting
	// its output. A confidence above maxConf is fine, and the test can be
	// adjusted to account for it. A confidence below minConf should be
	// carefully scrutinized before the test is adjusted.
	minConf, maxConf float64
}
// TestClassify_MultipleMatch feeds inputs made of several concatenated
// passages to MultipleMatch and checks the name, offset and confidence window
// of every match that comes back, in order.
//
// Two classifiers are used. c knows "gettysburg", "declaration", "loremipsum"
// and "declaration-close", where the latter is declaration with the single
// byte just before its midpoint replaced by "_". cNormalize knows only
// "gettysburg" and additionally runs removeNonWords as a normalizer.
func TestClassify_MultipleMatch(t *testing.T) {
	c := New(DefaultConfidenceThreshold, FlattenWhitespace)
	c.AddValue("gettysburg", gettysburg)
	c.AddValue("declaration", declaration)
	c.AddValue("declaration-close", declaration[:len(declaration)/2-1]+"_"+declaration[len(declaration)/2:])
	c.AddValue("loremipsum", loremipsum)

	cNormalize := New(DefaultConfidenceThreshold, FlattenWhitespace, removeNonWords)
	cNormalize.AddValue("gettysburg", gettysburg)

	tests := []struct {
		description string
		c           *Classifier
		input       string // input string to match
		want        []result
	}{
		{
			description: "Exact text match",
			c:           c,
			input:       fellowInTheGoatSkin + declaration + humourOfIreland,
			want: []result{
				{
					key:     "declaration",
					offset:  845,
					minConf: 1.0,
					maxConf: 1.0,
				},
			},
		},
		{
			description: "Partial text match",
			c:           c,
			input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland,
			want: []result{
				{
					key:     "loremipsum",
					offset:  845,
					minConf: 0.90,
					maxConf: 0.91,
				},
			},
		},
		{
			description: "Two partial matches",
			c:           c,
			input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland + modifiedGettysburg + oldCrowYoungCrow,
			want: []result{
				{
					key:     "loremipsum",
					offset:  845,
					minConf: 0.90,
					maxConf: 0.91,
				},
				{
					key:     "gettysburg",
					offset:  1750,
					minConf: 0.86,
					maxConf: 0.87,
				},
			},
		},
		{
			description: "Partial matches of similar text",
			c:           c,
			input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland + lessModifiedLorem + oldCrowYoungCrow,
			want: []result{
				{
					key:     "loremipsum",
					offset:  1750,
					minConf: 0.98,
					maxConf: 0.99,
				},
				{
					key:     "loremipsum",
					offset:  845,
					minConf: 0.90,
					maxConf: 0.91,
				},
			},
		},
		{
			description: "Nullifiable text",
			c:           c,
			input:       nullifiable,
			want:        nil,
		},
		{
			description: "No match",
			c:           c,
			input:       fellowInTheGoatSkin + humourOfIreland,
			want:        nil,
		},
		{
			description: "Exact text match, with extra word and non-word normalizer",
			c:           cNormalize,
			input:       fellowInTheGoatSkin + gettysburgExtraWord + humourOfIreland,
			want: []result{
				{
					key:     "gettysburg",
					offset:  825,
					minConf: 1.0,
					maxConf: 1.0,
				},
			},
		},
	}

	for _, tt := range tests {
		matches := tt.c.MultipleMatch(tt.input)
		if len(matches) != len(tt.want) {
			t.Errorf("MultipleMatch(%q) returned %d matches, want %d", tt.description, len(matches), len(tt.want))
		}

		// Only the entries present on both sides can be compared. Bounding
		// the loop by the shorter length keeps a surplus of returned matches
		// from indexing past the end of tt.want and panicking, which would
		// abort every remaining case instead of reporting a failure.
		n := len(matches)
		if len(tt.want) < n {
			n = len(tt.want)
		}
		for i := 0; i < n; i++ {
			m := matches[i]
			w := tt.want[i]
			if got, want := m.Name, w.key; got != want {
				t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want)
			}
			if got, want := m.Confidence, w.minConf; got < want {
				t.Errorf("MultipleMatch(%q) %q = %v, want minimum of %v", tt.description, w.key, got, want)
			}
			if got, want := m.Confidence, w.maxConf; got > want {
				t.Errorf("MultipleMatch(%q) %q = %v, want maximum of %v", tt.description, w.key, got, want)
			}
			if got, want := m.Offset, w.offset; got != want {
				t.Errorf("MultipleMatch(%q) %q = %v, want offset of %v", tt.description, w.key, got, want)
			}
		}
	}
}
// TestClassify_DiffRatio pins the value diffRatio returns for a handful of
// short string pairs, including the case where both strings are empty.
func TestClassify_DiffRatio(t *testing.T) {
	type ratioCase struct {
		x, y string
		want float64
	}

	cases := []ratioCase{
		{x: "", y: "", want: 1.0},
		{x: "a", y: "b", want: 1.0},
		{x: "", y: "abc", want: 0},
		{x: "ab", y: "c", want: 0.5},
		{x: "a", y: "bc", want: 0.5},
		{x: "a", y: "bcde", want: 0.25},
	}

	for i := range cases {
		tc := cases[i]
		got := diffRatio(tc.x, tc.y)
		if got == tc.want {
			continue
		}
		t.Errorf("diffRatio(%q, %q) = %f, want %f", tc.x, tc.y, got, tc.want)
	}
}
func TestClassify_Matches(t *testing.T) {
tests := []struct {
description string
matches Matches
want Matches
}{
{
description: "Different names, same confidences, same offset",
matches: Matches{
&Match{
Name: "b",
Confidence: 0.42,
Offset: 0,
},
&Match{
Name: "a",
Confidence: 0.42,
Offset: 0,
},
},
want: Matches{
&Match{
Name: "a",
Confidence: 0.42,
Offset: 0,
},
&Match{
Name: "b",
Confidence: 0.42,
Offset: 0,
},
},
},
{
description: "Same names, different confidences, same offset",
matches: Matches{
&Match{
Name: "b",
Confidence: 0.42,
Offset: 0,
},
&Match{
Name: "b",
Confidence: 0.90,
Offset: 0,
},
},
want: Matches{
&Match{
Name: "b",
Confidence: 0.90,
Offset: 0,
},
&Match{
Name: "b",
Confidence: 0.42,
Offset: 0,
},
},
},
{
description: "Same names, same confidences, different offsets",
matches: Matches{
&Match{
Name: "b",
Confidence: 0.42,
Offset: 42,
},
&Match{
Name: "b",
Confidence: 0.42,
Offset: 0,
},
},
want: Matches{
&Match{
Name: "b",
Confidence: 0.42,
Offset: 0,
},
&Match{
Name: "b",
Confidence: 0.42,
Offset: 42,
},
},
},
{
description: "Different names, different confidences, same offset",
matches: Matches{
&Match{
Name: "b",
Confidence: 0.42,
Offset: 0,
},
&Match{
Name: "a",
Confidence: 0.90,
Offset: 0,
},
},
want: Matches{
&Match{
Name: "a",
Confidence: 0.90,
Offset: 0,
},
&Match{
Name: "b",
Confidence: 0.42,
Offset: 0,
},
},
},
{
description: "Different names, same confidences, different offset",
matches: Matches{
&Match{
Name: "b",
Confidence: 0.42,
Offset: 37,
},
&Match{
Name: "a",
Confidence: 0.42,
Offset: 0,
},
},
want: Matches{
&Match{
Name: "a",
Confidence: 0.42,
Offset: 0,
},
&Match{
Name: "b",
Confidence: 0.42,
Offset: 37,
},
},
},
{
description: "Different names, different confidences, different offset",
matches: Matches{
&Match{
Name: "a",
Confidence: 0.42,
Offset: 0,
},
&Match{
Name: "b",
Confidence: 0.90,
Offset: 37,
},
},
want: Matches{
&Match{
Name: "b",
Confidence: 0.90,
Offset: 37,
},
&Match{
Name: "a",
Confidence: 0.42,
Offset: 0,
},
},
},
}
for _, tt := range tests {
sort.Sort(tt.matches)
if !reflect.DeepEqual(tt.matches, tt.want) {
for _, x := range tt.matches {
t.Errorf("got: %v", x)
}
for _, x := range tt.want {
t.Errorf("want: %v", x)
}
t.Errorf("MatchesSort(%q) = %v, want %v", tt.description, tt.matches, tt.want)
}
}
}
func TestClassify_DiffRangeEnd(t *testing.T) {
dmp := diffmatchpatch.New()
tests := []struct {
description string
unknown string
known string
end int
}{
{
description: "identical",
unknown: declaration,
known: declaration,
end: 1,
},
{
description: "lorem",
unknown: lessModifiedLorem,
known: loremipsum,
end: 3,
},
{
description: "gettysburg",
unknown: modifiedGettysburg,
known: gettysburg,
end: 19,
},
}
for _, tt := range tests {
diffs := dmp.DiffMain(tt.unknown, tt.known, true)
if e := diffRangeEnd(tt.known, diffs); e != tt.end {
t.Errorf("DiffRangeEnd(%q) = end %v, want %v", tt.description, e, tt.end)
}
}
}
// BenchmarkClassifier times NearestMatch on modifiedLorem against a classifier
// holding the three known texts. The classifier is built before the timer is
// reset, so only the lookups are measured.
func BenchmarkClassifier(b *testing.B) {
	classifier := New(DefaultConfidenceThreshold, FlattenWhitespace)

	// Registered in a fixed order: gettysburg, declaration, loremipsum.
	known := []struct{ name, text string }{
		{"gettysburg", gettysburg},
		{"declaration", declaration},
		{"loremipsum", loremipsum},
	}
	for _, k := range known {
		classifier.AddValue(k.name, k.text)
	}

	b.ResetTimer()
	for remaining := b.N; remaining > 0; remaining-- {
		classifier.NearestMatch(modifiedLorem)
	}
}