// stringclassifier/classifier_test.go - platform/external/licenseclassifier - Git at Google

 // Copyright 2017 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //	http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 // Select test data comes from
 // The Project Gutenberg eBook of The humour of Ireland, by D. J., (David James), (1866-1917) O'Donoghue

 package stringclassifier

 import (
 	"reflect"
 	"regexp"
 	"sort"
 	"testing"

 	"github.com/sergi/go-diff/diffmatchpatch"
 )

 var (
 	// gettysburg is a known text; the tests below register it with a
 	// classifier under the "gettysburg" key.
 	gettysburg = `Four score and seven years ago our fathers brought forth
 on this continent, a new nation, conceived in Liberty, and dedicated to the
 proposition that all men are created equal.`
 	// modifiedGettysburg rewords "a new nation" as "a nation that was new and
 	// improved" and wraps its lines at different points.
 	modifiedGettysburg = `Four score and seven years ago our fathers brought forth
 on this continent, a nation that was new and improved, conceived in Liberty, and
 dedicated to the proposition that all men are created equal.`
 	// gettysburgExtraWord is gettysburg with "Foobar" appended directly after
 	// the final period, with no separating whitespace.
 	gettysburgExtraWord = `Four score and seven years ago our fathers brought forth
 on this continent, a new nation, conceived in Liberty, and dedicated to the
 proposition that all men are created equal.Foobar`

 	// declaration is a known text; the tests below register it under the
 	// "declaration" key.
 	declaration = `When in the Course of human events, it becomes necessary
 for one people to dissolve the political bands which have connected them with
 another, and to assume among the powers of the earth, the separate and equal
 station to which the Laws of Nature and of Nature's God entitle them, a decent
 respect to the opinions of mankind requires that they should declare the causes
 which impel them to the separation.`

 	// loremipsum is a known text; the tests below register it under the
 	// "loremipsum" key.
 	loremipsum = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla
 varius enim mattis, rhoncus lectus id, aliquet sem. Phasellus eget ex in dolor
 feugiat ultricies. Etiam interdum sit amet nisl in placerat.  Sed vitae enim
 vulputate, tempus leo commodo, accumsan nulla.`
 	// modifiedLorem is loremipsum with the first "sit" dropped, "rhoncus"
 	// moved after "aliquet", and an extra "sit" inserted after "sit amet".
 	modifiedLorem = `Lorem ipsum dolor amet, consectetur adipiscing elit. Nulla
 varius enim mattis, lectus id, aliquet rhoncus  sem. Phasellus eget ex in dolor
 feugiat ultricies. Etiam interdum sit amet sit  nisl in placerat.  Sed vitae enim
 vulputate, tempus leo commodo, accumsan nulla.`
 	// lessModifiedLorem is loremipsum with only the word "sem" removed.
 	lessModifiedLorem = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla
 varius enim mattis, rhoncus lectus id, aliquet. Phasellus eget ex in dolor
 feugiat ultricies. Etiam interdum sit amet nisl in placerat.  Sed vitae enim
 vulputate, tempus leo commodo, accumsan nulla.`
 	// humourOfIreland, fellowInTheGoatSkin and oldCrowYoungCrow are passages
 	// that are never registered with a classifier; the tests concatenate them
 	// around the inputs as surrounding text.
 	humourOfIreland = `As a rule, Irish poets have not extracted a pessimistic
 philosophy from liquor; they are “elevated,” not depressed, and do not deem
 it essential to the production of a poem that its author should be a cynic or
 an evil prophet. One of the best attributes of Irish poetry is its constant
 expression of the natural emotions. Previous to the close of the
 seventeenth[xvi] century, it is said, drunkenness was not suggested by the
 poets as common in Ireland—the popularity of Bacchanalian songs since that
 date seems to prove that the vice soon became a virtue. Maginn is the
 noisiest of modern revellers, and easily roars the others down.
 `
 	fellowInTheGoatSkin = `There was a poor widow living down there near the Iron
 Forge when the country was all covered with forests, and you might walk on
 the tops of trees from Carnew to the Lady’s Island, and she had one boy. She
 was very poor, as I said before, and was not able to buy clothes for her son.
 So when she was going out she fixed him snug and combustible in the ash-pit,
 and piled the warm ashes about him. The boy knew no better, and was as happy
 as the day was long; and he was happier still when a neighbour[10] gave his
 mother a kid to keep him company when herself was abroad. The kid and the lad
 played like two may-boys; and when she was old enough to give milk, wasn’t it
 a godsend to the little family? You won’t prevent the boy from growing up
 into a young man, but not a screed of clothes had he then no more than when
 he was a gorsoon.
 `
 	oldCrowYoungCrow = `There was an old crow teaching a young crow one day, and
 he said to him, “Now, my son,” says he, “listen to the advice I’m going to
 give you. If you see a person coming near you and stooping, mind yourself,
 and be on your keeping; he’s stooping for a stone to throw at you.”

 “But tell me,” says the young crow, “what should I do if he had a stone
 already down in his pocket?”

 “Musha, go ’long out of that,” says the old crow, “you’ve learned enough; the
 devil another learning I’m able to give you.”
 `
 	// nullifiable consists almost entirely of punctuation and underscores;
 	// the "Nullifiable text" case expects MultipleMatch to return no matches
 	// for it.
 	nullifiable = `[[ , _ , _ , _
 ? _ : _
 ? _ : _
 ? _ : _
 ]
 }
 `
 	// nonWords matches runs of one or more punctuation characters.
 	nonWords = regexp.MustCompile("[[:punct:]]+")
 )

 // removeNonWords strips every run of punctuation matched by nonWords from s,
 // substituting the empty string. The tests hand it to New as an extra
 // normalizer in order to exercise tokenization problems.
 func removeNonWords(s string) string {
 	const replacement = ""
 	stripped := nonWords.ReplaceAllString(s, replacement)
 	return stripped
 }

 // TestClassify_NearestMatch registers three known texts and checks, for each
 // input, that NearestMatch reports the expected name and a confidence inside
 // the inclusive [minConf, maxConf] window.
 func TestClassify_NearestMatch(t *testing.T) {
 	c := New(DefaultConfidenceThreshold, FlattenWhitespace)
 	c.AddValue("gettysburg", gettysburg)
 	c.AddValue("declaration", declaration)
 	c.AddValue("loremipsum", loremipsum)

 	tests := []struct {
 		description string
 		input       string  // input string to match
 		name        string  // name of expected nearest match
 		minConf     float64 // the lowest confidence accepted for the match
 		maxConf     float64 // the highest confidence we expect for this match
 	}{
 		{
 			description: "Full Declaration",
 			input:       declaration,
 			name:        "declaration",
 			minConf:     1.0,
 			maxConf:     1.0,
 		},
 		{
 			description: "Modified Lorem",
 			input:       modifiedLorem,
 			name:        "loremipsum",
 			minConf:     0.90,
 			maxConf:     0.91,
 		},
 		{
 			description: "Modified Gettysburg",
 			input:       modifiedGettysburg,
 			name:        "gettysburg",
 			minConf:     0.86,
 			maxConf:     0.87,
 		},
 	}

 	for _, tt := range tests {
 		m := c.NearestMatch(tt.input)

 		if got, want := m.Name, tt.name; got != want {
 			t.Errorf("NearestMatch(%q) = %q, want %q", tt.description, got, want)
 		}
 		// Both confidence bounds report with the same wording so a failure
 		// reads the same whichever side of the window was missed.
 		if got, want := m.Confidence, tt.minConf; got < want {
 			t.Errorf("NearestMatch(%q) returned confidence %v, want minimum of %v", tt.description, got, want)
 		}
 		if got, want := m.Confidence, tt.maxConf; got > want {
 			t.Errorf("NearestMatch(%q) returned confidence %v, want maximum of %v", tt.description, got, want)
 		}
 	}
 }

 // result describes one match that a TestClassify_MultipleMatch case expects:
 // which key matched, where, and the accepted confidence window.
 type result struct {
 	key    string // key of expected nearest match
 	offset int    // offset of match in unknown string

 	// The confidence values are retrieved by simply running the classifier
 	// and noting the output. A value greater than the "max" is fine and
 	// the tests can be adjusted to account for it. A value less than "min"
 	// should be carefully scrutinized before adjusting the tests.
 	minConf float64 // the lowest confidence accepted for the match
 	maxConf float64 // the highest confidence we expect for this match
 }

 // TestClassify_MultipleMatch checks the matches MultipleMatch reports when
 // known (or modified) texts are embedded in unrelated surrounding text.
 //
 // Two classifiers are used. c holds gettysburg, declaration, loremipsum and
 // "declaration-close", which is declaration with the byte just before its
 // midpoint replaced by "_". cNormalize holds only gettysburg and additionally
 // applies removeNonWords.
 //
 // For every case the number of matches must equal len(want); each returned
 // match is then compared, in order, against the expected key, confidence
 // window and offset.
 func TestClassify_MultipleMatch(t *testing.T) {
 	c := New(DefaultConfidenceThreshold, FlattenWhitespace)
 	c.AddValue("gettysburg", gettysburg)
 	c.AddValue("declaration", declaration)
 	c.AddValue("declaration-close", declaration[:len(declaration)/2-1]+"_"+declaration[len(declaration)/2:])
 	c.AddValue("loremipsum", loremipsum)

 	cNormalize := New(DefaultConfidenceThreshold, FlattenWhitespace, removeNonWords)
 	cNormalize.AddValue("gettysburg", gettysburg)

 	tests := []struct {
 		description string
 		c           *Classifier
 		input       string // input string to match
 		want        []result
 	}{
 		{
 			description: "Exact text match",
 			c:           c,
 			input:       fellowInTheGoatSkin + declaration + humourOfIreland,
 			want: []result{
 				{key: "declaration", offset: 845, minConf: 1.0, maxConf: 1.0},
 			},
 		},
 		{
 			description: "Partial text match",
 			c:           c,
 			input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland,
 			want: []result{
 				{key: "loremipsum", offset: 845, minConf: 0.90, maxConf: 0.91},
 			},
 		},
 		{
 			description: "Two partial matches",
 			c:           c,
 			input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland + modifiedGettysburg + oldCrowYoungCrow,
 			want: []result{
 				{key: "loremipsum", offset: 845, minConf: 0.90, maxConf: 0.91},
 				{key: "gettysburg", offset: 1750, minConf: 0.86, maxConf: 0.87},
 			},
 		},
 		{
 			description: "Partial matches of similar text",
 			c:           c,
 			input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland + lessModifiedLorem + oldCrowYoungCrow,
 			want: []result{
 				{key: "loremipsum", offset: 1750, minConf: 0.98, maxConf: 0.99},
 				{key: "loremipsum", offset: 845, minConf: 0.90, maxConf: 0.91},
 			},
 		},
 		{
 			description: "Nullifiable text",
 			c:           c,
 			input:       nullifiable,
 			want:        nil,
 		},
 		{
 			description: "No match",
 			c:           c,
 			input:       fellowInTheGoatSkin + humourOfIreland,
 			want:        nil,
 		},
 		{
 			description: "Exact text match, with extra word and non-word normalizer",
 			c:           cNormalize,
 			input:       fellowInTheGoatSkin + gettysburgExtraWord + humourOfIreland,
 			want: []result{
 				{key: "gettysburg", offset: 825, minConf: 1.0, maxConf: 1.0},
 			},
 		},
 	}

 	for _, tt := range tests {
 		matches := tt.c.MultipleMatch(tt.input)
 		if len(matches) != len(tt.want) {
 			t.Errorf("MultipleMatch(%q) returned %d matches, want %d", tt.description, len(matches), len(tt.want))
 		}

 		// Compare only the pairs that exist on both sides. Indexing tt.want
 		// by every returned match would panic when the classifier reports
 		// more matches than expected; the count mismatch is already reported
 		// above.
 		n := len(matches)
 		if len(tt.want) < n {
 			n = len(tt.want)
 		}
 		for i := 0; i < n; i++ {
 			m := matches[i]
 			w := tt.want[i]
 			if got, want := m.Name, w.key; got != want {
 				t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want)
 			}
 			if got, want := m.Confidence, w.minConf; got < want {
 				t.Errorf("MultipleMatch(%q) %q = %v, want minimum of %v", tt.description, w.key, got, want)
 			}
 			if got, want := m.Confidence, w.maxConf; got > want {
 				t.Errorf("MultipleMatch(%q) %q = %v, want maximum of %v", tt.description, w.key, got, want)
 			}
 			if got, want := m.Offset, w.offset; got != want {
 				t.Errorf("MultipleMatch(%q) %q = %v, want offset of %v", tt.description, w.key, got, want)
 			}
 		}
 	}
 }

 // TestClassify_DiffRatio runs diffRatio over a table of string pairs and
 // requires the exact expected ratio for each pair.
 func TestClassify_DiffRatio(t *testing.T) {
 	type testCase struct {
 		x, y string
 		want float64
 	}
 	cases := []testCase{
 		{x: "", y: "", want: 1.0},
 		{x: "a", y: "b", want: 1.0},
 		{x: "", y: "abc", want: 0},
 		{x: "ab", y: "c", want: 0.5},
 		{x: "a", y: "bc", want: 0.5},
 		{x: "a", y: "bcde", want: 0.25},
 	}

 	for i := range cases {
 		tc := cases[i]
 		got := diffRatio(tc.x, tc.y)
 		if got == tc.want {
 			continue
 		}
 		t.Errorf("diffRatio(%q, %q) = %f, want %f", tc.x, tc.y, got, tc.want)
 	}
 }

 func TestClassify_Matches(t *testing.T) {
 	tests := []struct {
 		description string
 		matches     Matches
 		want        Matches
 	}{
 		{
 			description: "Different names, same confidences, same offset",
 			matches: Matches{
 				&Match{
 					Name:       "b",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 				&Match{
 					Name:       "a",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 			},
 			want: Matches{
 				&Match{
 					Name:       "a",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 				&Match{
 					Name:       "b",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 			},
 		},
 		{
 			description: "Same names, different confidences, same offset",
 			matches: Matches{
 				&Match{
 					Name:       "b",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 				&Match{
 					Name:       "b",
 					Confidence: 0.90,
 					Offset:     0,
 				},
 			},
 			want: Matches{
 				&Match{
 					Name:       "b",
 					Confidence: 0.90,
 					Offset:     0,
 				},
 				&Match{
 					Name:       "b",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 			},
 		},
 		{
 			description: "Same names, same confidences, different offsets",
 			matches: Matches{
 				&Match{
 					Name:       "b",
 					Confidence: 0.42,
 					Offset:     42,
 				},
 				&Match{
 					Name:       "b",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 			},
 			want: Matches{
 				&Match{
 					Name:       "b",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 				&Match{
 					Name:       "b",
 					Confidence: 0.42,
 					Offset:     42,
 				},
 			},
 		},

 		{
 			description: "Different names, different confidences, same offset",
 			matches: Matches{
 				&Match{
 					Name:       "b",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 				&Match{
 					Name:       "a",
 					Confidence: 0.90,
 					Offset:     0,
 				},
 			},
 			want: Matches{
 				&Match{
 					Name:       "a",
 					Confidence: 0.90,
 					Offset:     0,
 				},
 				&Match{
 					Name:       "b",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 			},
 		},
 		{
 			description: "Different names, same confidences, different offset",
 			matches: Matches{
 				&Match{
 					Name:       "b",
 					Confidence: 0.42,
 					Offset:     37,
 				},
 				&Match{
 					Name:       "a",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 			},
 			want: Matches{
 				&Match{
 					Name:       "a",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 				&Match{
 					Name:       "b",
 					Confidence: 0.42,
 					Offset:     37,
 				},
 			},
 		},
 		{
 			description: "Different names, different confidences, different offset",
 			matches: Matches{
 				&Match{
 					Name:       "a",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 				&Match{
 					Name:       "b",
 					Confidence: 0.90,
 					Offset:     37,
 				},
 			},
 			want: Matches{
 				&Match{
 					Name:       "b",
 					Confidence: 0.90,
 					Offset:     37,
 				},
 				&Match{
 					Name:       "a",
 					Confidence: 0.42,
 					Offset:     0,
 				},
 			},
 		},
 	}

 	for _, tt := range tests {
 		sort.Sort(tt.matches)
 		if !reflect.DeepEqual(tt.matches, tt.want) {
 			for _, x := range tt.matches {
 				t.Errorf("got: %v", x)
 			}
 			for _, x := range tt.want {
 				t.Errorf("want: %v", x)
 			}
 			t.Errorf("MatchesSort(%q) = %v, want %v", tt.description, tt.matches, tt.want)
 		}
 	}
 }

 func TestClassify_DiffRangeEnd(t *testing.T) {
 	dmp := diffmatchpatch.New()
 	tests := []struct {
 		description string
 		unknown     string
 		known       string
 		end         int
 	}{
 		{
 			description: "identical",
 			unknown:     declaration,
 			known:       declaration,
 			end:         1,
 		},
 		{
 			description: "lorem",
 			unknown:     lessModifiedLorem,
 			known:       loremipsum,
 			end:         3,
 		},
 		{
 			description: "gettysburg",
 			unknown:     modifiedGettysburg,
 			known:       gettysburg,
 			end:         19,
 		},
 	}

 	for _, tt := range tests {
 		diffs := dmp.DiffMain(tt.unknown, tt.known, true)
 		if e := diffRangeEnd(tt.known, diffs); e != tt.end {
 			t.Errorf("DiffRangeEnd(%q) = end %v, want %v", tt.description, e, tt.end)
 		}
 	}
 }

 // BenchmarkClassifier times NearestMatch on modifiedLorem against a classifier
 // holding the three known texts. Classifier setup is excluded from the timing
 // by the ResetTimer call.
 func BenchmarkClassifier(b *testing.B) {
 	classifier := New(DefaultConfidenceThreshold, FlattenWhitespace)
 	classifier.AddValue("gettysburg", gettysburg)
 	classifier.AddValue("declaration", declaration)
 	classifier.AddValue("loremipsum", loremipsum)

 	b.ResetTimer()
 	for remaining := b.N; remaining > 0; remaining-- {
 		classifier.NearestMatch(modifiedLorem)
 	}
 }
	// Copyright 2017 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	// Select test data comes from
	// The Project Gutenberg eBook of The humour of Ireland, by D. J., (David James), (1866-1917) O'Donoghue

	package stringclassifier

	import (
	"reflect"
	"regexp"
	"sort"
	"testing"

	"github.com/sergi/go-diff/diffmatchpatch"
	)

	var (
	// gettysburg is a known text; the tests below register it with a
	// classifier under the "gettysburg" key.
	gettysburg = `Four score and seven years ago our fathers brought forth
	on this continent, a new nation, conceived in Liberty, and dedicated to the
	proposition that all men are created equal.`
	// modifiedGettysburg rewords "a new nation" as "a nation that was new and
	// improved" and wraps its lines at different points.
	modifiedGettysburg = `Four score and seven years ago our fathers brought forth
	on this continent, a nation that was new and improved, conceived in Liberty, and
	dedicated to the proposition that all men are created equal.`
	// gettysburgExtraWord is gettysburg with "Foobar" appended directly after
	// the final period, with no separating whitespace.
	gettysburgExtraWord = `Four score and seven years ago our fathers brought forth
	on this continent, a new nation, conceived in Liberty, and dedicated to the
	proposition that all men are created equal.Foobar`

	// declaration is a known text; the tests below register it under the
	// "declaration" key.
	declaration = `When in the Course of human events, it becomes necessary
	for one people to dissolve the political bands which have connected them with
	another, and to assume among the powers of the earth, the separate and equal
	station to which the Laws of Nature and of Nature's God entitle them, a decent
	respect to the opinions of mankind requires that they should declare the causes
	which impel them to the separation.`

	// loremipsum is a known text; the tests below register it under the
	// "loremipsum" key.
	loremipsum = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla
	varius enim mattis, rhoncus lectus id, aliquet sem. Phasellus eget ex in dolor
	feugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim
	vulputate, tempus leo commodo, accumsan nulla.`
	// modifiedLorem is loremipsum with the first "sit" dropped, "rhoncus"
	// moved after "aliquet", and an extra "sit" inserted after "sit amet".
	modifiedLorem = `Lorem ipsum dolor amet, consectetur adipiscing elit. Nulla
	varius enim mattis, lectus id, aliquet rhoncus sem. Phasellus eget ex in dolor
	feugiat ultricies. Etiam interdum sit amet sit nisl in placerat. Sed vitae enim
	vulputate, tempus leo commodo, accumsan nulla.`
	// lessModifiedLorem is loremipsum with only the word "sem" removed.
	lessModifiedLorem = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla
	varius enim mattis, rhoncus lectus id, aliquet. Phasellus eget ex in dolor
	feugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim
	vulputate, tempus leo commodo, accumsan nulla.`
	// humourOfIreland, fellowInTheGoatSkin and oldCrowYoungCrow are passages
	// that are never registered with a classifier; the tests concatenate them
	// around the inputs as surrounding text.
	humourOfIreland = `As a rule, Irish poets have not extracted a pessimistic
	philosophy from liquor; they are “elevated,” not depressed, and do not deem
	it essential to the production of a poem that its author should be a cynic or
	an evil prophet. One of the best attributes of Irish poetry is its constant
	expression of the natural emotions. Previous to the close of the
	seventeenth[xvi] century, it is said, drunkenness was not suggested by the
	poets as common in Ireland—the popularity of Bacchanalian songs since that
	date seems to prove that the vice soon became a virtue. Maginn is the
	noisiest of modern revellers, and easily roars the others down.
	`
	fellowInTheGoatSkin = `There was a poor widow living down there near the Iron
	Forge when the country was all covered with forests, and you might walk on
	the tops of trees from Carnew to the Lady’s Island, and she had one boy. She
	was very poor, as I said before, and was not able to buy clothes for her son.
	So when she was going out she fixed him snug and combustible in the ash-pit,
	and piled the warm ashes about him. The boy knew no better, and was as happy
	as the day was long; and he was happier still when a neighbour[10] gave his
	mother a kid to keep him company when herself was abroad. The kid and the lad
	played like two may-boys; and when she was old enough to give milk, wasn’t it
	a godsend to the little family? You won’t prevent the boy from growing up
	into a young man, but not a screed of clothes had he then no more than when
	he was a gorsoon.
	`
	oldCrowYoungCrow = `There was an old crow teaching a young crow one day, and
	he said to him, “Now, my son,” says he, “listen to the advice I’m going to
	give you. If you see a person coming near you and stooping, mind yourself,
	and be on your keeping; he’s stooping for a stone to throw at you.”

	“But tell me,” says the young crow, “what should I do if he had a stone
	already down in his pocket?”

	“Musha, go ’long out of that,” says the old crow, “you’ve learned enough; the
	devil another learning I’m able to give you.”
	`
	// nullifiable consists almost entirely of punctuation and underscores;
	// the "Nullifiable text" case expects MultipleMatch to return no matches
	// for it.
	nullifiable = `[[ , _ , _ , _
	? _ : _
	? _ : _
	? _ : _
	]
	}
	`
	// nonWords matches runs of one or more punctuation characters.
	nonWords = regexp.MustCompile("[[:punct:]]+")
	)

	// removeNonWords strips every run of punctuation matched by nonWords from s,
	// substituting the empty string. The tests hand it to New as an extra
	// normalizer in order to exercise tokenization problems.
	func removeNonWords(s string) string {
		const replacement = ""
		stripped := nonWords.ReplaceAllString(s, replacement)
		return stripped
	}

	// TestClassify_NearestMatch registers three known texts and checks, for each
	// input, that NearestMatch reports the expected name and a confidence inside
	// the inclusive [minConf, maxConf] window.
	func TestClassify_NearestMatch(t *testing.T) {
		c := New(DefaultConfidenceThreshold, FlattenWhitespace)
		c.AddValue("gettysburg", gettysburg)
		c.AddValue("declaration", declaration)
		c.AddValue("loremipsum", loremipsum)

		tests := []struct {
			description string
			input       string  // input string to match
			name        string  // name of expected nearest match
			minConf     float64 // the lowest confidence accepted for the match
			maxConf     float64 // the highest confidence we expect for this match
		}{
			{
				description: "Full Declaration",
				input:       declaration,
				name:        "declaration",
				minConf:     1.0,
				maxConf:     1.0,
			},
			{
				description: "Modified Lorem",
				input:       modifiedLorem,
				name:        "loremipsum",
				minConf:     0.90,
				maxConf:     0.91,
			},
			{
				description: "Modified Gettysburg",
				input:       modifiedGettysburg,
				name:        "gettysburg",
				minConf:     0.86,
				maxConf:     0.87,
			},
		}

		for _, tt := range tests {
			m := c.NearestMatch(tt.input)

			if got, want := m.Name, tt.name; got != want {
				t.Errorf("NearestMatch(%q) = %q, want %q", tt.description, got, want)
			}
			// Both confidence bounds report with the same wording so a failure
			// reads the same whichever side of the window was missed.
			if got, want := m.Confidence, tt.minConf; got < want {
				t.Errorf("NearestMatch(%q) returned confidence %v, want minimum of %v", tt.description, got, want)
			}
			if got, want := m.Confidence, tt.maxConf; got > want {
				t.Errorf("NearestMatch(%q) returned confidence %v, want maximum of %v", tt.description, got, want)
			}
		}
	}

	// result describes one match that a TestClassify_MultipleMatch case expects:
	// which key matched, where, and the accepted confidence window.
	type result struct {
	key string // key of expected nearest match
	offset int // offset of match in unknown string

	// The confidence values are retrieved by simply running the classifier
	// and noting the output. A value greater than the "max" is fine and
	// the tests can be adjusted to account for it. A value less than "min"
	// should be carefully scrutinized before adjusting the tests.
	minConf float64 // the lowest confidence accepted for the match
	maxConf float64 // the highest confidence we expect for this match
	}

	// TestClassify_MultipleMatch checks the matches MultipleMatch reports when
	// known (or modified) texts are embedded in unrelated surrounding text.
	//
	// Two classifiers are used. c holds gettysburg, declaration, loremipsum and
	// "declaration-close", which is declaration with the byte just before its
	// midpoint replaced by "_". cNormalize holds only gettysburg and additionally
	// applies removeNonWords.
	//
	// For every case the number of matches must equal len(want); each returned
	// match is then compared, in order, against the expected key, confidence
	// window and offset.
	func TestClassify_MultipleMatch(t *testing.T) {
		c := New(DefaultConfidenceThreshold, FlattenWhitespace)
		c.AddValue("gettysburg", gettysburg)
		c.AddValue("declaration", declaration)
		c.AddValue("declaration-close", declaration[:len(declaration)/2-1]+"_"+declaration[len(declaration)/2:])
		c.AddValue("loremipsum", loremipsum)

		cNormalize := New(DefaultConfidenceThreshold, FlattenWhitespace, removeNonWords)
		cNormalize.AddValue("gettysburg", gettysburg)

		tests := []struct {
			description string
			c           *Classifier
			input       string // input string to match
			want        []result
		}{
			{
				description: "Exact text match",
				c:           c,
				input:       fellowInTheGoatSkin + declaration + humourOfIreland,
				want: []result{
					{key: "declaration", offset: 845, minConf: 1.0, maxConf: 1.0},
				},
			},
			{
				description: "Partial text match",
				c:           c,
				input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland,
				want: []result{
					{key: "loremipsum", offset: 845, minConf: 0.90, maxConf: 0.91},
				},
			},
			{
				description: "Two partial matches",
				c:           c,
				input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland + modifiedGettysburg + oldCrowYoungCrow,
				want: []result{
					{key: "loremipsum", offset: 845, minConf: 0.90, maxConf: 0.91},
					{key: "gettysburg", offset: 1750, minConf: 0.86, maxConf: 0.87},
				},
			},
			{
				description: "Partial matches of similar text",
				c:           c,
				input:       fellowInTheGoatSkin + modifiedLorem + humourOfIreland + lessModifiedLorem + oldCrowYoungCrow,
				want: []result{
					{key: "loremipsum", offset: 1750, minConf: 0.98, maxConf: 0.99},
					{key: "loremipsum", offset: 845, minConf: 0.90, maxConf: 0.91},
				},
			},
			{
				description: "Nullifiable text",
				c:           c,
				input:       nullifiable,
				want:        nil,
			},
			{
				description: "No match",
				c:           c,
				input:       fellowInTheGoatSkin + humourOfIreland,
				want:        nil,
			},
			{
				description: "Exact text match, with extra word and non-word normalizer",
				c:           cNormalize,
				input:       fellowInTheGoatSkin + gettysburgExtraWord + humourOfIreland,
				want: []result{
					{key: "gettysburg", offset: 825, minConf: 1.0, maxConf: 1.0},
				},
			},
		}

		for _, tt := range tests {
			matches := tt.c.MultipleMatch(tt.input)
			if len(matches) != len(tt.want) {
				t.Errorf("MultipleMatch(%q) returned %d matches, want %d", tt.description, len(matches), len(tt.want))
			}

			// Compare only the pairs that exist on both sides. Indexing tt.want
			// by every returned match would panic when the classifier reports
			// more matches than expected; the count mismatch is already reported
			// above.
			n := len(matches)
			if len(tt.want) < n {
				n = len(tt.want)
			}
			for i := 0; i < n; i++ {
				m := matches[i]
				w := tt.want[i]
				if got, want := m.Name, w.key; got != want {
					t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want)
				}
				if got, want := m.Confidence, w.minConf; got < want {
					t.Errorf("MultipleMatch(%q) %q = %v, want minimum of %v", tt.description, w.key, got, want)
				}
				if got, want := m.Confidence, w.maxConf; got > want {
					t.Errorf("MultipleMatch(%q) %q = %v, want maximum of %v", tt.description, w.key, got, want)
				}
				if got, want := m.Offset, w.offset; got != want {
					t.Errorf("MultipleMatch(%q) %q = %v, want offset of %v", tt.description, w.key, got, want)
				}
			}
		}
	}

	// TestClassify_DiffRatio runs diffRatio over a table of string pairs and
	// requires the exact expected ratio for each pair.
	func TestClassify_DiffRatio(t *testing.T) {
		type testCase struct {
			x, y string
			want float64
		}
		cases := []testCase{
			{x: "", y: "", want: 1.0},
			{x: "a", y: "b", want: 1.0},
			{x: "", y: "abc", want: 0},
			{x: "ab", y: "c", want: 0.5},
			{x: "a", y: "bc", want: 0.5},
			{x: "a", y: "bcde", want: 0.25},
		}

		for i := range cases {
			tc := cases[i]
			got := diffRatio(tc.x, tc.y)
			if got == tc.want {
				continue
			}
			t.Errorf("diffRatio(%q, %q) = %f, want %f", tc.x, tc.y, got, tc.want)
		}
	}

	func TestClassify_Matches(t *testing.T) {
	tests := []struct {
	description string
	matches Matches
	want Matches
	}{
	{
	description: "Different names, same confidences, same offset",
	matches: Matches{
	&Match{
	Name: "b",
	Confidence: 0.42,
	Offset: 0,
	},
	&Match{
	Name: "a",
	Confidence: 0.42,
	Offset: 0,
	},
	},
	want: Matches{
	&Match{
	Name: "a",
	Confidence: 0.42,
	Offset: 0,
	},
	&Match{
	Name: "b",
	Confidence: 0.42,
	Offset: 0,
	},
	},
	},
	{
	description: "Same names, different confidences, same offset",
	matches: Matches{
	&Match{
	Name: "b",
	Confidence: 0.42,
	Offset: 0,
	},
	&Match{
	Name: "b",
	Confidence: 0.90,
	Offset: 0,
	},
	},
	want: Matches{
	&Match{
	Name: "b",
	Confidence: 0.90,
	Offset: 0,
	},
	&Match{
	Name: "b",
	Confidence: 0.42,
	Offset: 0,
	},
	},
	},
	{
	description: "Same names, same confidences, different offsets",
	matches: Matches{
	&Match{
	Name: "b",
	Confidence: 0.42,
	Offset: 42,
	},
	&Match{
	Name: "b",
	Confidence: 0.42,
	Offset: 0,
	},
	},
	want: Matches{
	&Match{
	Name: "b",
	Confidence: 0.42,
	Offset: 0,
	},
	&Match{
	Name: "b",
	Confidence: 0.42,
	Offset: 42,
	},
	},
	},

	{
	description: "Different names, different confidences, same offset",
	matches: Matches{
	&Match{
	Name: "b",
	Confidence: 0.42,
	Offset: 0,
	},
	&Match{
	Name: "a",
	Confidence: 0.90,
	Offset: 0,
	},
	},
	want: Matches{
	&Match{
	Name: "a",
	Confidence: 0.90,
	Offset: 0,
	},
	&Match{
	Name: "b",
	Confidence: 0.42,
	Offset: 0,
	},
	},
	},
	{
	description: "Different names, same confidences, different offset",
	matches: Matches{
	&Match{
	Name: "b",
	Confidence: 0.42,
	Offset: 37,
	},
	&Match{
	Name: "a",
	Confidence: 0.42,
	Offset: 0,
	},
	},
	want: Matches{
	&Match{
	Name: "a",
	Confidence: 0.42,
	Offset: 0,
	},
	&Match{
	Name: "b",
	Confidence: 0.42,
	Offset: 37,
	},
	},
	},
	{
	description: "Different names, different confidences, different offset",
	matches: Matches{
	&Match{
	Name: "a",
	Confidence: 0.42,
	Offset: 0,
	},
	&Match{
	Name: "b",
	Confidence: 0.90,
	Offset: 37,
	},
	},
	want: Matches{
	&Match{
	Name: "b",
	Confidence: 0.90,
	Offset: 37,
	},
	&Match{
	Name: "a",
	Confidence: 0.42,
	Offset: 0,
	},
	},
	},
	}

	for _, tt := range tests {
	sort.Sort(tt.matches)
	if !reflect.DeepEqual(tt.matches, tt.want) {
	for _, x := range tt.matches {
	t.Errorf("got: %v", x)
	}
	for _, x := range tt.want {
	t.Errorf("want: %v", x)
	}
	t.Errorf("MatchesSort(%q) = %v, want %v", tt.description, tt.matches, tt.want)
	}
	}
	}

	func TestClassify_DiffRangeEnd(t *testing.T) {
	dmp := diffmatchpatch.New()
	tests := []struct {
	description string
	unknown string
	known string
	end int
	}{
	{
	description: "identical",
	unknown: declaration,
	known: declaration,
	end: 1,
	},
	{
	description: "lorem",
	unknown: lessModifiedLorem,
	known: loremipsum,
	end: 3,
	},
	{
	description: "gettysburg",
	unknown: modifiedGettysburg,
	known: gettysburg,
	end: 19,
	},
	}

	for _, tt := range tests {
	diffs := dmp.DiffMain(tt.unknown, tt.known, true)
	if e := diffRangeEnd(tt.known, diffs); e != tt.end {
	t.Errorf("DiffRangeEnd(%q) = end %v, want %v", tt.description, e, tt.end)
	}
	}
	}

	// BenchmarkClassifier times NearestMatch on modifiedLorem against a classifier
	// holding the three known texts. Classifier setup is excluded from the timing
	// by the ResetTimer call.
	func BenchmarkClassifier(b *testing.B) {
		classifier := New(DefaultConfidenceThreshold, FlattenWhitespace)
		classifier.AddValue("gettysburg", gettysburg)
		classifier.AddValue("declaration", declaration)
		classifier.AddValue("loremipsum", loremipsum)

		b.ResetTimer()
		for remaining := b.N; remaining > 0; remaining-- {
			classifier.NearestMatch(modifiedLorem)
		}
	}