blob: c54c0739788deaf24e0e487cd62d3102b1e4a0eb [file] [log] [blame]
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package searchset
import (
"reflect"
"testing"
"github.com/google/licenseclassifier/stringclassifier/searchset/tokenizer"
)
const (
shortPostmodernThesis = "In the works of Joyce, a predominant concept is the concept of semioticist"
postmodernThesis = `1. Joyce and neotextual modernist theory
In the works of Joyce, a predominant concept is the concept of semioticist
culture. The without/within distinction intrinsic to Finnegan's Wake emerges
again in Ulysses, although in a more neomodern sense.
`
)
func TestSearchSet_New(t *testing.T) {
tests := []struct {
description string
text string
granularity int
want *SearchSet
}{
{
description: "Empty string",
text: "",
granularity: DefaultGranularity,
want: &SearchSet{
Tokens: nil,
Hashes: make(tokenizer.Hash),
Checksums: nil,
ChecksumRanges: nil,
},
},
{
description: "Small string",
text: "Hello world",
granularity: 4,
want: &SearchSet{
Tokens: tokenizer.Tokens{
{Text: "Hello", Offset: 0},
{Text: "world", Offset: 6},
},
Hashes: tokenizer.Hash{2346098258: tokenizer.TokenRanges{{Start: 0, End: 2}}},
Checksums: []uint32{2346098258},
ChecksumRanges: tokenizer.TokenRanges{{Start: 0, End: 2}},
nodes: []*node{{2346098258, &tokenizer.TokenRange{Start: 0, End: 2}}},
},
},
}
for _, tt := range tests {
if got := New(tt.text, tt.granularity); !reflect.DeepEqual(got, tt.want) {
t.Errorf("New(%q) = %+v, want %+v", tt.description, got, tt.want)
}
}
}
func TestSearchSet_NodeConstruction(t *testing.T) {
s := New(shortPostmodernThesis, DefaultGranularity)
want := []string{
"[0:3]", "[1:4]", "[2:5]", "[3:6]", "[4:7]", "[5:8]", "[6:9]",
"[7:10]", "[8:11]", "[9:12]", "[10:13]", "[11:14]",
}
if len(s.nodes) != len(want) {
t.Errorf("Number of nodes %v, want %v", len(s.nodes), len(want))
return
}
for i := 0; i < len(s.nodes); i++ {
if got := s.nodes[i].String(); got != want[i] {
t.Errorf("Nodes = got:\n%s\nwant:\n%s", got, want[i])
}
}
}
func TestSearchSet_CoalesceTokenRanges(t *testing.T) {
tests := []struct {
description string
mr MatchRanges
want MatchRanges
}{
{
description: "Non-overlapping Ranges",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 27, TargetStart: 0, TargetEnd: 27},
{SrcStart: 37, SrcEnd: 927, TargetStart: 37, TargetEnd: 927},
},
want: MatchRanges{
{SrcStart: 0, SrcEnd: 27, TargetStart: 0, TargetEnd: 27},
{SrcStart: 37, SrcEnd: 927, TargetStart: 37, TargetEnd: 927},
},
},
{
description: "Identical Ranges",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 37, TargetStart: 0, TargetEnd: 37},
{SrcStart: 0, SrcEnd: 37, TargetStart: 0, TargetEnd: 37},
},
want: MatchRanges{{SrcStart: 0, SrcEnd: 37, TargetStart: 0, TargetEnd: 37}},
},
{
description: "Sequential Ranges",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 37, TargetStart: 0, TargetEnd: 37},
{SrcStart: 37, SrcEnd: 927, TargetStart: 37, TargetEnd: 927},
},
want: MatchRanges{{SrcStart: 0, SrcEnd: 927, TargetStart: 0, TargetEnd: 927}},
},
{
description: "Overlapping Ranges - Same Start",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 37, TargetStart: 0, TargetEnd: 37},
{SrcStart: 0, SrcEnd: 927, TargetStart: 0, TargetEnd: 927},
},
want: MatchRanges{{SrcStart: 0, SrcEnd: 927, TargetStart: 0, TargetEnd: 927}},
},
{
description: "Overlapping Ranges - Different Start",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 37, TargetStart: 0, TargetEnd: 37},
{SrcStart: 27, SrcEnd: 927, TargetStart: 27, TargetEnd: 927},
},
want: MatchRanges{{SrcStart: 0, SrcEnd: 927, TargetStart: 0, TargetEnd: 927}},
},
{
description: "Overlapping Ranges - Same End",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 37, TargetStart: 0, TargetEnd: 37},
{SrcStart: 27, SrcEnd: 37, TargetStart: 27, TargetEnd: 37},
},
want: MatchRanges{{SrcStart: 0, SrcEnd: 37, TargetStart: 0, TargetEnd: 37}},
},
{
description: "Completely Overlapping Ranges",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 42, TargetStart: 0, TargetEnd: 42},
{SrcStart: 27, SrcEnd: 37, TargetStart: 27, TargetEnd: 37},
},
want: MatchRanges{{SrcStart: 0, SrcEnd: 42, TargetStart: 0, TargetEnd: 42}},
},
}
for _, tt := range tests {
got := coalesceMatchRanges(tt.mr)
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("CoalesceTokenRanges(%q) = %+v, want %+v", tt.description, got, tt.want)
}
}
}
func TestSearchSet_FindPotentialMatches(t *testing.T) {
known := New(postmodernThesis, DefaultGranularity)
size := len(postmodernThesis)
modified := "hello world "
modified += postmodernThesis[:size/3] + " hello world "
modified += postmodernThesis[size/3 : 2*size/3-4]
modified += postmodernThesis[2*size/3+7:]
unknown := New(modified, DefaultGranularity)
want := []MatchRanges{{
{SrcStart: 0, SrcEnd: 15, TargetStart: 2, TargetEnd: 17},
{SrcStart: 16, SrcEnd: 28, TargetStart: 21, TargetEnd: 33},
{SrcStart: 31, SrcEnd: 46, TargetStart: 34, TargetEnd: 49},
}}
got := FindPotentialMatches(known, unknown)
if len(got) != len(want) {
t.Errorf("Number of matches %v, want %v", len(got), len(want))
}
if !reflect.DeepEqual(got, want) {
t.Errorf("Offsets = %+v, want %+v", got, want)
}
known = New(`again in Ulysses, although in a more neomodern sense.
culture. The without/within distinction intrinsic to Finnegan's Wake emerges
`, DefaultGranularity)
want = []MatchRanges{
{
{SrcStart: 11, SrcEnd: 18, TargetStart: 26, TargetEnd: 33},
{SrcStart: 21, SrcEnd: 25, TargetStart: 34, TargetEnd: 38},
},
{{SrcStart: 0, SrcEnd: 11, TargetStart: 38, TargetEnd: 49}},
}
got = FindPotentialMatches(known, unknown)
if len(got) != len(want) {
t.Errorf("Number of matches %v, want %v", len(got), len(want))
}
if !reflect.DeepEqual(got, want) {
t.Errorf("Offsets = %+v, want %+v", got, want)
}
}
func TestSearchSet_GetMatchedRanges(t *testing.T) {
const (
source = "a b c d e f g c d e h i j"
target = "a b c _ _ c d e _ f g h _ c d e _ h i j"
)
src := New(source, DefaultGranularity)
tar := New(target, DefaultGranularity)
want := []MatchRanges{
{
{SrcStart: 0, SrcEnd: 3, TargetStart: 0, TargetEnd: 3},
{SrcStart: 2, SrcEnd: 5, TargetStart: 5, TargetEnd: 8},
{SrcStart: 7, SrcEnd: 10, TargetStart: 13, TargetEnd: 16},
{SrcStart: 10, SrcEnd: 13, TargetStart: 17, TargetEnd: 20},
},
}
got := getMatchedRanges(src, tar)
if len(got) != len(want) {
t.Errorf("Number of matches %v, want %v", len(got), len(want))
}
if !reflect.DeepEqual(got, want) {
t.Errorf("Match Ranges = %+v, want %+v", got, want)
}
}
func TestSearchSet_TargetMatchedRanges(t *testing.T) {
const (
source = "a b c d e f g c d e h i j"
target = "a b c d e _ _ c d e _ f g h _ c d e _ h i j"
)
src := New(source, DefaultGranularity)
tar := New(target, DefaultGranularity)
want := MatchRanges{
{SrcStart: 0, SrcEnd: 3, TargetStart: 0, TargetEnd: 3},
{SrcStart: 1, SrcEnd: 4, TargetStart: 1, TargetEnd: 4},
{SrcStart: 2, SrcEnd: 5, TargetStart: 2, TargetEnd: 5},
{SrcStart: 7, SrcEnd: 10, TargetStart: 2, TargetEnd: 5},
{SrcStart: 2, SrcEnd: 5, TargetStart: 7, TargetEnd: 10},
{SrcStart: 7, SrcEnd: 10, TargetStart: 7, TargetEnd: 10},
{SrcStart: 2, SrcEnd: 5, TargetStart: 15, TargetEnd: 18},
{SrcStart: 7, SrcEnd: 10, TargetStart: 15, TargetEnd: 18},
{SrcStart: 10, SrcEnd: 13, TargetStart: 19, TargetEnd: 22},
}
got := targetMatchedRanges(src, tar)
if len(got) != len(want) {
t.Errorf("Number of matches %v, want %v", len(got), len(want))
}
if !reflect.DeepEqual(got, want) {
t.Errorf("Match Range = %+v, want %+v", got, want)
}
}
func TestSearchSet_UntangleSourceRanges(t *testing.T) {
tests := []struct {
description string
mr MatchRanges
want MatchRanges
}{
{
description: "Single Match - In Order",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 3, TargetStart: 10, TargetEnd: 13},
{SrcStart: 5, SrcEnd: 10, TargetStart: 14, TargetEnd: 19},
{SrcStart: 6, SrcEnd: 11, TargetStart: 15, TargetEnd: 20},
},
want: MatchRanges{
{SrcStart: 0, SrcEnd: 3, TargetStart: 10, TargetEnd: 13},
{SrcStart: 5, SrcEnd: 10, TargetStart: 14, TargetEnd: 19},
{SrcStart: 6, SrcEnd: 11, TargetStart: 15, TargetEnd: 20},
},
},
{
description: "Single Match - Out of Order",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 3, TargetStart: 10, TargetEnd: 13},
{SrcStart: 5, SrcEnd: 10, TargetStart: 14, TargetEnd: 19},
{SrcStart: 15, SrcEnd: 20, TargetStart: 14, TargetEnd: 19},
{SrcStart: 6, SrcEnd: 11, TargetStart: 15, TargetEnd: 20},
{SrcStart: 5, SrcEnd: 10, TargetStart: 24, TargetEnd: 19},
{SrcStart: 15, SrcEnd: 20, TargetStart: 24, TargetEnd: 19},
{SrcStart: 23, SrcEnd: 29, TargetStart: 30, TargetEnd: 37},
},
want: MatchRanges{
{SrcStart: 0, SrcEnd: 3, TargetStart: 10, TargetEnd: 13},
{SrcStart: 5, SrcEnd: 10, TargetStart: 14, TargetEnd: 19},
{SrcStart: 6, SrcEnd: 11, TargetStart: 15, TargetEnd: 20},
{SrcStart: 15, SrcEnd: 20, TargetStart: 24, TargetEnd: 19},
{SrcStart: 23, SrcEnd: 29, TargetStart: 30, TargetEnd: 37},
},
},
{
description: "Multiple Match - In Order",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 3, TargetStart: 10, TargetEnd: 13},
{SrcStart: 5, SrcEnd: 10, TargetStart: 14, TargetEnd: 19},
{SrcStart: 6, SrcEnd: 11, TargetStart: 15, TargetEnd: 20},
{SrcStart: 0, SrcEnd: 3, TargetStart: 110, TargetEnd: 113},
{SrcStart: 5, SrcEnd: 10, TargetStart: 114, TargetEnd: 119},
{SrcStart: 6, SrcEnd: 11, TargetStart: 115, TargetEnd: 120},
},
want: MatchRanges{
{SrcStart: 0, SrcEnd: 3, TargetStart: 10, TargetEnd: 13},
{SrcStart: 5, SrcEnd: 10, TargetStart: 14, TargetEnd: 19},
{SrcStart: 6, SrcEnd: 11, TargetStart: 15, TargetEnd: 20},
{SrcStart: 0, SrcEnd: 3, TargetStart: 110, TargetEnd: 113},
{SrcStart: 5, SrcEnd: 10, TargetStart: 114, TargetEnd: 119},
{SrcStart: 6, SrcEnd: 11, TargetStart: 115, TargetEnd: 120},
},
},
{
description: "Multiple Match - Out of Order",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 3, TargetStart: 10, TargetEnd: 13},
{SrcStart: 5, SrcEnd: 10, TargetStart: 14, TargetEnd: 19},
{SrcStart: 15, SrcEnd: 20, TargetStart: 14, TargetEnd: 19},
{SrcStart: 6, SrcEnd: 11, TargetStart: 15, TargetEnd: 20},
{SrcStart: 5, SrcEnd: 10, TargetStart: 24, TargetEnd: 19},
{SrcStart: 15, SrcEnd: 20, TargetStart: 24, TargetEnd: 19},
{SrcStart: 23, SrcEnd: 29, TargetStart: 30, TargetEnd: 37},
{SrcStart: 0, SrcEnd: 3, TargetStart: 110, TargetEnd: 113},
{SrcStart: 5, SrcEnd: 10, TargetStart: 114, TargetEnd: 119},
{SrcStart: 15, SrcEnd: 20, TargetStart: 114, TargetEnd: 119},
{SrcStart: 6, SrcEnd: 11, TargetStart: 115, TargetEnd: 120},
{SrcStart: 5, SrcEnd: 10, TargetStart: 124, TargetEnd: 119},
{SrcStart: 15, SrcEnd: 20, TargetStart: 124, TargetEnd: 119},
{SrcStart: 23, SrcEnd: 29, TargetStart: 130, TargetEnd: 137},
},
want: MatchRanges{
{SrcStart: 0, SrcEnd: 3, TargetStart: 10, TargetEnd: 13},
{SrcStart: 5, SrcEnd: 10, TargetStart: 14, TargetEnd: 19},
{SrcStart: 6, SrcEnd: 11, TargetStart: 15, TargetEnd: 20},
{SrcStart: 15, SrcEnd: 20, TargetStart: 24, TargetEnd: 19},
{SrcStart: 23, SrcEnd: 29, TargetStart: 30, TargetEnd: 37},
{SrcStart: 0, SrcEnd: 3, TargetStart: 110, TargetEnd: 113},
{SrcStart: 5, SrcEnd: 10, TargetStart: 114, TargetEnd: 119},
{SrcStart: 6, SrcEnd: 11, TargetStart: 115, TargetEnd: 120},
{SrcStart: 15, SrcEnd: 20, TargetStart: 124, TargetEnd: 119},
{SrcStart: 23, SrcEnd: 29, TargetStart: 130, TargetEnd: 137},
},
},
}
for _, tt := range tests {
got := untangleSourceRanges(tt.mr)
if len(got) != len(tt.want) {
t.Errorf("Number of matches %v, want %v", len(got), len(tt.want))
}
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("UntangleSourceRanges(%q) = %v, want %v", tt.description, got, tt.want)
}
}
}
func TestSearchSet_SplitRanges(t *testing.T) {
tests := []struct {
description string
mr MatchRanges
want []MatchRanges
}{
{
description: "Single Match Range",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
want: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
},
},
{
description: "Two Match Ranges",
mr: MatchRanges{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
{SrcStart: 3, SrcEnd: 10, TargetStart: 108, TargetEnd: 115},
{SrcStart: 23, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
want: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
{
{SrcStart: 3, SrcEnd: 10, TargetStart: 108, TargetEnd: 115},
{SrcStart: 23, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
},
},
}
for _, tt := range tests {
got := splitRanges(tt.mr)
if len(got) != len(tt.want) {
t.Errorf("Number of matches %v, want %v", len(got), len(tt.want))
}
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("SplitRanges(%q) = %v, want %v", tt.description, got, tt.want)
}
}
}
func TestSearchSet_MergeConsecutiveRanges(t *testing.T) {
tests := []struct {
description string
mr []MatchRanges
want []MatchRanges
}{
{
description: "No Overlap",
mr: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
{
{SrcStart: 3, SrcEnd: 10, TargetStart: 108, TargetEnd: 115},
{SrcStart: 23, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
},
want: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
{
{SrcStart: 3, SrcEnd: 10, TargetStart: 108, TargetEnd: 115},
{SrcStart: 23, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
},
},
{
description: "Consecutive Ranges No Overlap",
mr: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
{
{SrcStart: 3, SrcEnd: 10, TargetStart: 35, TargetEnd: 41},
{SrcStart: 23, SrcEnd: 30, TargetStart: 125, TargetEnd: 135},
},
},
want: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
{
{SrcStart: 3, SrcEnd: 10, TargetStart: 35, TargetEnd: 41},
{SrcStart: 23, SrcEnd: 30, TargetStart: 125, TargetEnd: 135},
},
},
},
{
description: "Consecutive Ranges with First Element Overlap",
mr: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
{
{SrcStart: 3, SrcEnd: 10, TargetStart: 34, TargetEnd: 41},
{SrcStart: 33, SrcEnd: 40, TargetStart: 35, TargetEnd: 42},
},
},
want: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 36, TargetStart: 25, TargetEnd: 41},
{SrcStart: 33, SrcEnd: 40, TargetStart: 35, TargetEnd: 42},
},
},
},
{
description: "Consecutive Ranges with Overlap",
mr: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
},
{
{SrcStart: 3, SrcEnd: 10, TargetStart: 34, TargetEnd: 41},
{SrcStart: 33, SrcEnd: 40, TargetStart: 45, TargetEnd: 52},
},
},
want: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 36, TargetStart: 25, TargetEnd: 41},
{SrcStart: 33, SrcEnd: 40, TargetStart: 45, TargetEnd: 52},
},
},
},
{
description: "Consecutive Ranges with Previous Deep Overlap",
mr: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
{SrcStart: 120, SrcEnd: 130, TargetStart: 37, TargetEnd: 47},
{SrcStart: 122, SrcEnd: 132, TargetStart: 39, TargetEnd: 49},
},
{
{SrcStart: 3, SrcEnd: 10, TargetStart: 34, TargetEnd: 41},
{SrcStart: 33, SrcEnd: 40, TargetStart: 45, TargetEnd: 52},
},
},
want: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 36, TargetStart: 25, TargetEnd: 41},
{SrcStart: 33, SrcEnd: 40, TargetStart: 45, TargetEnd: 52},
},
},
},
{
description: "Consecutive Ranges with Deep Overlap",
mr: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
{SrcStart: 24, SrcEnd: 34, TargetStart: 29, TargetEnd: 39},
{SrcStart: 120, SrcEnd: 130, TargetStart: 37, TargetEnd: 47},
{SrcStart: 122, SrcEnd: 132, TargetStart: 39, TargetEnd: 49},
},
{
{SrcStart: 3, SrcEnd: 10, TargetStart: 26, TargetEnd: 33},
{SrcStart: 5, SrcEnd: 12, TargetStart: 28, TargetEnd: 35},
{SrcStart: 25, SrcEnd: 35, TargetStart: 31, TargetEnd: 41},
},
},
want: []MatchRanges{
{
{SrcStart: 0, SrcEnd: 10, TargetStart: 5, TargetEnd: 15},
{SrcStart: 20, SrcEnd: 30, TargetStart: 25, TargetEnd: 35},
{SrcStart: 24, SrcEnd: 34, TargetStart: 29, TargetEnd: 39},
{SrcStart: 25, SrcEnd: 35, TargetStart: 31, TargetEnd: 41},
},
},
},
}
for _, tt := range tests {
got := mergeConsecutiveRanges(tt.mr)
if len(got) != len(tt.want) {
t.Errorf("Number of matches %v, want %v", len(got), len(tt.want))
}
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("SplitRanges(%q) = %v, want %v", tt.description, got, tt.want)
}
}
}