// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package classifier
import (
"html"
"io"
"regexp"
"strings"
"unicode"
"unicode/utf8"
)
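// eol is the token emitted for end-of-line markers when the document is not
// normalized.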
var eol = "\n"
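// header reports whether the token looks like a list marker or section
// heading followed by '.', ':' or ')', e.g. "a.", "ii:" or "1.2.3)", so
// cleanupToken can drop it when it starts a line.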
func header(in string) bool {
if len(in) == 0 {
return false
}
p, e := in[:len(in)-1], in[len(in)-1]
switch e {
case '.', ':', ')':
if listMarker[p] {
if e != ')' {
return true
}
}
// Check for patterns like 1.2.3
for _, r := range p {
if unicode.IsDigit(r) || r == '.' {
continue
}
return false
}
return true
}
return false
}
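// listMarker records the single-letter and roman-numeral list markers that
// header recognizes.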
var listMarker = func() map[string]bool {
const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
l := map[string]bool{}
for _, marker := range strings.Split(allListMarkers, " ") {
l[marker] = true
}
return l
}()
// ignorableTexts is a list of lines (copyright notices and dates) that are
// not tokenized; dropping them produces a cleaner match.
var ignorableTexts = []*regexp.Regexp{
regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}
// tokenizeStream reads bytes from src and produces an indexedDocument of its
// content. tokenizeStream never returns an error of its own; it can only
// return an error from the provided Reader. If the provided Reader never
// returns an error, it is safe to assume that tokenizeStream will not return
// one either.
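//
// A minimal usage sketch (illustrative only; actual call sites live elsewhere
// in this package, and licenseText is a placeholder):
//
//	dict := newDictionary()
//	doc, err := tokenizeStream(strings.NewReader(licenseText), true, dict, true)
//	if err != nil {
//		// only an error from the underlying Reader can surface here
//	}
//	_ = doc.Tokens // indexed tokens, each tagged with its source line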
func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
const bufSize = 1024
// The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover bytes
// in the buffer to ensure we never run out of bytes trying to finish
// constructing a rune. These leftover 4 bytes will be copied to the start of
// the buffer before additional bytes are read.
tgt := bufSize - 4
rbuf := make([]byte, bufSize)
obuf := make([]byte, 0)
linebuf := make([]tokenID, 0)
idx := 0
line := 1 // 1-based line count
deferredEOL := false
deferredWord := false
// The tokenizer uses a local dictionary while analyzing the input doc to
// conserve memory and avoid polluting the global dictionary.
ld := newDictionary()
var doc indexedDocument
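// isEOF treats both io.EOF and io.ErrUnexpectedEOF from io.ReadFull as the
// end of the input stream.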
isEOF := func(in error) bool {
return in == io.EOF || in == io.ErrUnexpectedEOF
}
// Read out the stream in chunks
for {
// Fill up the buffer with bytes to extract runes from
// idx is the offset past any bytes left over from previous reads
n, err := io.ReadFull(src, rbuf[idx:])
if isEOF(err) {
// There are no more bytes to read, so we must now consume all bytes in the
// buffer.
tgt = idx + n
} else if err != nil {
return nil, err
}
for idx = 0; idx < tgt; {
r, n := utf8.DecodeRune(rbuf[idx:])
idx += n
if r == '\n' {
// Deal with the end of a line.
// If we are in a word (len(obuf) > 0) and the last rune is a '-',
// strike that rune and keep accumulating the hyphenated word.
// Otherwise we treat the newline like a space and flush the word.
if len(obuf) > 0 {
if obuf[len(obuf)-1] == '-' {
obuf = obuf[0 : len(obuf)-1]
deferredEOL = true
continue
}
// Append the word fragment to the line buffer
linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
}
// If there is something in the line to process, do so now
if len(linebuf) > 0 {
appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
linebuf = nil
obuf = nil
}
if !normalize {
tokID := dict.getIndex(eol)
if tokID == unknownIndex {
tokID = dict.add(eol)
}
doc.Tokens = append(doc.Tokens, indexedToken{
ID: tokID,
Line: line})
}
line++
continue
}
if len(obuf) == 0 {
if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
// A number or word character starts an interesting word.
// Now we slurp up all non-space runes and aggregate them into
// a single word.
// Buffer the initial token, normalizing to lower case if needed
if normalize {
r = unicode.ToLower(r)
}
obuf = utf8.AppendRune(obuf, r)
}
continue
}
// At this point, len(obuf) > 0 and we are accumulating more runes
// to complete a word.
if unicode.IsSpace(r) {
// If we have a deferred EOL, we need to pick up a non-space character
// to resume the hyphenated word, so we just consume spaces until that
// happens
if deferredEOL {
continue
}
// This is a space between word characters, so we assemble the word as a
// token and flush it out.
idx -= n
linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
if deferredWord {
appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
linebuf = nil
deferredWord = false
// The joined word has already been credited to the line where it
// started, so account for the deferred newline now.
line++
}
obuf = make([]byte, 0)
continue
}
if deferredEOL {
deferredEOL = false
deferredWord = true
}
// Perform token mappings for punctuation to emulate
// normalizePunctuation. The mapping is a string, so each of its runes
// needs to be injected individually.
if rep, found := punctuationMappings[r]; found {
for _, t := range rep {
obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
}
continue
}
// if it's not punctuation, lowercase and buffer the token
obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
}
// Break out if we have consumed all read bytes
if isEOF(err) {
break
}
// Copy the unconsumed bytes at the end of the buffer to the start
// of the buffer so the next read appends after them.
n = copy(rbuf, rbuf[idx:])
idx = n
}
// Process the remaining bytes in the buffer
if len(obuf) > 0 {
linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
}
if len(linebuf) > 0 {
appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
}
doc.dict = dict
doc.generateFrequencies()
doc.runes = diffWordsToRunes(&doc, 0, doc.size())
doc.Norm = doc.normalized()
return &doc, nil
}
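// appendToDoc converts the accumulated line buffer into indexed tokens, or a
// copyright Match for ignorable lines, and appends the result to the document.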
func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
if tokens != nil {
doc.Tokens = append(doc.Tokens, tokens...)
} else if m != nil {
doc.Matches = append(doc.Matches, m)
}
}
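// stringifyLineBuf reassembles the line from the local dictionary. Lines that
// match ignorableTexts are reported as a Copyright Match; otherwise each token
// is cleaned up and resolved against (or added to) the passed-in dictionary.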
func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
if len(in) == 0 {
return nil, nil
}
var sb strings.Builder
for i, r := range in {
out := ld.getWord(r)
if out == "" {
continue
}
sb.WriteString(out)
if i < len(in)-1 {
sb.WriteByte(' ')
}
}
out := sb.String()
for _, re := range ignorableTexts {
if re.MatchString(out) {
return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
}
}
var tokens []indexedToken
for i, r := range in {
txt := cleanupToken(i, ld.getWord(r), normalize)
if txt != "" {
var tokID tokenID
if updateDict {
tokID = dict.add(txt)
} else {
tokID = dict.getIndex(txt)
}
tokens = append(tokens, indexedToken{
Line: line,
ID: tokID,
})
}
}
return tokens, nil
}
func normalizeToken(in string) string {
// This performs some preprocessing on the token.
// It differs from cleanupToken in that the fixups here are substring
// replacements rather than exact matches on the whole token.
// Normalizing URLs from https to http is an example of a fix applied
// here.
return strings.ReplaceAll(in, "https", "http")
}
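// flushBuf converts the accumulated rune buffer into a token, unescaping any
// HTML entities and applying normalizeToken, then stores it in the local
// dictionary and returns its ID.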
func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
// clean up the contents of the rune buffer
token := string(obuf)
// escape sequences can occur anywhere in the string, not just the beginning,
// so always attempt to unescape the word's content.
token = html.UnescapeString(token)
clean := normalizeToken(token)
return ld.add(clean)
}
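// cleanupToken strips noise from a token before it enters the dictionary:
// header/list markers at the start of a line are dropped entirely, numeric
// tokens keep only digits, periods, and dashes, and other tokens keep only
// letters. When normalizeWord is set, spelling variants are mapped to a
// canonical form via interchangeableWords.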
func cleanupToken(pos int, in string, normalizeWord bool) string {
r, _ := utf8.DecodeRuneInString(in)
var out strings.Builder
if pos == 0 && header(in) {
return ""
}
if !unicode.IsLetter(r) {
if unicode.IsDigit(r) {
// Based on analysis of the license corpus, the characters that are
// significant are numbers, periods, and dashes. Anything else can be
// safely discarded; doing so helps avoid matching failures due to
// inconsistent whitespace and formatting.
for _, c := range in {
if unicode.IsDigit(c) || c == '.' || c == '-' {
out.WriteRune(c)
}
}
// Numbers should not end in a '.' since that doesn't indicate a version
// number, but usually the end of a line.
res := out.String()
for strings.HasSuffix(res, ".") {
res = res[0 : len(res)-1]
}
return res
}
}
// Remove internal hyphenation or URL constructs to better normalize strings
// for matching.
for _, c := range in {
if unicode.IsLetter(c) {
out.WriteRune(c)
}
}
tok := out.String()
if !normalizeWord {
return tok
}
if iw, ok := interchangeableWords[tok]; ok {
return iw
}
return tok
}
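// interchangeableWords maps spelling variants (mostly British/American pairs)
// to a single canonical form so that spelling differences don't prevent a
// match.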
var interchangeableWords = map[string]string{
"analyse": "analyze",
"artefact": "artifact",
"authorisation": "authorization",
"authorised": "authorized",
"calibre": "caliber",
"cancelled": "canceled",
"capitalisations": "capitalizations",
"catalogue": "catalog",
"categorise": "categorize",
"centre": "center",
"emphasised": "emphasized",
"favour": "favor",
"favourite": "favorite",
"fulfil": "fulfill",
"fulfilment": "fulfillment",
"https": "http",
"initialise": "initialize",
"judgment": "judgement",
"labelling": "labeling",
"labour": "labor",
"licence": "license",
"maximise": "maximize",
"modelled": "modeled",
"modelling": "modeling",
"offence": "offense",
"optimise": "optimize",
"organisation": "organization",
"organise": "organize",
"practise": "practice",
"programme": "program",
"realise": "realize",
"recognise": "recognize",
"signalling": "signaling",
"utilisation": "utilization",
"whilst": "while",
"wilful": "wilfull",
// TODO: These three need tokenizer magic
"non commercial": "noncommercial",
"per cent": "percent",
"sub license": "sublicense",
}
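// punctuationMappings maps typographic dashes and symbols to the plain ASCII
// equivalents used during tokenization.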
var punctuationMappings = map[rune]string{
'-': "-",
'‒': "-",
'–': "-",
'—': "-",
'‐': "-",
'©': "(c)",
'§': "(s)",
'¤': "(s)",
'·': " ",
'*': " ",
}