// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package classifier
import (
"html"
"io"
"regexp"
"strings"
"unicode"
"unicode/utf8"
)
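// eol is the token emitted for end-of-line markers when the document is not
// normalized.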
var eol = "\n"
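// header reports whether the token looks like a list marker or section
// heading followed by '.', ':' or ')', e.g. "a.", "ii:" or "1.2.3)", so
// cleanupToken can drop it when it starts a line.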
func header(in string) bool {
if len(in) == 0 {
return false
}
p, e := in[:len(in)-1], in[len(in)-1]
switch e {
case '.', ':', ')':
if listMarker[p] {
if e != ')' {
return true
}
}
// Check for patterns like 1.2.3
for _, r := range p {
if unicode.IsDigit(r) || r == '.' {
continue
}
return false
}
return true
}
return false
}
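// listMarker records the single-letter and roman-numeral list markers that
// header recognizes.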
var listMarker = func() map[string]bool {
const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
l := map[string]bool{}
for _, marker := range strings.Split(allListMarkers, " ") {
l[marker] = true
}
return l
}()
// ignorableTexts is a list of lines (copyright notices and dates) that are
// not tokenized; dropping them produces a cleaner match.
var ignorableTexts = []*regexp.Regexp{
regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}
// tokenizeStream reads bytes from src and produces an indexedDocument of its
// content. tokenizeStream never returns an error of its own; it can only
// return an error from the provided Reader. If the provided Reader never
// returns an error, it is safe to assume that tokenizeStream will not return
// one either.
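//
// A minimal usage sketch (illustrative only; actual call sites live elsewhere
// in this package, and licenseText is a placeholder):
//
//	dict := newDictionary()
//	doc, err := tokenizeStream(strings.NewReader(licenseText), true, dict, true)
//	if err != nil {
//		// only an error from the underlying Reader can surface here
//	}
//	_ = doc.Tokens // indexed tokens, each tagged with its source line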
func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
const bufSize = 1024
// The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover bytes
// in the buffer to ensure we never run out of bytes trying to finish
// constructing a rune. These leftover 4 bytes will be copied to the start of
// the buffer before additional bytes are read.
tgt := bufSize - 4
rbuf := make([]byte, bufSize)
obuf := make([]byte, 0)
linebuf := make([]tokenID, 0)
idx := 0
line := 1 // 1-based line count
deferredEOL := false
deferredWord := false
// The tokenizer uses a local dictionary while analyzing the input doc to
// conserve memory and avoid polluting the global dictionary.
ld := newDictionary()
var doc indexedDocument
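// isEOF treats both io.EOF and io.ErrUnexpectedEOF from io.ReadFull as the
// end of the input stream.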
isEOF := func(in error) bool {
return in == io.EOF || in == io.ErrUnexpectedEOF
}
// Read out the stream in chunks
for {
// Fill up the buffer with bytes to extract runes from
// idx is the offset past any bytes left over from previous reads
n, err := io.ReadFull(src, rbuf[idx:])
if isEOF(err) {
// There are no more bytes to read, so we must now consume all bytes in the
// buffer.
tgt = idx + n
} else if err != nil {
return nil, err
}
for idx = 0; idx < tgt; {
r, n := utf8.DecodeRune(rbuf[idx:])
idx += n
if r == '\n' {
// Deal with the end of a line.
// If we are in a word (len(obuf) > 0) and the last rune is a '-',
// strike that rune and keep accumulating the hyphenated word.
// Otherwise we treat the newline like a space and flush the word.
if len(obuf) > 0 {
if obuf[len(obuf)-1] == '-' {
obuf = obuf[0 : len(obuf)-1]
deferredEOL = true
continue
}
// Append the word fragment to the line buffer
linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
}
// If there is something in the line to process, do so now
if len(linebuf) > 0 {
appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
linebuf = nil
obuf = nil
}
if !normalize {
tokID := dict.getIndex(eol)
if tokID == unknownIndex {
tokID = dict.add(eol)
}
doc.Tokens = append(doc.Tokens, indexedToken{
ID: tokID,
Line: line})
}
line++
continue
}
if len(obuf) == 0 {
if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
// A number or word character starts an interesting word.
// Now we slurp up all non-space runes and aggregate them into
// a single word.
// Buffer the initial token, normalizing to lower case if needed
if normalize {
r = unicode.ToLower(r)
}
obuf = utf8.AppendRune(obuf, r)
}
continue
}
// At this point, len(obuf) > 0 and we are accumulating more runes
// to complete a word.
if unicode.IsSpace(r) {
// If we have a deferred EOL, we need to pick up a non-space character
// to resume the hyphenated word, so we just consume spaces until that
// happens
if deferredEOL {
continue
}
// This is a space between word characters, so we assemble the word as a
// token and flush it out.
idx -= n
linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
if deferredWord {
appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
linebuf = nil
deferredWord = false
// The joined word has already been credited to the line where it
// started, so account for the deferred newline now.
line++
}
obuf = make([]byte, 0)
continue
}
if deferredEOL {
deferredEOL = false
deferredWord = true
}
// Perform token mappings for punctuation to emulate
// normalizePunctuation. The mapping is a string, so each of its runes
// needs to be injected individually.
if rep, found := punctuationMappings[r]; found {
for _, t := range rep {
obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
}
continue
}
// if it's not punctuation, lowercase and buffer the token
obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
}
// Break out if we have consumed all read bytes
if isEOF(err) {
break
}
// Copy the unconsumed bytes at the end of the buffer to the start
// of the buffer so the next read appends after them.
n = copy(rbuf, rbuf[idx:])
idx = n
}
// Process the remaining bytes in the buffer
if len(obuf) > 0 {
linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
}
if len(linebuf) > 0 {
appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
}
doc.dict = dict
doc.generateFrequencies()
doc.runes = diffWordsToRunes(&doc, 0, doc.size())
doc.Norm = doc.normalized()
return &doc, nil
}
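// appendToDoc converts the accumulated line buffer into indexed tokens, or a
// copyright Match for ignorable lines, and appends the result to the document.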
func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
if tokens != nil {
doc.Tokens = append(doc.Tokens, tokens...)
} else if m != nil {
doc.Matches = append(doc.Matches, m)
}
}
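// stringifyLineBuf reassembles the line from the local dictionary. Lines that
// match ignorableTexts are reported as a Copyright Match; otherwise each token
// is cleaned up and resolved against (or added to) the passed-in dictionary.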
func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
if len(in) == 0 {
return nil, nil
}
var sb strings.Builder
for i, r := range in {
out := ld.getWord(r)
if out == "" {
continue
}
sb.WriteString(out)
if i < len(in)-1 {
sb.WriteByte(' ')
}
}
out := sb.String()
for _, re := range ignorableTexts {
if re.MatchString(out) {
return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
}
}
var tokens []indexedToken
for i, r := range in {
txt := cleanupToken(i, ld.getWord(r), normalize)
if txt != "" {
var tokID tokenID
if updateDict {
tokID = dict.add(txt)
} else {
tokID = dict.getIndex(txt)
}
tokens = append(tokens, indexedToken{
Line: line,
ID: tokID,
})
}
}
return tokens, nil
}
func normalizeToken(in string) string {
// This performs some preprocessing on the token.
// It differs from cleanupToken in that the fixups here are substring
// replacements rather than exact matches on the whole token.
// Normalizing URLs from https to http is an example of a fix applied
// here.
return strings.ReplaceAll(in, "https", "http")
}
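// flushBuf converts the accumulated rune buffer into a token, unescaping any
// HTML entities and applying normalizeToken, then stores it in the local
// dictionary and returns its ID.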
func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
// clean up the contents of the rune buffer
token := string(obuf)
// escape sequences can occur anywhere in the string, not just the beginning,
// so always attempt to unescape the word's content.
token = html.UnescapeString(token)
clean := normalizeToken(token)
return ld.add(clean)
}
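// cleanupToken strips noise from a token before it enters the dictionary:
// header/list markers at the start of a line are dropped entirely, numeric
// tokens keep only digits, periods, and dashes, and other tokens keep only
// letters. When normalizeWord is set, spelling variants are mapped to a
// canonical form via interchangeableWords.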
func cleanupToken(pos int, in string, normalizeWord bool) string {
r, _ := utf8.DecodeRuneInString(in)
var out strings.Builder
if pos == 0 && header(in) {
return ""
}
if !unicode.IsLetter(r) {
if unicode.IsDigit(r) {
// Based on analysis of the license corpus, the characters that are
// significant are numbers, periods, and dashes. Anything else can be
// safely discarded; doing so helps avoid matching failures due to
// inconsistent whitespace and formatting.
for _, c := range in {
if unicode.IsDigit(c) || c == '.' || c == '-' {
out.WriteRune(c)
}
}
// Numbers should not end in a '.' since that doesn't indicate a version
// number, but usually the end of a line.
res := out.String()
for strings.HasSuffix(res, ".") {
res = res[0 : len(res)-1]
}
return res
}
}
// Remove internal hyphenation or URL constructs to better normalize strings
// for matching.
for _, c := range in {
if unicode.IsLetter(c) {
out.WriteRune(c)
}
}
tok := out.String()
if !normalizeWord {
return tok
}
if iw, ok := interchangeableWords[tok]; ok {
return iw
}
return tok
}
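// interchangeableWords maps spelling variants (mostly British/American pairs)
// to a single canonical form so that spelling differences don't prevent a
// match.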
var interchangeableWords = map[string]string{
"analyse": "analyze",
"artefact": "artifact",
"authorisation": "authorization",
"authorised": "authorized",
"calibre": "caliber",
"cancelled": "canceled",
"capitalisations": "capitalizations",
"catalogue": "catalog",
"categorise": "categorize",
"centre": "center",
"emphasised": "emphasized",
"favour": "favor",
"favourite": "favorite",
"fulfil": "fulfill",
"fulfilment": "fulfillment",
"https": "http",
"initialise": "initialize",
"judgment": "judgement",
"labelling": "labeling",
"labour": "labor",
"licence": "license",
"maximise": "maximize",
"modelled": "modeled",
"modelling": "modeling",
"offence": "offense",
"optimise": "optimize",
"organisation": "organization",
"organise": "organize",
"practise": "practice",
"programme": "program",
"realise": "realize",
"recognise": "recognize",
"signalling": "signaling",
"utilisation": "utilization",
"whilst": "while",
"wilful": "wilfull",
// TODO: These three need tokenizer magic
"non commercial": "noncommercial",
"per cent": "percent",
"sub license": "sublicense",
}
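// punctuationMappings maps typographic dashes and symbols to the plain ASCII
// equivalents used during tokenization.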
var punctuationMappings = map[rune]string{
'-': "-",
'‒': "-",
'–': "-",
'—': "-",
'‐': "-",
'©': "(c)",
'§': "(s)",
'¤': "(s)",
'·': " ",
'*': " ",
}