blob: a5176ca7c224af9067eae984f10dea1b35dd0fd7 [file] [log] [blame] [edit]
// Package idsearcher is used to search for short-form IDs in files
// within a directory, and to build an SPDX Document containing those
// license findings.
// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
package idsearcher
import (
"bufio"
"fmt"
"github.com/spdx/tools-golang/spdx/v2_3"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"github.com/spdx/tools-golang/builder"
"github.com/spdx/tools-golang/spdx/v2_1"
"github.com/spdx/tools-golang/spdx/v2_2"
"github.com/spdx/tools-golang/utils"
)
// ===== 2.1 Searcher functions =====
// Config2_1 is a collection of configuration settings for idsearcher
// (for version 2.1 SPDX Documents). A few mandatory fields are set here
// so that they can be repeatedly reused in multiple calls to
// BuildIDsDocument2_1.
type Config2_1 struct {
	// NamespacePrefix should be a URI representing a prefix for the
	// namespace with which the SPDX Document will be associated.
	// It is handed to the builder unchanged as its own NamespacePrefix
	// setting, and will be used in the DocumentNamespace field in the
	// CreationInfo section, followed by the per-Document package name and
	// a random UUID.
	NamespacePrefix string

	// BuilderPathsIgnored lists certain paths to be omitted from the built
	// document. Each string should be a path, relative to the package's
	// dirRoot, to a specific file or (for all files in a directory) ending
	// in a slash. Prefix the string with "**" to omit all instances of that
	// file / directory, regardless of where it is in the file tree.
	// It is handed to the builder unchanged as its PathsIgnored setting.
	BuilderPathsIgnored []string

	// SearcherPathsIgnored lists certain paths that should not be searched
	// by idsearcher, even if those paths have Files present. It uses the
	// same format as BuilderPathsIgnored. Files matching these paths stay
	// in the document, with their license fields set to NOASSERTION.
	SearcherPathsIgnored []string
}
// BuildIDsDocument2_1 creates an SPDX Document (version 2.1) and searches for
// short-form IDs in each file, filling in license fields as appropriate. It
// returns that document or error if any is encountered. Arguments:
//   - packageName: name of package / directory
//   - dirRoot: path to directory to be analyzed
//   - idconfig: namespace prefix plus the builder and searcher ignore lists;
//     a nil value is rejected with an error
//
// Every file starts out with NOASSERTION for both LicenseInfoInFiles and
// LicenseConcluded, and keeps those values if it is ignored by the searcher,
// cannot be read, or contains no usable identifier. Otherwise
// LicenseInfoInFiles becomes the sorted set of individual licenses found and
// LicenseConcluded becomes the found expressions joined with " AND ". The
// package's PackageLicenseInfoFromFiles is the sorted union over all files,
// or NOASSERTION if nothing was found.
func BuildIDsDocument2_1(packageName string, dirRoot string, idconfig *Config2_1) (*v2_1.Document, error) {
	// guard against a nil config instead of panicking on first use
	if idconfig == nil {
		return nil, fmt.Errorf("got nil Config2_1")
	}

	// first, build the Document using builder
	bconfig := &builder.Config2_1{
		NamespacePrefix: idconfig.NamespacePrefix,
		CreatorType:     "Tool",
		Creator:         "github.com/spdx/tools-golang/idsearcher",
		PathsIgnored:    idconfig.BuilderPathsIgnored,
	}
	doc, err := builder.Build2_1(packageName, dirRoot, bconfig)
	if err != nil {
		return nil, err
	}
	if doc == nil {
		return nil, fmt.Errorf("builder returned nil Document")
	}
	if doc.Packages == nil {
		return nil, fmt.Errorf("builder returned nil Packages map")
	}
	if len(doc.Packages) != 1 {
		return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages))
	}

	// now, walk through each file and find its licenses (if any)
	pkg := doc.Packages[0]
	if pkg == nil {
		return nil, fmt.Errorf("builder returned nil Package")
	}
	if pkg.Files == nil {
		return nil, fmt.Errorf("builder returned nil Files in Package")
	}

	licsForPackage := map[string]struct{}{}
	for _, f := range pkg.Files {
		// start by initializing / clearing values
		f.LicenseInfoInFiles = []string{"NOASSERTION"}
		f.LicenseConcluded = "NOASSERTION"

		// check whether the searcher should ignore this file
		if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) {
			continue
		}

		// Best effort: the error is deliberately dropped so that one
		// unreadable file does not abort the whole document; such a file
		// simply keeps NOASSERTION.
		// FIXME should probably either log the error, and/or enable the
		// FIXME caller to configure what should happen.
		ids, _ := searchFileIDs(filepath.Join(dirRoot, f.FileName))

		// separate out for this file's licenses
		licsForFile := map[string]struct{}{}
		var licsRaw, licsParens []string
		for _, lid := range ids {
			// get individual elements and add for file and package
			licElements := getIndividualLicenses(lid)
			if len(licElements) == 0 {
				// an empty or operator-only identifier names no license;
				// joining it in would yield a malformed expression such
				// as " AND MIT"
				continue
			}
			for _, elt := range licElements {
				licsForFile[elt] = struct{}{}
				licsForPackage[elt] = struct{}{}
			}
			// keep the raw form, and the parenthesized form for joining
			licsRaw = append(licsRaw, lid)
			licsParens = append(licsParens, makeElement(lid))
		}

		// nothing usable found: the file keeps NOASSERTION
		if len(licsForFile) == 0 {
			continue
		}

		// OK -- now we can fill in the file's details
		f.LicenseInfoInFiles = make([]string, 0, len(licsForFile))
		for lic := range licsForFile {
			f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic)
		}
		sort.Strings(f.LicenseInfoInFiles)

		// avoid adding parens and joining for single-ID items
		if len(licsRaw) == 1 {
			f.LicenseConcluded = licsRaw[0]
		} else {
			f.LicenseConcluded = strings.Join(licsParens, " AND ")
		}
	}

	// and finally, we can fill in the package's details
	if len(licsForPackage) == 0 {
		pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"}
		return doc, nil
	}
	pkg.PackageLicenseInfoFromFiles = make([]string, 0, len(licsForPackage))
	for lic := range licsForPackage {
		pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic)
	}
	sort.Strings(pkg.PackageLicenseInfoFromFiles)

	return doc, nil
}
// ===== 2.2 Searcher functions =====
// Config2_2 is a collection of configuration settings for idsearcher
// (for version 2.2 SPDX Documents). A few mandatory fields are set here
// so that they can be repeatedly reused in multiple calls to
// BuildIDsDocument2_2.
type Config2_2 struct {
	// NamespacePrefix should be a URI representing a prefix for the
	// namespace with which the SPDX Document will be associated.
	// It is handed to the builder unchanged as its own NamespacePrefix
	// setting, and will be used in the DocumentNamespace field in the
	// CreationInfo section, followed by the per-Document package name and
	// a random UUID.
	NamespacePrefix string

	// BuilderPathsIgnored lists certain paths to be omitted from the built
	// document. Each string should be a path, relative to the package's
	// dirRoot, to a specific file or (for all files in a directory) ending
	// in a slash. Prefix the string with "**" to omit all instances of that
	// file / directory, regardless of where it is in the file tree.
	// It is handed to the builder unchanged as its PathsIgnored setting.
	BuilderPathsIgnored []string

	// SearcherPathsIgnored lists certain paths that should not be searched
	// by idsearcher, even if those paths have Files present. It uses the
	// same format as BuilderPathsIgnored. Files matching these paths stay
	// in the document, with their license fields set to NOASSERTION.
	SearcherPathsIgnored []string
}
// BuildIDsDocument2_2 creates an SPDX Document (version 2.2) and searches for
// short-form IDs in each file, filling in license fields as appropriate. It
// returns that document or error if any is encountered. Arguments:
//   - packageName: name of package / directory
//   - dirRoot: path to directory to be analyzed
//   - idconfig: namespace prefix plus the builder and searcher ignore lists;
//     a nil value is rejected with an error
//
// Every file starts out with NOASSERTION for both LicenseInfoInFiles and
// LicenseConcluded, and keeps those values if it is ignored by the searcher,
// cannot be read, or contains no usable identifier. Otherwise
// LicenseInfoInFiles becomes the sorted set of individual licenses found and
// LicenseConcluded becomes the found expressions joined with " AND ". The
// package's PackageLicenseInfoFromFiles is the sorted union over all files,
// or NOASSERTION if nothing was found.
func BuildIDsDocument2_2(packageName string, dirRoot string, idconfig *Config2_2) (*v2_2.Document, error) {
	// guard against a nil config instead of panicking on first use
	if idconfig == nil {
		return nil, fmt.Errorf("got nil Config2_2")
	}

	// first, build the Document using builder
	bconfig := &builder.Config2_2{
		NamespacePrefix: idconfig.NamespacePrefix,
		CreatorType:     "Tool",
		Creator:         "github.com/spdx/tools-golang/idsearcher",
		PathsIgnored:    idconfig.BuilderPathsIgnored,
	}
	doc, err := builder.Build2_2(packageName, dirRoot, bconfig)
	if err != nil {
		return nil, err
	}
	if doc == nil {
		return nil, fmt.Errorf("builder returned nil Document")
	}
	if doc.Packages == nil {
		return nil, fmt.Errorf("builder returned nil Packages map")
	}
	if len(doc.Packages) != 1 {
		return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages))
	}

	// now, walk through each file and find its licenses (if any)
	pkg := doc.Packages[0]
	if pkg == nil {
		return nil, fmt.Errorf("builder returned nil Package")
	}
	if pkg.Files == nil {
		return nil, fmt.Errorf("builder returned nil Files in Package")
	}

	licsForPackage := map[string]struct{}{}
	for _, f := range pkg.Files {
		// start by initializing / clearing values
		f.LicenseInfoInFiles = []string{"NOASSERTION"}
		f.LicenseConcluded = "NOASSERTION"

		// check whether the searcher should ignore this file
		if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) {
			continue
		}

		// Best effort: the error is deliberately dropped so that one
		// unreadable file does not abort the whole document; such a file
		// simply keeps NOASSERTION.
		// FIXME should probably either log the error, and/or enable the
		// FIXME caller to configure what should happen.
		ids, _ := searchFileIDs(filepath.Join(dirRoot, f.FileName))

		// separate out for this file's licenses
		licsForFile := map[string]struct{}{}
		var licsRaw, licsParens []string
		for _, lid := range ids {
			// get individual elements and add for file and package
			licElements := getIndividualLicenses(lid)
			if len(licElements) == 0 {
				// an empty or operator-only identifier names no license;
				// joining it in would yield a malformed expression such
				// as " AND MIT"
				continue
			}
			for _, elt := range licElements {
				licsForFile[elt] = struct{}{}
				licsForPackage[elt] = struct{}{}
			}
			// keep the raw form, and the parenthesized form for joining
			licsRaw = append(licsRaw, lid)
			licsParens = append(licsParens, makeElement(lid))
		}

		// nothing usable found: the file keeps NOASSERTION
		if len(licsForFile) == 0 {
			continue
		}

		// OK -- now we can fill in the file's details
		f.LicenseInfoInFiles = make([]string, 0, len(licsForFile))
		for lic := range licsForFile {
			f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic)
		}
		sort.Strings(f.LicenseInfoInFiles)

		// avoid adding parens and joining for single-ID items
		if len(licsRaw) == 1 {
			f.LicenseConcluded = licsRaw[0]
		} else {
			f.LicenseConcluded = strings.Join(licsParens, " AND ")
		}
	}

	// and finally, we can fill in the package's details
	if len(licsForPackage) == 0 {
		pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"}
		return doc, nil
	}
	pkg.PackageLicenseInfoFromFiles = make([]string, 0, len(licsForPackage))
	for lic := range licsForPackage {
		pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic)
	}
	sort.Strings(pkg.PackageLicenseInfoFromFiles)

	return doc, nil
}
// ===== 2.3 Searcher functions =====
// Config2_3 is a collection of configuration settings for idsearcher
// (for version 2.3 SPDX Documents). A few mandatory fields are set here
// so that they can be repeatedly reused in multiple calls to
// BuildIDsDocument2_3.
type Config2_3 struct {
	// NamespacePrefix should be a URI representing a prefix for the
	// namespace with which the SPDX Document will be associated.
	// It is handed to the builder unchanged as its own NamespacePrefix
	// setting, and will be used in the DocumentNamespace field in the
	// CreationInfo section, followed by the per-Document package name and
	// a random UUID.
	NamespacePrefix string

	// BuilderPathsIgnored lists certain paths to be omitted from the built
	// document. Each string should be a path, relative to the package's
	// dirRoot, to a specific file or (for all files in a directory) ending
	// in a slash. Prefix the string with "**" to omit all instances of that
	// file / directory, regardless of where it is in the file tree.
	// It is handed to the builder unchanged as its PathsIgnored setting.
	BuilderPathsIgnored []string

	// SearcherPathsIgnored lists certain paths that should not be searched
	// by idsearcher, even if those paths have Files present. It uses the
	// same format as BuilderPathsIgnored. Files matching these paths stay
	// in the document, with their license fields set to NOASSERTION.
	SearcherPathsIgnored []string
}
// BuildIDsDocument2_3 creates an SPDX Document (version 2.3) and searches for
// short-form IDs in each file, filling in license fields as appropriate. It
// returns that document or error if any is encountered. Arguments:
//   - packageName: name of package / directory
//   - dirRoot: path to directory to be analyzed
//   - idconfig: namespace prefix plus the builder and searcher ignore lists;
//     a nil value is rejected with an error
//
// Every file starts out with NOASSERTION for both LicenseInfoInFiles and
// LicenseConcluded, and keeps those values if it is ignored by the searcher,
// cannot be read, or contains no usable identifier. Otherwise
// LicenseInfoInFiles becomes the sorted set of individual licenses found and
// LicenseConcluded becomes the found expressions joined with " AND ". The
// package's PackageLicenseInfoFromFiles is the sorted union over all files,
// or NOASSERTION if nothing was found.
func BuildIDsDocument2_3(packageName string, dirRoot string, idconfig *Config2_3) (*v2_3.Document, error) {
	// guard against a nil config instead of panicking on first use
	if idconfig == nil {
		return nil, fmt.Errorf("got nil Config2_3")
	}

	// first, build the Document using builder
	bconfig := &builder.Config2_3{
		NamespacePrefix: idconfig.NamespacePrefix,
		CreatorType:     "Tool",
		Creator:         "github.com/spdx/tools-golang/idsearcher",
		PathsIgnored:    idconfig.BuilderPathsIgnored,
	}
	doc, err := builder.Build2_3(packageName, dirRoot, bconfig)
	if err != nil {
		return nil, err
	}
	if doc == nil {
		return nil, fmt.Errorf("builder returned nil Document")
	}
	if doc.Packages == nil {
		return nil, fmt.Errorf("builder returned nil Packages map")
	}
	if len(doc.Packages) != 1 {
		return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages))
	}

	// now, walk through each file and find its licenses (if any)
	pkg := doc.Packages[0]
	if pkg == nil {
		return nil, fmt.Errorf("builder returned nil Package")
	}
	if pkg.Files == nil {
		return nil, fmt.Errorf("builder returned nil Files in Package")
	}

	licsForPackage := map[string]struct{}{}
	for _, f := range pkg.Files {
		// start by initializing / clearing values
		f.LicenseInfoInFiles = []string{"NOASSERTION"}
		f.LicenseConcluded = "NOASSERTION"

		// check whether the searcher should ignore this file
		if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) {
			continue
		}

		// Best effort: the error is deliberately dropped so that one
		// unreadable file does not abort the whole document; such a file
		// simply keeps NOASSERTION.
		// FIXME should probably either log the error, and/or enable the
		// FIXME caller to configure what should happen.
		ids, _ := searchFileIDs(filepath.Join(dirRoot, f.FileName))

		// separate out for this file's licenses
		licsForFile := map[string]struct{}{}
		var licsRaw, licsParens []string
		for _, lid := range ids {
			// get individual elements and add for file and package
			licElements := getIndividualLicenses(lid)
			if len(licElements) == 0 {
				// an empty or operator-only identifier names no license;
				// joining it in would yield a malformed expression such
				// as " AND MIT"
				continue
			}
			for _, elt := range licElements {
				licsForFile[elt] = struct{}{}
				licsForPackage[elt] = struct{}{}
			}
			// keep the raw form, and the parenthesized form for joining
			licsRaw = append(licsRaw, lid)
			licsParens = append(licsParens, makeElement(lid))
		}

		// nothing usable found: the file keeps NOASSERTION
		if len(licsForFile) == 0 {
			continue
		}

		// OK -- now we can fill in the file's details
		f.LicenseInfoInFiles = make([]string, 0, len(licsForFile))
		for lic := range licsForFile {
			f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic)
		}
		sort.Strings(f.LicenseInfoInFiles)

		// avoid adding parens and joining for single-ID items
		if len(licsRaw) == 1 {
			f.LicenseConcluded = licsRaw[0]
		} else {
			f.LicenseConcluded = strings.Join(licsParens, " AND ")
		}
	}

	// and finally, we can fill in the package's details
	if len(licsForPackage) == 0 {
		pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"}
		return doc, nil
	}
	pkg.PackageLicenseInfoFromFiles = make([]string, 0, len(licsForPackage))
	for lic := range licsForPackage {
		pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic)
	}
	sort.Strings(pkg.PackageLicenseInfoFromFiles)

	return doc, nil
}
// ===== Utility functions (not version-specific) =====
// searchFileIDs reads the file at filePath line by line and collects the
// short-form license expressions that follow the SPDX identifier tag.
// It returns the distinct, non-empty expressions in sorted order (an empty,
// non-nil slice if there are none), or a nil slice and the error if the
// file cannot be opened.
//
// Read errors that occur after the file has been opened are not reported:
// whatever identifiers were found up to that point are returned.
func searchFileIDs(filePath string) ([]string, error) {
	const tag = "SPDX-License-Identifier:"

	f, err := os.Open(filePath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	found := map[string]struct{}{}
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := scanner.Text()
		tagAt := strings.Index(line, tag)
		if tagAt < 0 {
			continue
		}

		// if prefixed by more than n characters, it's probably not a
		// short-form ID; it's probably code to detect short-form IDs.
		// Like this function itself, for example =)
		// (characters are counted after cleanup by stripTrash)
		if len(stripTrash(line[:tagAt])) > 5 {
			continue
		}

		// stop before a trailing comment terminator if one is present;
		// otherwise the dashes of "-->" would survive cleanup and end up
		// glued to the identifier
		rest := line[tagAt+len(tag):]
		for _, closer := range []string{"*/", "-->"} {
			if end := strings.Index(rest, closer); end >= 0 {
				rest = rest[:end]
			}
		}

		// trim only after stripping, so that whitespace exposed by removing
		// trailing junk (e.g. "MIT #") does not stay in the identifier
		lid := strings.TrimSpace(stripTrash(rest))
		if lid == "" {
			// a tag with nothing after it declares no license
			continue
		}
		found[lid] = struct{}{}
	}

	// FIXME for now, ignore scanner errors because we want to return whatever
	// FIXME IDs were in fact found. should probably be changed to either
	// FIXME log the error, and/or be configurable for what should happen.
	// if err = scanner.Err(); err != nil {
	// 	return nil, err
	// }

	// now, convert the set to a sorted slice
	ids := make([]string, 0, len(found))
	for lid := range found {
		ids = append(ids, lid)
	}
	sort.Strings(ids)

	return ids, nil
}
// stripTrashPattern matches every run of characters other than word
// characters, whitespace, digits, '.', '-', '+', '(' and ')'. It is compiled
// once at package scope rather than on every stripTrash call.
var stripTrashPattern = regexp.MustCompile(`[^\w\s\d.\-\+()]+`)

// stripTrash returns lid with all characters matched by stripTrashPattern
// removed, e.g. comment markers and quotes around a license identifier.
// Whitespace is kept, so callers that need a trimmed result must trim it
// themselves.
func stripTrash(lid string) string {
	return stripTrashPattern.ReplaceAllString(lid, "")
}
// makeElement prepares a single license expression for being combined with
// others: an expression that itself contains an " AND " or " OR " operator
// (upper case, surrounded by spaces) is wrapped in parentheses; anything
// else is returned unchanged.
func makeElement(lic string) string {
	switch {
	case strings.Contains(lic, " AND "), strings.Contains(lic, " OR "):
		wrapped := fmt.Sprintf("(%s)", lic)
		return wrapped
	default:
		return lic
	}
}
// licenseExprSeparators turns the punctuation of a license expression
// (parentheses and the '+' suffix) into spaces in a single pass.
var licenseExprSeparators = strings.NewReplacer("(", " ", ")", " ", "+", " ")

// getIndividualLicenses breaks the license expression lic into its
// individual identifiers. Parentheses and '+' are treated as separators,
// the expression is split on any whitespace, and the operators AND, OR and
// WITH (compared case-insensitively) are dropped. The remaining identifiers
// are returned sorted; duplicates are not removed. The result is never nil.
func getIndividualLicenses(lic string) []string {
	// strings.Fields splits on runs of any whitespace, so tab-separated
	// expressions are broken up too and no empty elements are produced
	tokens := strings.Fields(licenseExprSeparators.Replace(lic))

	lics := make([]string, 0, len(tokens))
	for _, tok := range tokens {
		switch {
		case strings.EqualFold(tok, "AND"),
			strings.EqualFold(tok, "OR"),
			strings.EqualFold(tok, "WITH"):
			// operator, not a license
			continue
		}
		lics = append(lics, tok)
	}

	// sort before returning
	sort.Strings(lics)
	return lics
}