// Package idsearcher is used to search for short-form IDs in files
// within a directory, and to build an SPDX Document containing those
// license findings.
// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
package idsearcher

import (
	"bufio"
	"fmt"
	"github.com/spdx/tools-golang/spdx/v2_3"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"

	"github.com/spdx/tools-golang/builder"
	"github.com/spdx/tools-golang/spdx/v2_1"
	"github.com/spdx/tools-golang/spdx/v2_2"
	"github.com/spdx/tools-golang/utils"
)

// ===== 2.1 Searcher functions =====

// Config2_1 is a collection of configuration settings for idsearcher
// (for version 2.1 SPDX Documents). A few mandatory fields are set here
// so that they can be repeatedly reused in multiple calls to
// BuildIDsDocument2_1.
type Config2_1 struct {
	// NamespacePrefix should be a URI representing a prefix for the
	// namespace with which the SPDX Document will be associated.
	// It will be used in the DocumentNamespace field in the CreationInfo
	// section, followed by the per-Document package name and a random UUID.
	// It is forwarded unchanged to the builder as its NamespacePrefix.
	NamespacePrefix string

	// BuilderPathsIgnored lists certain paths to be omitted from the built
	// document. Each string should be a path, relative to the package's
	// dirRoot, to a specific file or (for all files in a directory) ending
	// in a slash. Prefix the string with "**" to omit all instances of that
	// file / directory, regardless of where it is in the file tree.
	// It is forwarded unchanged to the builder as its PathsIgnored.
	BuilderPathsIgnored []string

	// SearcherPathsIgnored lists certain paths that should not be searched
	// by idsearcher, even if those paths have Files present. It uses the
	// same format as BuilderPathsIgnored. Files matching these paths stay
	// in the document with their license fields set to NOASSERTION.
	SearcherPathsIgnored []string
}

// BuildIDsDocument2_1 creates an SPDX Document (version 2.1) and searches for
// short-form IDs in each file, filling in license fields as appropriate. It
// returns that document or error if any is encountered. Arguments:
//   - packageName: name of package / directory
//   - dirRoot: path to directory to be analyzed
//   - idconfig: searcher settings (namespace prefix plus the paths to be
//     ignored by the builder and by the searcher); must not be nil
//
// Every file starts out with LicenseInfoInFiles and LicenseConcluded set to
// NOASSERTION and keeps those values when it matches
// idconfig.SearcherPathsIgnored or when no usable short-form ID is found in
// it (including when the file cannot be opened). Otherwise
// LicenseInfoInFiles receives the sorted individual license IDs, and
// LicenseConcluded receives the expression that was found, or all found
// expressions joined with " AND " when there are several. The package's
// PackageLicenseInfoFromFiles receives the sorted union over all files, or
// NOASSERTION if nothing was found.
func BuildIDsDocument2_1(packageName string, dirRoot string, idconfig *Config2_1) (*v2_1.Document, error) {
	// a nil config would otherwise panic on the first field access below
	if idconfig == nil {
		return nil, fmt.Errorf("idconfig must not be nil")
	}

	// first, build the Document using builder
	bconfig := &builder.Config2_1{
		NamespacePrefix: idconfig.NamespacePrefix,
		CreatorType:     "Tool",
		Creator:         "github.com/spdx/tools-golang/idsearcher",
		PathsIgnored:    idconfig.BuilderPathsIgnored,
	}
	doc, err := builder.Build2_1(packageName, dirRoot, bconfig)
	if err != nil {
		return nil, err
	}
	if doc == nil {
		return nil, fmt.Errorf("builder returned nil Document")
	}
	if doc.Packages == nil {
		return nil, fmt.Errorf("builder returned nil Packages map")
	}
	if len(doc.Packages) != 1 {
		return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages))
	}

	// now, walk through each file and find its licenses (if any)
	pkg := doc.Packages[0]
	if pkg == nil {
		return nil, fmt.Errorf("builder returned nil Package")
	}
	if pkg.Files == nil {
		return nil, fmt.Errorf("builder returned nil Files in Package")
	}
	licsForPackage := map[string]struct{}{}
	for _, f := range pkg.Files {
		// start by initializing / clearing values
		f.LicenseInfoInFiles = []string{"NOASSERTION"}
		f.LicenseConcluded = "NOASSERTION"

		// check whether the searcher should ignore this file
		if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) {
			continue
		}

		// FIXME the error is deliberately ignored for now: we proceed with
		// FIXME whatever IDs were obtained (none if the file could not be
		// FIXME opened). Should probably either log it, and/or enable the
		// FIXME caller to configure what should happen.
		ids, _ := searchFileIDs(filepath.Join(dirRoot, f.FileName))

		// separate out this file's licenses
		licsForFile := map[string]struct{}{}
		exprs := []string{}
		for _, lid := range ids {
			licElements := getIndividualLicenses(lid)
			if len(licElements) == 0 {
				// an empty or operator-only ID names no license; joining
				// it in would corrupt the concluded expression
				continue
			}
			for _, elt := range licElements {
				licsForFile[elt] = struct{}{}
				licsForPackage[elt] = struct{}{}
			}
			exprs = append(exprs, lid)
		}

		// nothing usable found: leave the NOASSERTION defaults in place
		if len(exprs) == 0 {
			continue
		}

		f.LicenseInfoInFiles = make([]string, 0, len(licsForFile))
		for lic := range licsForFile {
			f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic)
		}
		sort.Strings(f.LicenseInfoInFiles)

		// avoid adding parens and joining for single-ID items
		if len(exprs) == 1 {
			f.LicenseConcluded = exprs[0]
			continue
		}
		parts := make([]string, len(exprs))
		for i, expr := range exprs {
			// parenthesize compound expressions so AND-joining keeps
			// their meaning
			parts[i] = makeElement(expr)
		}
		f.LicenseConcluded = strings.Join(parts, " AND ")
	}

	// and finally, we can fill in the package's details
	if len(licsForPackage) == 0 {
		pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"}
		return doc, nil
	}
	pkg.PackageLicenseInfoFromFiles = make([]string, 0, len(licsForPackage))
	for lic := range licsForPackage {
		pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic)
	}
	sort.Strings(pkg.PackageLicenseInfoFromFiles)

	return doc, nil
}

// ===== 2.2 Searcher functions =====

// Config2_2 is a collection of configuration settings for idsearcher
// (for version 2.2 SPDX Documents). A few mandatory fields are set here
// so that they can be repeatedly reused in multiple calls to
// BuildIDsDocument2_2.
type Config2_2 struct {
	// NamespacePrefix should be a URI representing a prefix for the
	// namespace with which the SPDX Document will be associated.
	// It will be used in the DocumentNamespace field in the CreationInfo
	// section, followed by the per-Document package name and a random UUID.
	// It is forwarded unchanged to the builder as its NamespacePrefix.
	NamespacePrefix string

	// BuilderPathsIgnored lists certain paths to be omitted from the built
	// document. Each string should be a path, relative to the package's
	// dirRoot, to a specific file or (for all files in a directory) ending
	// in a slash. Prefix the string with "**" to omit all instances of that
	// file / directory, regardless of where it is in the file tree.
	// It is forwarded unchanged to the builder as its PathsIgnored.
	BuilderPathsIgnored []string

	// SearcherPathsIgnored lists certain paths that should not be searched
	// by idsearcher, even if those paths have Files present. It uses the
	// same format as BuilderPathsIgnored. Files matching these paths stay
	// in the document with their license fields set to NOASSERTION.
	SearcherPathsIgnored []string
}

// BuildIDsDocument2_2 creates an SPDX Document (version 2.2) and searches for
// short-form IDs in each file, filling in license fields as appropriate. It
// returns that document or error if any is encountered. Arguments:
//   - packageName: name of package / directory
//   - dirRoot: path to directory to be analyzed
//   - idconfig: searcher settings (namespace prefix plus the paths to be
//     ignored by the builder and by the searcher); must not be nil
//
// Every file starts out with LicenseInfoInFiles and LicenseConcluded set to
// NOASSERTION and keeps those values when it matches
// idconfig.SearcherPathsIgnored or when no usable short-form ID is found in
// it (including when the file cannot be opened). Otherwise
// LicenseInfoInFiles receives the sorted individual license IDs, and
// LicenseConcluded receives the expression that was found, or all found
// expressions joined with " AND " when there are several. The package's
// PackageLicenseInfoFromFiles receives the sorted union over all files, or
// NOASSERTION if nothing was found.
func BuildIDsDocument2_2(packageName string, dirRoot string, idconfig *Config2_2) (*v2_2.Document, error) {
	// a nil config would otherwise panic on the first field access below
	if idconfig == nil {
		return nil, fmt.Errorf("idconfig must not be nil")
	}

	// first, build the Document using builder
	bconfig := &builder.Config2_2{
		NamespacePrefix: idconfig.NamespacePrefix,
		CreatorType:     "Tool",
		Creator:         "github.com/spdx/tools-golang/idsearcher",
		PathsIgnored:    idconfig.BuilderPathsIgnored,
	}
	doc, err := builder.Build2_2(packageName, dirRoot, bconfig)
	if err != nil {
		return nil, err
	}
	if doc == nil {
		return nil, fmt.Errorf("builder returned nil Document")
	}
	if doc.Packages == nil {
		return nil, fmt.Errorf("builder returned nil Packages map")
	}
	if len(doc.Packages) != 1 {
		return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages))
	}

	// now, walk through each file and find its licenses (if any)
	pkg := doc.Packages[0]
	if pkg == nil {
		return nil, fmt.Errorf("builder returned nil Package")
	}
	if pkg.Files == nil {
		return nil, fmt.Errorf("builder returned nil Files in Package")
	}
	licsForPackage := map[string]struct{}{}
	for _, f := range pkg.Files {
		// start by initializing / clearing values
		f.LicenseInfoInFiles = []string{"NOASSERTION"}
		f.LicenseConcluded = "NOASSERTION"

		// check whether the searcher should ignore this file
		if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) {
			continue
		}

		// FIXME the error is deliberately ignored for now: we proceed with
		// FIXME whatever IDs were obtained (none if the file could not be
		// FIXME opened). Should probably either log it, and/or enable the
		// FIXME caller to configure what should happen.
		ids, _ := searchFileIDs(filepath.Join(dirRoot, f.FileName))

		// separate out this file's licenses
		licsForFile := map[string]struct{}{}
		exprs := []string{}
		for _, lid := range ids {
			licElements := getIndividualLicenses(lid)
			if len(licElements) == 0 {
				// an empty or operator-only ID names no license; joining
				// it in would corrupt the concluded expression
				continue
			}
			for _, elt := range licElements {
				licsForFile[elt] = struct{}{}
				licsForPackage[elt] = struct{}{}
			}
			exprs = append(exprs, lid)
		}

		// nothing usable found: leave the NOASSERTION defaults in place
		if len(exprs) == 0 {
			continue
		}

		f.LicenseInfoInFiles = make([]string, 0, len(licsForFile))
		for lic := range licsForFile {
			f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic)
		}
		sort.Strings(f.LicenseInfoInFiles)

		// avoid adding parens and joining for single-ID items
		if len(exprs) == 1 {
			f.LicenseConcluded = exprs[0]
			continue
		}
		parts := make([]string, len(exprs))
		for i, expr := range exprs {
			// parenthesize compound expressions so AND-joining keeps
			// their meaning
			parts[i] = makeElement(expr)
		}
		f.LicenseConcluded = strings.Join(parts, " AND ")
	}

	// and finally, we can fill in the package's details
	if len(licsForPackage) == 0 {
		pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"}
		return doc, nil
	}
	pkg.PackageLicenseInfoFromFiles = make([]string, 0, len(licsForPackage))
	for lic := range licsForPackage {
		pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic)
	}
	sort.Strings(pkg.PackageLicenseInfoFromFiles)

	return doc, nil
}

// ===== 2.3 Searcher functions =====

// Config2_3 is a collection of configuration settings for idsearcher
// (for version 2.3 SPDX Documents). A few mandatory fields are set here
// so that they can be repeatedly reused in multiple calls to
// BuildIDsDocument2_3.
type Config2_3 struct {
	// NamespacePrefix should be a URI representing a prefix for the
	// namespace with which the SPDX Document will be associated.
	// It will be used in the DocumentNamespace field in the CreationInfo
	// section, followed by the per-Document package name and a random UUID.
	// It is forwarded unchanged to the builder as its NamespacePrefix.
	NamespacePrefix string

	// BuilderPathsIgnored lists certain paths to be omitted from the built
	// document. Each string should be a path, relative to the package's
	// dirRoot, to a specific file or (for all files in a directory) ending
	// in a slash. Prefix the string with "**" to omit all instances of that
	// file / directory, regardless of where it is in the file tree.
	// It is forwarded unchanged to the builder as its PathsIgnored.
	BuilderPathsIgnored []string

	// SearcherPathsIgnored lists certain paths that should not be searched
	// by idsearcher, even if those paths have Files present. It uses the
	// same format as BuilderPathsIgnored. Files matching these paths stay
	// in the document with their license fields set to NOASSERTION.
	SearcherPathsIgnored []string
}

// BuildIDsDocument2_3 creates an SPDX Document (version 2.3) and searches for
// short-form IDs in each file, filling in license fields as appropriate. It
// returns that document or error if any is encountered. Arguments:
//   - packageName: name of package / directory
//   - dirRoot: path to directory to be analyzed
//   - idconfig: searcher settings (namespace prefix plus the paths to be
//     ignored by the builder and by the searcher); must not be nil
//
// Every file starts out with LicenseInfoInFiles and LicenseConcluded set to
// NOASSERTION and keeps those values when it matches
// idconfig.SearcherPathsIgnored or when no usable short-form ID is found in
// it (including when the file cannot be opened). Otherwise
// LicenseInfoInFiles receives the sorted individual license IDs, and
// LicenseConcluded receives the expression that was found, or all found
// expressions joined with " AND " when there are several. The package's
// PackageLicenseInfoFromFiles receives the sorted union over all files, or
// NOASSERTION if nothing was found.
func BuildIDsDocument2_3(packageName string, dirRoot string, idconfig *Config2_3) (*v2_3.Document, error) {
	// a nil config would otherwise panic on the first field access below
	if idconfig == nil {
		return nil, fmt.Errorf("idconfig must not be nil")
	}

	// first, build the Document using builder
	bconfig := &builder.Config2_3{
		NamespacePrefix: idconfig.NamespacePrefix,
		CreatorType:     "Tool",
		Creator:         "github.com/spdx/tools-golang/idsearcher",
		PathsIgnored:    idconfig.BuilderPathsIgnored,
	}
	doc, err := builder.Build2_3(packageName, dirRoot, bconfig)
	if err != nil {
		return nil, err
	}
	if doc == nil {
		return nil, fmt.Errorf("builder returned nil Document")
	}
	if doc.Packages == nil {
		return nil, fmt.Errorf("builder returned nil Packages map")
	}
	if len(doc.Packages) != 1 {
		return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages))
	}

	// now, walk through each file and find its licenses (if any)
	pkg := doc.Packages[0]
	if pkg == nil {
		return nil, fmt.Errorf("builder returned nil Package")
	}
	if pkg.Files == nil {
		return nil, fmt.Errorf("builder returned nil Files in Package")
	}
	licsForPackage := map[string]struct{}{}
	for _, f := range pkg.Files {
		// start by initializing / clearing values
		f.LicenseInfoInFiles = []string{"NOASSERTION"}
		f.LicenseConcluded = "NOASSERTION"

		// check whether the searcher should ignore this file
		if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) {
			continue
		}

		// FIXME the error is deliberately ignored for now: we proceed with
		// FIXME whatever IDs were obtained (none if the file could not be
		// FIXME opened). Should probably either log it, and/or enable the
		// FIXME caller to configure what should happen.
		ids, _ := searchFileIDs(filepath.Join(dirRoot, f.FileName))

		// separate out this file's licenses
		licsForFile := map[string]struct{}{}
		exprs := []string{}
		for _, lid := range ids {
			licElements := getIndividualLicenses(lid)
			if len(licElements) == 0 {
				// an empty or operator-only ID names no license; joining
				// it in would corrupt the concluded expression
				continue
			}
			for _, elt := range licElements {
				licsForFile[elt] = struct{}{}
				licsForPackage[elt] = struct{}{}
			}
			exprs = append(exprs, lid)
		}

		// nothing usable found: leave the NOASSERTION defaults in place
		if len(exprs) == 0 {
			continue
		}

		f.LicenseInfoInFiles = make([]string, 0, len(licsForFile))
		for lic := range licsForFile {
			f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic)
		}
		sort.Strings(f.LicenseInfoInFiles)

		// avoid adding parens and joining for single-ID items
		if len(exprs) == 1 {
			f.LicenseConcluded = exprs[0]
			continue
		}
		parts := make([]string, len(exprs))
		for i, expr := range exprs {
			// parenthesize compound expressions so AND-joining keeps
			// their meaning
			parts[i] = makeElement(expr)
		}
		f.LicenseConcluded = strings.Join(parts, " AND ")
	}

	// and finally, we can fill in the package's details
	if len(licsForPackage) == 0 {
		pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"}
		return doc, nil
	}
	pkg.PackageLicenseInfoFromFiles = make([]string, 0, len(licsForPackage))
	for lic := range licsForPackage {
		pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic)
	}
	sort.Strings(pkg.PackageLicenseInfoFromFiles)

	return doc, nil
}

// ===== Utility functions (not version-specific) =====

// searchFileIDs scans the file at filePath line by line and returns the
// distinct short-form license IDs / expressions found after the short-form
// identifier tag, sorted in increasing order. The returned slice is non-nil
// (possibly empty) unless the file cannot be opened, in which case nil and
// the open error are returned.
//
// A tagged line is skipped when the text before the tag, once junk
// characters are stripped, is longer than 5 bytes (whitespace counts), and
// when nothing usable follows the tag. Scanner errors are not reported: the
// IDs collected up to that point are returned with a nil error.
func searchFileIDs(filePath string) ([]string, error) {
	const tag = "SPDX-License-Identifier:"

	f, err := os.Open(filePath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// used as a set, so that each distinct ID is reported once
	idsMap := map[string]struct{}{}

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.Contains(line, tag) {
			continue
		}
		strs := strings.SplitN(line, tag, 2)

		// if prefixed by more than n characters, it's probably not a
		// short-form ID; it's probably code to detect short-form IDs.
		// Like this function itself, for example  =)
		if len(stripTrash(strs[0])) > 5 {
			continue
		}

		// stop before trailing */ if it is present
		lid := strings.Split(strs[1], "*/")[0]
		// strip junk first and trim afterwards, so that whitespace left
		// behind by removed trailing characters does not end up in the ID
		lid = strings.TrimSpace(stripTrash(lid))
		if lid == "" {
			// a tag without any usable ID is not a finding
			continue
		}
		idsMap[lid] = struct{}{}
	}

	// FIXME for now, ignore scanner errors because we want to return whatever
	// FIXME IDs were in fact found. should probably be changed to either
	// FIXME log the error, and/or be configurable for what should happen.
	// if err = scanner.Err(); err != nil {
	// 	return nil, err
	// }

	// now, convert the set to a sorted slice
	ids := make([]string, 0, len(idsMap))
	for lid := range idsMap {
		ids = append(ids, lid)
	}
	sort.Strings(ids)

	return ids, nil
}

// trashRE matches every run of characters that is not a word character,
// whitespace, a digit, '.', '-', '+' or a parenthesis. It is compiled once
// at package level rather than on every stripTrash call.
var trashRE = regexp.MustCompile(`[^\w\s\d.\-\+()]+`)

// stripTrash returns lid with all characters matched by trashRE removed
// (comment markers, quotes, punctuation and the like). Whitespace is kept,
// so the result may still carry leading or trailing blanks.
func stripTrash(lid string) string {
	return trashRE.ReplaceAllString(lid, "")
}

// makeElement prepares a license expression for being joined with other
// expressions: when lic contains an upper-case " AND " or " OR " operator
// it is returned wrapped in parentheses, otherwise it is returned as is.
func makeElement(lic string) string {
	for _, operator := range []string{" AND ", " OR "} {
		if strings.Contains(lic, operator) {
			return fmt.Sprintf("(%s)", lic)
		}
	}
	return lic
}

// getIndividualLicenses breaks the license expression lic into its
// individual identifiers. Parentheses and '+' are treated as separators,
// the expression is split on the space character, and the operators AND,
// OR and WITH (matched case-insensitively) are dropped. The remaining
// tokens are returned sorted; duplicates are kept, and the result is a
// non-nil, possibly empty, slice.
func getIndividualLicenses(lic string) []string {
	// turn grouping and "or later" symbols into plain separators
	for _, symbol := range []string{"(", ")", "+"} {
		lic = strings.Replace(lic, symbol, " ", -1)
	}

	lics := []string{}
	for _, token := range strings.Split(lic, " ") {
		token = strings.TrimSpace(token)
		switch {
		case token == "":
			// gap left by consecutive separators
		case strings.EqualFold(token, "AND"),
			strings.EqualFold(token, "OR"),
			strings.EqualFold(token, "WITH"):
			// operator, not a license
		default:
			lics = append(lics, token)
		}
	}

	sort.Strings(lics)
	return lics
}
