From 8ba824d5a54f11613d783b75eef47bc75a2d2d7e Mon Sep 17 00:00:00 2001 From: Matthias Bertschy Date: Fri, 30 Jan 2026 17:16:02 +0100 Subject: [PATCH] feat: Phase 1 memory optimization for JavaScript SBOM generation Signed-off-by: Matthias Bertschy --- internal/licenses/context.go | 10 +++- internal/licenses/context_test.go | 11 ++-- .../cataloger/internal/dependency/resolver.go | 33 ++++++---- .../cataloger/javascript/parse_pnpm_lock.go | 26 +++++--- .../cataloger/javascript/parse_yarn_lock.go | 60 +++++++++++++------ 5 files changed, 93 insertions(+), 47 deletions(-) diff --git a/internal/licenses/context.go b/internal/licenses/context.go index 91301cc20b9..c2861ff0a7b 100644 --- a/internal/licenses/context.go +++ b/internal/licenses/context.go @@ -2,12 +2,15 @@ package licenses import ( "context" + "errors" ) type licenseScannerKey struct{} var ctxKey = licenseScannerKey{} +var ErrNoLicenseScanner = errors.New("no license scanner set in context") + func SetContextLicenseScanner(ctx context.Context, s Scanner) context.Context { return context.WithValue(ctx, ctxKey, s) } @@ -18,8 +21,9 @@ func IsContextLicenseScannerSet(ctx context.Context) bool { } func ContextLicenseScanner(ctx context.Context) (Scanner, error) { - if s, ok := ctx.Value(ctxKey).(Scanner); ok { - return s, nil + s, ok := ctx.Value(ctxKey).(Scanner) + if !ok { + return nil, ErrNoLicenseScanner } - return NewDefaultScanner() + return s, nil } diff --git a/internal/licenses/context_test.go b/internal/licenses/context_test.go index 9d979ea58bb..9e3ed4c27e9 100644 --- a/internal/licenses/context_test.go +++ b/internal/licenses/context_test.go @@ -33,16 +33,15 @@ func TestContextLicenseScanner(t *testing.T) { scanner := testScanner() ctx := SetContextLicenseScanner(context.Background(), scanner) s, err := ContextLicenseScanner(ctx) - if err != nil || s != scanner { - t.Fatal("expected scanner from context") - } + require.NoError(t, err) + require.Equal(t, scanner, s) }) t.Run("without scanner", func(t *testing.T) { ctx := context.Background() s, err := ContextLicenseScanner(ctx) - if err != nil || s == nil { - t.Fatal("expected default scanner") - } + require.Error(t, err) + require.ErrorIs(t, err, ErrNoLicenseScanner) + require.Nil(t, s) }) } diff --git a/syft/pkg/cataloger/internal/dependency/resolver.go b/syft/pkg/cataloger/internal/dependency/resolver.go index c7074df79d4..1175feeb9fe 100644 --- a/syft/pkg/cataloger/internal/dependency/resolver.go +++ b/syft/pkg/cataloger/internal/dependency/resolver.go @@ -3,14 +3,17 @@ package dependency import ( "sort" - "github.com/scylladb/go-set/strset" - "github.com/anchore/syft/internal" "github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/pkg" "github.com/anchore/syft/syft/pkg/cataloger/generic" ) +// pairKey is a struct used to track seen relationship pairs without string concatenation +type pairKey struct { + from, to artifact.ID +} + // Specification holds strings that indicate abstract resources that a package provides for other packages and // requires for itself. These strings can represent anything from file paths, package names, or any other concept // that is useful for dependency resolution within that packing ecosystem. @@ -64,15 +67,15 @@ func Resolve(specifier Specifier, pkgs []pkg.Package) (relationships []artifact. specsByPkg[id] = allProvides(pkgsProvidingResource, id, specifier(p)) } - seen := strset.New() + seen := make(map[pairKey]struct{}) for _, dependantPkg := range pkgs { specs := specsByPkg[dependantPkg.ID()] for _, spec := range specs { for _, resource := range deduplicate(spec.Requires) { for providingPkgID := range pkgsProvidingResource[resource] { - // prevent creating duplicate relationships - pairKey := string(providingPkgID) + "-" + string(dependantPkg.ID()) - if seen.Has(pairKey) { + // prevent creating duplicate relationships using struct key instead of string concatenation + key := pairKey{from: providingPkgID, to: dependantPkg.ID()} + if _, exists := seen[key]; exists { continue } @@ -86,7 +89,7 @@ func Resolve(specifier Specifier, pkgs []pkg.Package) (relationships []artifact. }, ) - seen.Add(pairKey) + seen[key] = struct{}{} } } } @@ -112,8 +115,16 @@ func allProvides(pkgsProvidingResource map[string]internal.Set[artifact.ID], id func deduplicate(ss []string) []string { // note: we sort the set such that multiple invocations of this function will be deterministic - set := strset.New(ss...) - list := set.List() - sort.Strings(list) - return list + // use map for O(1) lookups without strset overhead + unique := make(map[string]struct{}, len(ss)) + result := make([]string, 0, len(unique)) + + for _, s := range ss { + if _, exists := unique[s]; !exists { + unique[s] = struct{}{} + result = append(result, s) + } + } + sort.Strings(result) + return result } diff --git a/syft/pkg/cataloger/javascript/parse_pnpm_lock.go b/syft/pkg/cataloger/javascript/parse_pnpm_lock.go index 6ab32cae57c..2e0ff96dc4c 100644 --- a/syft/pkg/cataloger/javascript/parse_pnpm_lock.go +++ b/syft/pkg/cataloger/javascript/parse_pnpm_lock.go @@ -107,17 +107,21 @@ func (p *pnpmV6LockYaml) Parse(version float64, data []byte) ([]pnpmPackage, err log.WithFields("key", key).Trace("unable to parse pnpm package key") continue } - pkgKey := name + "@" + ver + pkgKey := strings.Join([]string{name, ver}, "@") integrity := "" if value, ok := pkgInfo.Resolution["integrity"]; ok { integrity = value } - dependencies := make(map[string]string) + dependencies := make(map[string]string, len(pkgInfo.Dependencies)) for depName, depVersion := range pkgInfo.Dependencies { - var normalizedVersion = strings.SplitN(depVersion, "(", 2)[0] - dependencies[depName] = normalizedVersion + // Use strings.Cut for more efficient splitting + if normalizedVersion, _, ok := strings.Cut(depVersion, "("); ok { + dependencies[depName] = normalizedVersion + } else { + dependencies[depName] = depVersion + } } packages[pkgKey] = pnpmPackage{Name: name, Version: ver, Integrity: integrity, Dependencies: dependencies, Dev: pkgInfo.Dev} @@ -143,7 +147,7 @@ func (p *pnpmV9LockYaml) Parse(_ float64, data []byte) ([]pnpmPackage, error) { log.WithFields("key", key).Trace("unable to parse pnpm v9 package key") continue } - pkgKey := name + "@" + ver + pkgKey := strings.Join([]string{name, ver}, "@") packages[pkgKey] = pnpmPackage{Name: name, Version: ver, Integrity: entry.Resolution["integrity"], Dev: entry.Dev} } @@ -153,12 +157,16 @@ func (p *pnpmV9LockYaml) Parse(_ float64, data []byte) ([]pnpmPackage, error) { log.WithFields("key", key).Trace("unable to parse pnpm v9 package snapshot key") continue } - pkgKey := name + "@" + ver + pkgKey := strings.Join([]string{name, ver}, "@") if pkg, ok := packages[pkgKey]; ok { - pkg.Dependencies = make(map[string]string) + pkg.Dependencies = make(map[string]string, len(snapshotInfo.Dependencies)) for name, versionSpecifier := range snapshotInfo.Dependencies { - var normalizedVersion = strings.SplitN(versionSpecifier, "(", 2)[0] - pkg.Dependencies[name] = normalizedVersion + // Use strings.Cut for more efficient splitting + if normalizedVersion, _, ok := strings.Cut(versionSpecifier, "("); ok { + pkg.Dependencies[name] = normalizedVersion + } else { + pkg.Dependencies[name] = versionSpecifier + } } packages[pkgKey] = pkg } else { diff --git a/syft/pkg/cataloger/javascript/parse_yarn_lock.go b/syft/pkg/cataloger/javascript/parse_yarn_lock.go index beb1ca87855..c42b3934d45 100644 --- a/syft/pkg/cataloger/javascript/parse_yarn_lock.go +++ b/syft/pkg/cataloger/javascript/parse_yarn_lock.go @@ -1,6 +1,7 @@ package javascript import ( + "bufio" "bytes" "context" "fmt" @@ -71,20 +72,26 @@ func newGenericYarnLockAdapter(cfg CatalogerConfig) genericYarnLockAdapter { } } -func parseYarnV1LockFile(reader io.ReadCloser) ([]yarnPackage, error) { - content, err := io.ReadAll(reader) - if err != nil { - return nil, fmt.Errorf("failed to read yarn.lock file: %w", err) +// isYarnV1Lockfile checks if the reader contains a v1 yarn lockfile by peeking at the first few bytes +func isYarnV1Lockfile(reader *bufio.Reader) (bool, error) { + // Peek at first 100 bytes to check for v1 marker + peek, err := reader.Peek(100) + if err != nil && err != io.EOF { + return false, err } + return bytes.Contains(peek, []byte("# yarn lockfile v1")), nil +} - re := regexp.MustCompile(`\r?\n`) - lines := re.Split(string(content), -1) +// parseYarnV1LockFile parses a v1 yarn.lock file using line-by-line scanning +func parseYarnV1LockFile(reader io.Reader) ([]yarnPackage, error) { + scanner := bufio.NewScanner(reader) var pkgs []yarnPackage var pkg = yarnPackage{} var seenPkgs = strset.New() dependencies := make(map[string]string) - for _, line := range lines { + for scanner.Scan() { + line := scanner.Text() if strings.HasPrefix(line, "#") { continue } @@ -101,8 +108,7 @@ func parseYarnV1LockFile(reader io.ReadCloser) ([]yarnPackage, error) { // The first line of a package entry is the name of the package with no // leading spaces if !strings.HasPrefix(line, " ") { - name := line - pkg.Name = findPackageName(name) + pkg.Name = findPackageName(line) continue } if strings.HasPrefix(line, " ") && !strings.HasPrefix(line, " ") { @@ -133,6 +139,11 @@ func parseYarnV1LockFile(reader io.ReadCloser) ([]yarnPackage, error) { dependencies[dependencyName] = dependencyVersion } } + + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("failed to read yarn.lock file: %w", err) + } + // If the last package in the list is not the same as the current package, add the current package // to the list. In case there was no trailing new line before we hit EOF. if len(pkg.Name) > 0 && !seenPkgs.Has(pkg.Name+"@"+pkg.Version) { @@ -171,21 +182,34 @@ func (a genericYarnLockAdapter) parseYarnLock(ctx context.Context, resolver file return nil, nil, nil } - data, err := io.ReadAll(reader) + // Wrap reader in bufio.Reader for peeking at the lockfile version + bufReader := bufio.NewReader(reader) + + var yarnPkgs []yarnPackage + var err error + + // v1 Yarn lockfiles are not YAML, so we parse them line-by-line for memory efficiency. + // v2+ lockfiles are YAML and require loading the entire file. + // We peek at the first few bytes to determine the version. + isV1, err := isYarnV1Lockfile(bufReader) if err != nil { - return nil, nil, fmt.Errorf("failed to load yarn.lock file: %w", err) + return nil, nil, fmt.Errorf("failed to determine yarn.lock version: %w", err) } - // Reset the reader to the beginning of the file - reader.ReadCloser = io.NopCloser(bytes.NewBuffer(data)) - var yarnPkgs []yarnPackage - // v1 Yarn lockfiles are not YAML, so we need to parse them as a special case. They typically - // include a comment line that indicates the version. I.e. "# yarn lockfile v1" - if strings.Contains(string(data), "# yarn lockfile v1") { - yarnPkgs, err = parseYarnV1LockFile(reader) + if isV1 { + // For v1, parse line-by-line without loading entire file into memory + yarnPkgs, err = parseYarnV1LockFile(bufReader) } else { + // For v2+, we need to load the entire file as YAML + data, err := io.ReadAll(bufReader) + if err != nil { + return nil, nil, fmt.Errorf("failed to load yarn.lock file: %w", err) + } + // Reset the reader to the beginning of the file + reader.ReadCloser = io.NopCloser(bytes.NewBuffer(data)) yarnPkgs, err = parseYarnLockYaml(reader) } + if err != nil { return nil, nil, fmt.Errorf("failed to parse yarn.lock file: %w", err) }