-
Notifications
You must be signed in to change notification settings - Fork 778
feat: Phase 1 memory optimization for JavaScript SBOM generation #4585
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,14 +3,17 @@ package dependency | |
| import ( | ||
| "sort" | ||
|
|
||
| "github.com/scylladb/go-set/strset" | ||
|
|
||
| "github.com/anchore/syft/internal" | ||
| "github.com/anchore/syft/syft/artifact" | ||
| "github.com/anchore/syft/syft/pkg" | ||
| "github.com/anchore/syft/syft/pkg/cataloger/generic" | ||
| ) | ||
|
|
||
| // pairKey is a struct used to track seen relationship pairs without string concatenation | ||
| type pairKey struct { | ||
| from, to artifact.ID | ||
| } | ||
|
|
||
| // Specification holds strings that indicate abstract resources that a package provides for other packages and | ||
| // requires for itself. These strings can represent anything from file paths, package names, or any other concept | ||
| // that is useful for dependency resolution within that packing ecosystem. | ||
|
|
@@ -64,15 +67,15 @@ func Resolve(specifier Specifier, pkgs []pkg.Package) (relationships []artifact. | |
| specsByPkg[id] = allProvides(pkgsProvidingResource, id, specifier(p)) | ||
| } | ||
|
|
||
| seen := strset.New() | ||
| seen := make(map[pairKey]struct{}) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks like a good change -- we should avoid creating temporary strings like this as much as possible, but using the |
||
| for _, dependantPkg := range pkgs { | ||
| specs := specsByPkg[dependantPkg.ID()] | ||
| for _, spec := range specs { | ||
| for _, resource := range deduplicate(spec.Requires) { | ||
| for providingPkgID := range pkgsProvidingResource[resource] { | ||
| // prevent creating duplicate relationships | ||
| pairKey := string(providingPkgID) + "-" + string(dependantPkg.ID()) | ||
| if seen.Has(pairKey) { | ||
| // prevent creating duplicate relationships using struct key instead of string concatenation | ||
| key := pairKey{from: providingPkgID, to: dependantPkg.ID()} | ||
| if _, exists := seen[key]; exists { | ||
| continue | ||
| } | ||
|
|
||
|
|
@@ -86,7 +89,7 @@ func Resolve(specifier Specifier, pkgs []pkg.Package) (relationships []artifact. | |
| }, | ||
| ) | ||
|
|
||
| seen.Add(pairKey) | ||
| seen[key] = struct{}{} | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -112,8 +115,16 @@ func allProvides(pkgsProvidingResource map[string]internal.Set[artifact.ID], id | |
|
|
||
| func deduplicate(ss []string) []string { | ||
| // note: we sort the set such that multiple invocations of this function will be deterministic | ||
| set := strset.New(ss...) | ||
| list := set.List() | ||
| sort.Strings(list) | ||
| return list | ||
| // use map for O(1) lookups without strset overhead | ||
| unique := make(map[string]struct{}, len(ss)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change might be good, but it's sacrificing readability for what is probably a small gain. Tangentially, I question whether this needs to be sorted. |
||
| result := make([]string, 0, len(unique)) | ||
|
|
||
| for _, s := range ss { | ||
| if _, exists := unique[s]; !exists { | ||
| unique[s] = struct{}{} | ||
| result = append(result, s) | ||
| } | ||
| } | ||
| sort.Strings(result) | ||
| return result | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -107,17 +107,21 @@ func (p *pnpmV6LockYaml) Parse(version float64, data []byte) ([]pnpmPackage, err | |
| log.WithFields("key", key).Trace("unable to parse pnpm package key") | ||
| continue | ||
| } | ||
| pkgKey := name + "@" + ver | ||
| pkgKey := strings.Join([]string{name, ver}, "@") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is still concatenating strings together and is less efficient -- up to half as efficient. This was my intuition, so I wrote a benchmark to verify this; I'll skip to the results: Test: To be sure there wasn't too much being optimized out, I took the example further and it continued to show the concat form uses fewer allocations and less time. |
||
|
|
||
| integrity := "" | ||
| if value, ok := pkgInfo.Resolution["integrity"]; ok { | ||
| integrity = value | ||
| } | ||
|
|
||
| dependencies := make(map[string]string) | ||
| dependencies := make(map[string]string, len(pkgInfo.Dependencies)) | ||
| for depName, depVersion := range pkgInfo.Dependencies { | ||
| var normalizedVersion = strings.SplitN(depVersion, "(", 2)[0] | ||
| dependencies[depName] = normalizedVersion | ||
| // Use strings.Cut for more efficient splitting | ||
| if normalizedVersion, _, ok := strings.Cut(depVersion, "("); ok { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All of these changes to |
||
| dependencies[depName] = normalizedVersion | ||
| } else { | ||
| dependencies[depName] = depVersion | ||
| } | ||
| } | ||
|
|
||
| packages[pkgKey] = pnpmPackage{Name: name, Version: ver, Integrity: integrity, Dependencies: dependencies, Dev: pkgInfo.Dev} | ||
|
|
@@ -143,7 +147,7 @@ func (p *pnpmV9LockYaml) Parse(_ float64, data []byte) ([]pnpmPackage, error) { | |
| log.WithFields("key", key).Trace("unable to parse pnpm v9 package key") | ||
| continue | ||
| } | ||
| pkgKey := name + "@" + ver | ||
| pkgKey := strings.Join([]string{name, ver}, "@") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the same less efficient change. |
||
| packages[pkgKey] = pnpmPackage{Name: name, Version: ver, Integrity: entry.Resolution["integrity"], Dev: entry.Dev} | ||
| } | ||
|
|
||
|
|
@@ -153,12 +157,16 @@ func (p *pnpmV9LockYaml) Parse(_ float64, data []byte) ([]pnpmPackage, error) { | |
| log.WithFields("key", key).Trace("unable to parse pnpm v9 package snapshot key") | ||
| continue | ||
| } | ||
| pkgKey := name + "@" + ver | ||
| pkgKey := strings.Join([]string{name, ver}, "@") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another instance. |
||
| if pkg, ok := packages[pkgKey]; ok { | ||
| pkg.Dependencies = make(map[string]string) | ||
| pkg.Dependencies = make(map[string]string, len(snapshotInfo.Dependencies)) | ||
| for name, versionSpecifier := range snapshotInfo.Dependencies { | ||
| var normalizedVersion = strings.SplitN(versionSpecifier, "(", 2)[0] | ||
| pkg.Dependencies[name] = normalizedVersion | ||
| // Use strings.Cut for more efficient splitting | ||
| if normalizedVersion, _, ok := strings.Cut(versionSpecifier, "("); ok { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another Cut change. |
||
| pkg.Dependencies[name] = normalizedVersion | ||
| } else { | ||
| pkg.Dependencies[name] = versionSpecifier | ||
| } | ||
| } | ||
| packages[pkgKey] = pkg | ||
| } else { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| package javascript | ||
|
|
||
| import ( | ||
| "bufio" | ||
| "bytes" | ||
| "context" | ||
| "fmt" | ||
|
|
@@ -71,20 +72,26 @@ | |
| } | ||
| } | ||
|
|
||
| func parseYarnV1LockFile(reader io.ReadCloser) ([]yarnPackage, error) { | ||
| content, err := io.ReadAll(reader) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to read yarn.lock file: %w", err) | ||
| // isYarnV1Lockfile checks if the reader contains a v1 yarn lockfile by peeking at the first few bytes | ||
| func isYarnV1Lockfile(reader *bufio.Reader) (bool, error) { | ||
| // Peek at first 100 bytes to check for v1 marker | ||
| peek, err := reader.Peek(100) | ||
| if err != nil && err != io.EOF { | ||
| return false, err | ||
| } | ||
| return bytes.Contains(peek, []byte("# yarn lockfile v1")), nil | ||
| } | ||
|
|
||
| re := regexp.MustCompile(`\r?\n`) | ||
| lines := re.Split(string(content), -1) | ||
| // parseYarnV1LockFile parses a v1 yarn.lock file using line-by-line scanning | ||
| func parseYarnV1LockFile(reader io.Reader) ([]yarnPackage, error) { | ||
| scanner := bufio.NewScanner(reader) | ||
| var pkgs []yarnPackage | ||
| var pkg = yarnPackage{} | ||
| var seenPkgs = strset.New() | ||
| dependencies := make(map[string]string) | ||
|
|
||
| for _, line := range lines { | ||
| for scanner.Scan() { | ||
| line := scanner.Text() | ||
| if strings.HasPrefix(line, "#") { | ||
| continue | ||
| } | ||
|
|
@@ -101,8 +108,7 @@ | |
| // The first line of a package entry is the name of the package with no | ||
| // leading spaces | ||
| if !strings.HasPrefix(line, " ") { | ||
| name := line | ||
| pkg.Name = findPackageName(name) | ||
| pkg.Name = findPackageName(line) | ||
| continue | ||
| } | ||
| if strings.HasPrefix(line, " ") && !strings.HasPrefix(line, " ") { | ||
|
|
@@ -133,6 +139,11 @@ | |
| dependencies[dependencyName] = dependencyVersion | ||
| } | ||
| } | ||
|
|
||
| if err := scanner.Err(); err != nil { | ||
| return nil, fmt.Errorf("failed to read yarn.lock file: %w", err) | ||
| } | ||
|
|
||
| // If the last package in the list is not the same as the current package, add the current package | ||
| // to the list. In case there was no trailing new line before we hit EOF. | ||
| if len(pkg.Name) > 0 && !seenPkgs.Has(pkg.Name+"@"+pkg.Version) { | ||
|
|
@@ -171,21 +182,34 @@ | |
| return nil, nil, nil | ||
| } | ||
|
|
||
| data, err := io.ReadAll(reader) | ||
| // Wrap reader in bufio.Reader for peeking at the lockfile version | ||
| bufReader := bufio.NewReader(reader) | ||
|
|
||
| var yarnPkgs []yarnPackage | ||
| var err error | ||
|
|
||
| // v1 Yarn lockfiles are not YAML, so we parse them line-by-line for memory efficiency. | ||
| // v2+ lockfiles are YAML and require loading the entire file. | ||
| // We peek at the first few bytes to determine the version. | ||
| isV1, err := isYarnV1Lockfile(bufReader) | ||
| if err != nil { | ||
| return nil, nil, fmt.Errorf("failed to load yarn.lock file: %w", err) | ||
| return nil, nil, fmt.Errorf("failed to determine yarn.lock version: %w", err) | ||
| } | ||
| // Reset the reader to the beginning of the file | ||
| reader.ReadCloser = io.NopCloser(bytes.NewBuffer(data)) | ||
|
|
||
| var yarnPkgs []yarnPackage | ||
| // v1 Yarn lockfiles are not YAML, so we need to parse them as a special case. They typically | ||
| // include a comment line that indicates the version. I.e. "# yarn lockfile v1" | ||
| if strings.Contains(string(data), "# yarn lockfile v1") { | ||
| yarnPkgs, err = parseYarnV1LockFile(reader) | ||
| if isV1 { | ||
| // For v1, parse line-by-line without loading entire file into memory | ||
| yarnPkgs, err = parseYarnV1LockFile(bufReader) | ||
| } else { | ||
| // For v2+, we need to load the entire file as YAML | ||
| data, err := io.ReadAll(bufReader) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like this should just be able to pass the |
||
| if err != nil { | ||
| return nil, nil, fmt.Errorf("failed to load yarn.lock file: %w", err) | ||
| } | ||
| // Reset the reader to the beginning of the file | ||
| reader.ReadCloser = io.NopCloser(bytes.NewBuffer(data)) | ||
| yarnPkgs, err = parseYarnLockYaml(reader) | ||
|
Check failure on line 210 in syft/pkg/cataloger/javascript/parse_yarn_lock.go
|
||
| } | ||
|
|
||
| if err != nil { | ||
| return nil, nil, fmt.Errorf("failed to parse yarn.lock file: %w", err) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is still an unnecessary change here. There is a scanner created before cataloging happens, and that scanner is passed in the context to all users.