2828which makes every filesystem performance suffer.
2929
3030In addition, when storing these files in Git repositories, we need to avoid creating any repository
31- with too many files that would make using this repository impactical or exceed the limits of some
31+ with too many files that would make using this repository impractical or exceed the limits of some
3232repository hosting services.
3333
3434Therefore we are storing vulnerability data using a directory tree using the first few characters
@@ -46,21 +46,21 @@ def build_vcid(prefix="VCID"):
4646 """
4747 Return a new Vulnerable Code ID (aka. VCID) which is a strongly unique vulnerability
4848 identifier string using the provided ``prefix``. A VCID is composed of a four letter prefix, and
49- three segments composed of four letters and dihits each separated by a dash.
49+ three segments composed of four letters and digits each separated by a dash.
5050 For example::
5151 >>> import re
5252 >>> vcid = build_vcid()
5353 >>> assert re.match('VCID(-[a-hjkm-z1-9]{4}){3}', vcid), vcid
5454
5555 We were mistakenly not using enough bits. The symptom was that the last
56- segment of the VCID was always strting with "aaa" This ensure we are now OK:
56+ segment of the VCID was always starting with "aaa". This ensures we are now OK:
5757 >>> vcids = [build_vcid() for _ in range(50)]
5858 >>> assert not any(vid.split("-")[-1].startswith("aaa") for vid in vcids)
5959 """
6060 uid = uuid4 ().bytes
61- # we keep three segments of 4 base32-encodee bytes, 3*4=12
61+ # we keep three segments of 4 base32-encoded bytes, 3*4=12
6262 # which corresponds to 60 bits
63- # becausee each base32 byte can store 5 bits (2**5 = 32)
63+ # because each base32 byte can store 5 bits (2**5 = 32)
6464 uid = base32_custom (uid )[:12 ].decode ("utf-8" ).lower ()
6565 return f"{ prefix } -{ uid [:4 ]} -{ uid [4 :8 ]} -{ uid [8 :12 ]} "
6666
@@ -117,7 +117,7 @@ def vulnerability_yml_path(vcid):
117117 Return the path to a vulnerability YAML file crafted from the ``vcid`` VCID vulnerability id.
118118
119119 The approach is to distribute the files in many directories to avoid having too many files in
120- any directory and be able to find the path to a vulneravility file given its VCID distributed on
120+ any directory and be able to find the path to a vulnerability file given its VCID distributed on
121121 the first two characters of the UUID section of a VCID.
122122
123123 The UUID is using a base32 encoding, hence keeping two characters means 32 x 32 = 1024
@@ -162,6 +162,52 @@ def get_package_vulnerabilities_yml_file_path(purl: Union[PackageURL, str]):
162162 return get_package_base_dir (purl ) / VULNERABILITIES_FILENAME
163163
164164
165+ # We use a 4-tier system for storing package metadata.
166+ # The tiers are as follows:
167+ # 1. Super Large Ecosystem (~5M packages): 2^10 = 1,024 git repositories
168+ # 2. Large Ecosystem (~500K packages): 2^7 = 128 git repositories
169+ # 3. Medium Ecosystem (~50K packages): 2^5 = 32 git repositories
170+ # 4. Small Ecosystem (~2K packages): 2^0 = 1 git repository
171+ # See https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
172+ BIT_COUNT_BY_ECOSYSTEM = {
173+ # Super Large Ecosystem
174+ "github" : 10 ,
175+ "npm" : 10 ,
176+ # Large Ecosystem
177+ "golang" : 7 ,
178+ "maven" : 7 ,
179+ "nuget" : 7 ,
180+ "perl" : 7 ,
181+ "php" : 7 ,
182+ "pypi" : 7 ,
183+ "ruby" : 7 ,
184+ # Medium Ecosystem
185+ "alpm" : 5 ,
186+ "bitbucket" : 5 ,
187+ "cocoapods" : 5 ,
188+ "composer" : 5 ,
189+ "deb" : 5 ,
190+ "docker" : 5 ,
191+ "generic" : 5 ,
192+ "huggingface" : 5 ,
193+ "mlflow" : 5 ,
194+ "pub" : 5 ,
195+ "rpm" : 5 ,
196+ # Small Ecosystem
197+ "bitnami" : 0 ,
198+ "cargo" : 0 ,
199+ "conan" : 0 ,
200+ "conda" : 0 ,
201+ "cpan" : 0 ,
202+ "cran" : 0 ,
203+ "gem" : 0 ,
204+ "hackage" : 0 ,
205+ "hex" : 0 ,
206+ "luarocks" : 0 ,
207+ "swift" : 0 ,
208+ }
209+
210+
165211def package_path_elements (purl : Union [PackageURL , str ]):
166212 """
167213 Return 4-tuple of POSIX path strings crafted from the ``purl`` package PURL string or object.
@@ -199,7 +245,7 @@ def package_path_elements(purl: Union[PackageURL, str]):
199245 sbom.spdx.2.2.json : a SPDX SBOM
200246 .... other files
201247
202- <extra_path> : one sub directory for each quote-encoded <qualifiers#supath > if any
248+ <extra_path> : one sub directory for each quote-encoded <qualifiers#subpath > if any
203249 metadata.yml : ABOUT YAML file with package origin and license metadata for this version
204250 scancode-scan.yml : a scancode scan for this package version
205251 foo-scan.yml : a scan for this package version created with tool foo
@@ -233,7 +279,8 @@ def package_path_elements(purl: Union[PackageURL, str]):
233279 if isinstance (purl , str ):
234280 purl = PackageURL .from_string (purl )
235281
236- purl_hash = get_purl_hash (purl )
282+ bit_count = BIT_COUNT_BY_ECOSYSTEM .get (purl .type , 0 )
283+ purl_hash = get_purl_hash (purl = purl , _bit_count = bit_count )
237284
238285 if ns := purl .namespace :
239286 ns_name = f"{ ns } /{ purl .name } "
@@ -290,37 +337,17 @@ def get_core_purl(purl: Union[PackageURL, str]):
290337 return PackageURL (** purld )
291338
292339
293- # See https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
294- BIT_COUNT_BY_ECOSYSTEM = {
295- # Super large ecosystem 1024 repos.
296- "npm" : 10 ,
297- # Large ecosystem 128 repos.
298- "pypi" : 7 ,
299- "maven" : 7 ,
300- "golang" : 7 ,
301- "perl" : 7 ,
302- "ruby" : 7 ,
303- "nuget" : 7 ,
304- "php" : 7 ,
305- # Medium ecosystem 32 repos.
306- "rpm" : 5 ,
307- "deb" : 5 ,
308- # Small ecosystem 1 repo.
309- "github" : 0 ,
310- }
311-
312-
313340def get_purl_hash (purl : Union [PackageURL , str ], _bit_count : int = 0 ) -> str :
314341 """
315342 Return a short lower cased hash string from a ``purl`` string or object. The PURL is normalized
316343 and we drop its version, qualifiers and subpath.
317344
318- This function takes a normalized PURL string and a ``_bit_count`` argument defaulting to 13 bits
319- which represents 2**13 = 8192 possible hash values . It returns a fixed length short hash string
345+ This function takes a normalized PURL string and a ``_bit_count`` argument defaulting to 0 bits
346+ which represents 2**0 = 1 possible hash value . It returns a fixed length short hash string
320347 that is left-padded with zeros.
321348
322349 The hash length is derived from the bit_count and the number of bits-per-byte stored in an hex
323- encoding of this bits count. For 13 bits, this means up to 4 characters.
350+ encoding of this bits count. For 10 bits, this means up to 3 characters.
324351
325352 The function is carefully designed to be portable across tech stacks and easy to implement in
326353 many programming languages:
@@ -342,36 +369,31 @@ def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 0) -> str:
342369 For example::
343370
344371 The hash does not change with version or qualifiers::
345- >>> get_purl_hash("pkg:pypi/univers@30.12.0")
372+ >>> get_purl_hash("pkg:pypi/univers@30.12.0", 7 )
346373 '09'
347- >>> get_purl_hash("pkg:pypi/univers@10.12.0")
374+ >>> get_purl_hash("pkg:pypi/univers@10.12.0", 7 )
348375 '09'
349- >>> get_purl_hash("pkg:pypi/univers@30.12.0?foo=bar#sub/path")
376+ >>> get_purl_hash("pkg:pypi/univers@30.12.0?foo=bar#sub/path", 7 )
350377 '09'
351378
352379 The hash is left padded with zeros if it is shorter than the fixed hash length::
353- >>> get_purl_hash("pkg:pypi/expressionss")
380+ >>> get_purl_hash("pkg:pypi/expressionss", 7 )
354381 '57'
355382
356383 We normalize the PURL. Here pypi normalization always uses dash for underscore ::
357384
358- >>> get_purl_hash("pkg:pypi/license_expression")
385+ >>> get_purl_hash("pkg:pypi/license_expression", 7 )
359386 '50'
360- >>> get_purl_hash("pkg:pypi/license-expression")
387+ >>> get_purl_hash("pkg:pypi/license-expression", 7 )
361388 '50'
362389
363390 Originally from:
364391 https://github.com/nexB/purldb/pull/235/files#diff-a1fd023bd42d73f56019d540f38be711255403547add15108540d70f9948dd40R154
365392 """
366393
367- core_purl = get_core_purl (purl )
368-
369- if core_purl .type in BIT_COUNT_BY_ECOSYSTEM :
370- _bit_count = BIT_COUNT_BY_ECOSYSTEM [core_purl .type ]
371-
372- core_purl_str = core_purl .to_string ()
394+ core_purl = get_core_purl (purl ).to_string ()
373395 # compute the hash from a UTF-8 encoded string
374- purl_bytes = core_purl_str .encode ("utf-8" )
396+ purl_bytes = core_purl .encode ("utf-8" )
375397 hash_bytes = sha256 (purl_bytes ).digest ()
376398 # ... converted to integer so we can truncate with modulo. Note that we use big endian.
377399 hash_int = int .from_bytes (hash_bytes , "big" )
0 commit comments