14 | 14 | """Utilities for Huggingface Transformers.""" |
15 | 15 |
16 | 16 | import contextlib |
17 | | -import glob |
18 | 17 | import json |
19 | 18 | import logging |
20 | 19 | import os |
23 | 22 | from pathlib import Path |
24 | 23 | from typing import Any, Dict, List, Optional, Type, Union |
25 | 24 |
26 | | -import huggingface_hub |
27 | 25 | import torch |
28 | 26 | from huggingface_hub import snapshot_download |
29 | 27 |
69 | 67 | from sglang.srt.configs.internvl import InternVLChatConfig |
70 | 68 | from sglang.srt.connector import create_remote_connector |
71 | 69 | from sglang.srt.multimodal.customized_mm_processor_utils import _CUSTOMIZED_MM_PROCESSOR |
72 | | -from sglang.srt.utils import ( |
73 | | - find_local_repo_dir, |
74 | | - is_remote_url, |
75 | | - logger, |
76 | | - lru_cache_frozenset, |
77 | | - mistral_utils, |
78 | | -) |
79 | | -from sglang.utils import is_in_ci |
| 70 | +from sglang.srt.utils import is_remote_url, logger, lru_cache_frozenset, mistral_utils |
80 | 71 |
81 | 72 | _CONFIG_REGISTRY: List[Type[PretrainedConfig]] = [ |
82 | 73 | ChatGLMConfig, |
@@ -408,197 +399,12 @@ def get_context_length(config): |
408 | 399 | _FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer" |
409 | 400 |
410 | 401 |
411 | | -def _validate_tokenizer_file(file_path: str) -> bool: |
412 | | - """ |
413 | | - Validate that a tokenizer file is readable and not corrupted. |
414 | | -
415 | | - Args: |
416 | | - file_path: Path to the tokenizer file |
417 | | -
418 | | - Returns: |
419 | | - True if the file is valid, False if corrupted |
420 | | - """ |
421 | | - try: |
422 | | - # For JSON files, validate they're parseable |
423 | | - if file_path.endswith(".json"): |
424 | | - with open(file_path, "r") as f: |
425 | | - json.load(f) |
426 | | - return True |
427 | | - # For .model files (SentencePiece), just check readability |
428 | | - elif file_path.endswith(".model"): |
429 | | - with open(file_path, "rb") as f: |
430 | | - # Read first few bytes to verify file is readable |
431 | | - _ = f.read(100) |
432 | | - return True |
433 | | - # For other files, just check they exist and are readable |
434 | | - else: |
435 | | - with open(file_path, "rb") as f: |
436 | | - _ = f.read(100) |
437 | | - return True |
438 | | - except Exception as e: |
439 | | - logger.warning( |
440 | | - "Corrupted tokenizer file detected: %s - %s: %s", |
441 | | - file_path, |
442 | | - type(e).__name__, |
443 | | - str(e), |
444 | | - ) |
445 | | - return False |
446 | | - |
447 | | - |
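The read-a-few-bytes check above only proves the file is readable, not intact. A stricter variant could parse the SentencePiece model outright; a minimal sketch, assuming the optional `sentencepiece` package is installed (the helper name is hypothetical):

    import sentencepiece as spm

    def sentencepiece_model_is_valid(file_path: str) -> bool:
        # Loading the model parses its serialized proto; a parse failure
        # (RuntimeError/OSError) indicates real corruption, not just an
        # unreadable file.
        try:
            spm.SentencePieceProcessor(model_file=file_path)
            return True
        except Exception:
            return False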
448 | | -def find_local_tokenizer_snapshot_dir( |
449 | | - model_name_or_path: str, |
450 | | - cache_dir: Optional[str], |
451 | | - allow_patterns: List[str], |
452 | | - revision: Optional[str] = None, |
453 | | -) -> Optional[str]: |
454 | | - """If the tokenizer files are already local, skip downloading and return the path. |
455 | | - Only applied in CI. |
456 | | - """ |
457 | | - if not is_in_ci(): |
458 | | - return None |
459 | | - |
460 | | - if os.path.isdir(model_name_or_path): |
461 | | - logger.info( |
462 | | - "Tokenizer path %s is already a local directory, skipping cache check", |
463 | | - model_name_or_path, |
464 | | - ) |
465 | | - return None |
466 | | - |
467 | | - logger.info("Checking for cached tokenizer: %s", model_name_or_path) |
468 | | - found_local_snapshot_dir = None |
469 | | - |
470 | | - # Check custom cache_dir (if provided) |
471 | | - if cache_dir: |
472 | | - try: |
473 | | - repo_folder = os.path.join( |
474 | | - cache_dir, |
475 | | - huggingface_hub.constants.REPO_ID_SEPARATOR.join( |
476 | | - ["models", *model_name_or_path.split("/")] |
477 | | - ), |
478 | | - ) |
479 | | - rev_to_use = revision |
480 | | - if not rev_to_use: |
481 | | - ref_main = os.path.join(repo_folder, "refs", "main") |
482 | | - if os.path.isfile(ref_main): |
483 | | - with open(ref_main) as f: |
484 | | - rev_to_use = f.read().strip() |
485 | | - if rev_to_use: |
486 | | - rev_dir = os.path.join(repo_folder, "snapshots", rev_to_use) |
487 | | - if os.path.isdir(rev_dir): |
488 | | - found_local_snapshot_dir = rev_dir |
489 | | - except Exception as e: |
490 | | - logger.warning( |
491 | | - "Failed to find local snapshot in custom cache_dir %s: %s", |
492 | | - cache_dir, |
493 | | - e, |
494 | | - ) |
495 | | - |
496 | | - # Check default HF cache as well |
497 | | - if not found_local_snapshot_dir: |
498 | | - try: |
499 | | - rev_dir = find_local_repo_dir(model_name_or_path, revision) |
500 | | - if rev_dir and os.path.isdir(rev_dir): |
501 | | - found_local_snapshot_dir = rev_dir |
502 | | - except Exception as e: |
503 | | - logger.warning("Failed to find local snapshot in default HF cache: %s", e) |
504 | | - |
505 | | - # If local snapshot exists, validate it contains at least one tokenizer file |
506 | | - # matching allow_patterns before skipping download. |
507 | | - if found_local_snapshot_dir is None: |
508 | | - return None |
509 | | - |
510 | | - # Layer 0: Check for incomplete files (corruption indicator) |
511 | | - repo_folder = os.path.abspath(os.path.join(found_local_snapshot_dir, "..", "..")) |
512 | | - blobs_dir = os.path.join(repo_folder, "blobs") |
513 | | - if os.path.isdir(blobs_dir) and glob.glob(os.path.join(blobs_dir, "*.incomplete")): |
514 | | - logger.info( |
515 | | - "Found .incomplete files in %s for %s. Considering local snapshot incomplete.", |
516 | | - blobs_dir, |
517 | | - model_name_or_path, |
518 | | - ) |
519 | | - return None |
520 | | - |
521 | | - local_tokenizer_files: List[str] = [] |
522 | | - try: |
523 | | - for pattern in allow_patterns: |
524 | | - matched_files = glob.glob(os.path.join(found_local_snapshot_dir, pattern)) |
525 | | - for f in matched_files: |
526 | | - # Layer 1: Check symlink target exists (broken symlink check) |
527 | | - if not os.path.exists(f): |
528 | | - continue |
529 | | - # Layer 2: Validate file content is not corrupted |
530 | | - if not _validate_tokenizer_file(f): |
531 | | - logger.info( |
532 | | - "Found corrupted tokenizer file %s for %s. Will re-download.", |
533 | | - f, |
534 | | - model_name_or_path, |
535 | | - ) |
536 | | - return None |
537 | | - local_tokenizer_files.append(f) |
538 | | - except Exception as e: |
539 | | - logger.warning( |
540 | | - "Failed to scan local snapshot %s with patterns %s: %s", |
541 | | - found_local_snapshot_dir, |
542 | | - allow_patterns, |
543 | | - e, |
544 | | - ) |
545 | | - local_tokenizer_files = [] |
546 | | - |
547 | | - if len(local_tokenizer_files) > 0: |
548 | | - logger.info( |
549 | | - "Found local HF snapshot for tokenizer %s at %s; skipping download.", |
550 | | - model_name_or_path, |
551 | | - found_local_snapshot_dir, |
552 | | - ) |
553 | | - return found_local_snapshot_dir |
554 | | - else: |
555 | | - logger.info( |
556 | | - "Local HF snapshot at %s has no files matching %s; will attempt download.", |
557 | | - found_local_snapshot_dir, |
558 | | - allow_patterns, |
559 | | - ) |
560 | | - return None |
561 | | - |
562 | | - |
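The deleted helper above walks the hub cache layout (`models--{org}--{repo}/refs/<revision>` -> `snapshots/<commit>`) by hand. For reference, `huggingface_hub` exposes the same cache-only resolution through its public API; a minimal sketch (the wrapper name is hypothetical):

    from huggingface_hub import snapshot_download
    from huggingface_hub.utils import LocalEntryNotFoundError

    def cached_snapshot_or_none(repo_id, revision=None, cache_dir=None):
        # local_files_only=True resolves purely from the local cache and
        # raises LocalEntryNotFoundError when no complete snapshot exists.
        try:
            return snapshot_download(
                repo_id,
                revision=revision,
                cache_dir=cache_dir,
                local_files_only=True,
            )
        except LocalEntryNotFoundError:
            return None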
563 | 402 | # Filter warnings like: https://github.com/sgl-project/sglang/issues/8082 |
564 | 403 | class TokenizerWarningsFilter(logging.Filter): |
565 | 404 | def filter(self, record: logging.LogRecord) -> bool: |
566 | 405 | return "Calling super().encode with" not in record.getMessage() |
567 | 406 |
568 | 407 |
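The filter above only takes effect once attached to the logger that emits the message. A minimal sketch of the wiring; the logger name is an assumption based on where the warning in issue #8082 originates:

    import logging

    # Assumed emitter: the transformers tokenizer base module's logger.
    logging.getLogger("transformers.tokenization_utils_base").addFilter(
        TokenizerWarningsFilter()
    )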
569 | | -def _check_tokenizer_cache( |
570 | | - tokenizer_name: str, |
571 | | - cache_dir: Optional[str], |
572 | | - revision: Optional[str], |
573 | | - include_processor_files: bool = False, |
574 | | -) -> str: |
575 | | - """Check local cache for tokenizer files and return local path if found. |
576 | | -
577 | | - Args: |
578 | | - tokenizer_name: Model name or path |
579 | | - cache_dir: Optional custom cache directory |
580 | | - revision: Optional model revision |
581 | | - include_processor_files: Whether to include processor-specific files (*.py, preprocessor_config.json) |
582 | | -
583 | | - Returns: |
584 | | - Local path if found in cache, otherwise returns original tokenizer_name |
585 | | - """ |
586 | | - allow_patterns = [ |
587 | | - "*.json", |
588 | | - "*.model", |
589 | | - "*.txt", |
590 | | - "tokenizer.model", |
591 | | - "tokenizer_config.json", |
592 | | - ] |
593 | | - if include_processor_files: |
594 | | - allow_patterns.extend(["*.py", "preprocessor_config.json"]) |
595 | | - |
596 | | - local_path = find_local_tokenizer_snapshot_dir( |
597 | | - tokenizer_name, cache_dir, allow_patterns, revision |
598 | | - ) |
599 | | - return local_path if local_path is not None else tokenizer_name |
600 | | - |
601 | | - |
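The same allow-patterns idea works directly with `snapshot_download`, which already reuses files that are present and complete in the cache; a sketch using the `_FAST_LLAMA_TOKENIZER` repo defined above:

    from huggingface_hub import snapshot_download

    # Fetch only tokenizer-related files; cached, complete blobs are reused
    # rather than re-downloaded.
    local_dir = snapshot_download(
        "hf-internal-testing/llama-tokenizer",
        allow_patterns=["*.json", "*.model", "*.txt"],
    )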
602 | 408 | def get_tokenizer( |
603 | 409 | tokenizer_name: str, |
604 | 410 | *args, |
@@ -635,11 +441,6 @@ def get_tokenizer( |
635 | 441 | client.pull_files(ignore_pattern=["*.pt", "*.safetensors", "*.bin"]) |
636 | 442 | tokenizer_name = client.get_local_dir() |
637 | 443 |
638 | | - # Check if tokenizer files are already in local cache (CI only) |
639 | | - tokenizer_name = _check_tokenizer_cache( |
640 | | - tokenizer_name, kwargs.get("cache_dir"), tokenizer_revision |
641 | | - ) |
642 | | - |
643 | 444 | try: |
644 | 445 | tokenizer = AutoTokenizer.from_pretrained( |
645 | 446 | tokenizer_name, |
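With the CI-only cache shortcut removed, `get_tokenizer` relies on `AutoTokenizer.from_pretrained` for cache resolution. A sketch of forcing cache-only loads via the standard hub environment variable (an alternative to the deleted helper, not sglang API):

    import os
    from transformers import AutoTokenizer

    # HF_HUB_OFFLINE=1 makes from_pretrained resolve from the local cache
    # only, failing fast instead of hitting the network.
    os.environ["HF_HUB_OFFLINE"] = "1"
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")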
@@ -706,15 +507,6 @@ def get_processor( |
706 | 507 | ): |
707 | 508 | # pop 'revision' from kwargs if present. |
708 | 509 | revision = kwargs.pop("revision", tokenizer_revision) |
709 | | - |
710 | | - # Check if processor/tokenizer files are already in local cache (CI only) |
711 | | - tokenizer_name = _check_tokenizer_cache( |
712 | | - tokenizer_name, |
713 | | - kwargs.get("cache_dir"), |
714 | | - revision, |
715 | | - include_processor_files=True, |
716 | | - ) |
717 | | - |
718 | 510 | if "mistral-large-3" in str(tokenizer_name).lower(): |
719 | 511 | config = _load_mistral_large_3_for_causal_LM( |
720 | 512 | tokenizer_name, |