
Commit 02f1e81

Revert "fix: checking if tokenizer is in cache before downloading from HF" (#14808)
1 parent 908c718 commit 02f1e81

File tree

1 file changed: +1, -209 lines changed

python/sglang/srt/utils/hf_transformers_utils.py

Lines changed: 1 addition & 209 deletions
@@ -14,7 +14,6 @@
 """Utilities for Huggingface Transformers."""

 import contextlib
-import glob
 import json
 import logging
 import os
@@ -23,7 +22,6 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Type, Union

-import huggingface_hub
 import torch
 from huggingface_hub import snapshot_download

@@ -69,14 +67,7 @@
 from sglang.srt.configs.internvl import InternVLChatConfig
 from sglang.srt.connector import create_remote_connector
 from sglang.srt.multimodal.customized_mm_processor_utils import _CUSTOMIZED_MM_PROCESSOR
-from sglang.srt.utils import (
-    find_local_repo_dir,
-    is_remote_url,
-    logger,
-    lru_cache_frozenset,
-    mistral_utils,
-)
-from sglang.utils import is_in_ci
+from sglang.srt.utils import is_remote_url, logger, lru_cache_frozenset, mistral_utils

 _CONFIG_REGISTRY: List[Type[PretrainedConfig]] = [
     ChatGLMConfig,
@@ -408,197 +399,12 @@ def get_context_length(config):
 _FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"


-def _validate_tokenizer_file(file_path: str) -> bool:
-    """
-    Validate that a tokenizer file is readable and not corrupted.
-
-    Args:
-        file_path: Path to the tokenizer file
-
-    Returns:
-        True if the file is valid, False if corrupted
-    """
-    try:
-        # For JSON files, validate they're parseable
-        if file_path.endswith(".json"):
-            with open(file_path, "r") as f:
-                json.load(f)
-            return True
-        # For .model files (SentencePiece), just check readability
-        elif file_path.endswith(".model"):
-            with open(file_path, "rb") as f:
-                # Read first few bytes to verify file is readable
-                _ = f.read(100)
-            return True
-        # For other files, just check they exist and are readable
-        else:
-            with open(file_path, "rb") as f:
-                _ = f.read(100)
-            return True
-    except Exception as e:
-        logger.warning(
-            "Corrupted tokenizer file detected: %s - %s: %s",
-            file_path,
-            type(e).__name__,
-            str(e),
-        )
-        return False
-
-
-def find_local_tokenizer_snapshot_dir(
-    model_name_or_path: str,
-    cache_dir: Optional[str],
-    allow_patterns: List[str],
-    revision: Optional[str] = None,
-) -> Optional[str]:
-    """If the tokenizer files are already local, skip downloading and return the path.
-    Only applied in CI.
-    """
-    if not is_in_ci():
-        return None
-
-    if os.path.isdir(model_name_or_path):
-        logger.info(
-            "Tokenizer path %s is already a local directory, skipping cache check",
-            model_name_or_path,
-        )
-        return None
-
-    logger.info("Checking for cached tokenizer: %s", model_name_or_path)
-    found_local_snapshot_dir = None
-
-    # Check custom cache_dir (if provided)
-    if cache_dir:
-        try:
-            repo_folder = os.path.join(
-                cache_dir,
-                huggingface_hub.constants.REPO_ID_SEPARATOR.join(
-                    ["models", *model_name_or_path.split("/")]
-                ),
-            )
-            rev_to_use = revision
-            if not rev_to_use:
-                ref_main = os.path.join(repo_folder, "refs", "main")
-                if os.path.isfile(ref_main):
-                    with open(ref_main) as f:
-                        rev_to_use = f.read().strip()
-            if rev_to_use:
-                rev_dir = os.path.join(repo_folder, "snapshots", rev_to_use)
-                if os.path.isdir(rev_dir):
-                    found_local_snapshot_dir = rev_dir
-        except Exception as e:
-            logger.warning(
-                "Failed to find local snapshot in custom cache_dir %s: %s",
-                cache_dir,
-                e,
-            )
-
-    # Check default HF cache as well
-    if not found_local_snapshot_dir:
-        try:
-            rev_dir = find_local_repo_dir(model_name_or_path, revision)
-            if rev_dir and os.path.isdir(rev_dir):
-                found_local_snapshot_dir = rev_dir
-        except Exception as e:
-            logger.warning("Failed to find local snapshot in default HF cache: %s", e)
-
-    # If local snapshot exists, validate it contains at least one tokenizer file
-    # matching allow_patterns before skipping download.
-    if found_local_snapshot_dir is None:
-        return None
-
-    # Layer 0: Check for incomplete files (corruption indicator)
-    repo_folder = os.path.abspath(os.path.join(found_local_snapshot_dir, "..", ".."))
-    blobs_dir = os.path.join(repo_folder, "blobs")
-    if os.path.isdir(blobs_dir) and glob.glob(os.path.join(blobs_dir, "*.incomplete")):
-        logger.info(
-            "Found .incomplete files in %s for %s. Considering local snapshot incomplete.",
-            blobs_dir,
-            model_name_or_path,
-        )
-        return None
-
-    local_tokenizer_files: List[str] = []
-    try:
-        for pattern in allow_patterns:
-            matched_files = glob.glob(os.path.join(found_local_snapshot_dir, pattern))
-            for f in matched_files:
-                # Layer 1: Check symlink target exists (broken symlink check)
-                if not os.path.exists(f):
-                    continue
-                # Layer 2: Validate file content is not corrupted
-                if not _validate_tokenizer_file(f):
-                    logger.info(
-                        "Found corrupted tokenizer file %s for %s. Will re-download.",
-                        f,
-                        model_name_or_path,
-                    )
-                    return None
-                local_tokenizer_files.append(f)
-    except Exception as e:
-        logger.warning(
-            "Failed to scan local snapshot %s with patterns %s: %s",
-            found_local_snapshot_dir,
-            allow_patterns,
-            e,
-        )
-        local_tokenizer_files = []
-
-    if len(local_tokenizer_files) > 0:
-        logger.info(
-            "Found local HF snapshot for tokenizer %s at %s; skipping download.",
-            model_name_or_path,
-            found_local_snapshot_dir,
-        )
-        return found_local_snapshot_dir
-    else:
-        logger.info(
-            "Local HF snapshot at %s has no files matching %s; will attempt download.",
-            found_local_snapshot_dir,
-            allow_patterns,
-        )
-        return None
-
-
 # Filter warnings like: https://github.com/sgl-project/sglang/issues/8082
 class TokenizerWarningsFilter(logging.Filter):
     def filter(self, record: logging.LogRecord) -> bool:
         return "Calling super().encode with" not in record.getMessage()


-def _check_tokenizer_cache(
-    tokenizer_name: str,
-    cache_dir: Optional[str],
-    revision: Optional[str],
-    include_processor_files: bool = False,
-) -> str:
-    """Check local cache for tokenizer files and return local path if found.
-
-    Args:
-        tokenizer_name: Model name or path
-        cache_dir: Optional custom cache directory
-        revision: Optional model revision
-        include_processor_files: Whether to include processor-specific files (*.py, preprocessor_config.json)
-
-    Returns:
-        Local path if found in cache, otherwise returns original tokenizer_name
-    """
-    allow_patterns = [
-        "*.json",
-        "*.model",
-        "*.txt",
-        "tokenizer.model",
-        "tokenizer_config.json",
-    ]
-    if include_processor_files:
-        allow_patterns.extend(["*.py", "preprocessor_config.json"])
-
-    local_path = find_local_tokenizer_snapshot_dir(
-        tokenizer_name, cache_dir, allow_patterns, revision
-    )
-    return local_path if local_path is not None else tokenizer_name
-
-
 def get_tokenizer(
     tokenizer_name: str,
     *args,
@@ -635,11 +441,6 @@ def get_tokenizer(
         client.pull_files(ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
         tokenizer_name = client.get_local_dir()

-    # Check if tokenizer files are already in local cache (CI only)
-    tokenizer_name = _check_tokenizer_cache(
-        tokenizer_name, kwargs.get("cache_dir"), tokenizer_revision
-    )
-
     try:
         tokenizer = AutoTokenizer.from_pretrained(
             tokenizer_name,
@@ -706,15 +507,6 @@ def get_processor(
 ):
     # pop 'revision' from kwargs if present.
     revision = kwargs.pop("revision", tokenizer_revision)
-
-    # Check if processor/tokenizer files are already in local cache (CI only)
-    tokenizer_name = _check_tokenizer_cache(
-        tokenizer_name,
-        kwargs.get("cache_dir"),
-        revision,
-        include_processor_files=True,
-    )
-
     if "mistral-large-3" in str(tokenizer_name).lower():
         config = _load_mistral_large_3_for_causal_LM(
             tokenizer_name,