From ac06779feeb40d2087817945bd3c0a2538487fcf Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Wed, 17 Sep 2025 23:43:33 +0000 Subject: [PATCH 1/2] Fix model cards and modalities in toctree --- docs/source/en/_toctree.yml | 28 ++++++++--------- docs/source/en/model_doc/bert-generation.md | 1 + docs/source/en/model_doc/hunyuan_v1_dense.md | 1 + docs/source/en/model_doc/hunyuan_v1_moe.md | 1 + docs/source/en/model_doc/longcat_flash.md | 5 ++- docs/source/en/model_doc/ministral.md | 1 + docs/source/en/model_doc/olmo3.md | 11 ++++--- docs/source/en/model_doc/ovis2.md | 1 + docs/source/en/model_doc/qwen3_next.md | 12 ++++--- docs/source/en/model_doc/qwen3_vl.md | 2 +- docs/source/en/model_doc/qwen3_vl_moe.md | 2 +- docs/source/en/model_doc/seed_oss.md | 33 +++++++++++--------- docs/source/en/model_doc/vaultgemma.md | 3 +- 13 files changed, 56 insertions(+), 45 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 65411024d4a3..432da370a9c6 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -441,6 +441,8 @@ title: DeBERTa - local: model_doc/deberta-v2 title: DeBERTa-v2 + - local: model_doc/deepseek_v2 + title: DeepSeek-V2 - local: model_doc/deepseek_v3 title: DeepSeek-V3 - local: model_doc/dialogpt @@ -761,12 +763,6 @@ title: D-FINE - local: model_doc/dab-detr title: DAB-DETR - - local: model_doc/deepseek_v2 - title: DeepSeek-V2 - - local: model_doc/deepseek_vl - title: DeepseekVL - - local: model_doc/deepseek_vl_hybrid - title: DeepseekVLHybrid - local: model_doc/deformable_detr title: Deformable DETR - local: model_doc/deit @@ -849,10 +845,16 @@ title: RT-DETR - local: model_doc/rt_detr_v2 title: RT-DETRv2 + - local: model_doc/sam2 + title: SAM2 - local: model_doc/segformer title: SegFormer - local: model_doc/seggpt title: SegGpt + - local: model_doc/sam + title: Segment Anything + - local: model_doc/sam_hq + title: Segment Anything High Quality - local: model_doc/superglue title: SuperGlue - local: 
model_doc/superpoint @@ -975,6 +977,8 @@ title: XLSR-Wav2Vec2 title: Audio models - sections: + - local: model_doc/sam2_video + title: SAM2 Video - local: model_doc/timesformer title: TimeSformer - local: model_doc/vjepa2 @@ -1019,6 +1023,10 @@ title: ColQwen2 - local: model_doc/data2vec title: Data2Vec + - local: model_doc/deepseek_vl + title: DeepseekVL + - local: model_doc/deepseek_vl_hybrid + title: DeepseekVLHybrid - local: model_doc/deplot title: DePlot - local: model_doc/donut @@ -1137,14 +1145,6 @@ title: Qwen3VL - local: model_doc/qwen3_vl_moe title: Qwen3VLMoe - - local: model_doc/sam2 - title: SAM2 - - local: model_doc/sam2_video - title: SAM2 Video - - local: model_doc/sam - title: Segment Anything - - local: model_doc/sam_hq - title: Segment Anything High Quality - local: model_doc/shieldgemma2 title: ShieldGemma2 - local: model_doc/siglip diff --git a/docs/source/en/model_doc/bert-generation.md b/docs/source/en/model_doc/bert-generation.md index 38cbe2137eb7..b5be3458db7d 100644 --- a/docs/source/en/model_doc/bert-generation.md +++ b/docs/source/en/model_doc/bert-generation.md @@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> +*This model was released on 2019-07-29 and added to Hugging Face Transformers on 2020-11-16.*
diff --git a/docs/source/en/model_doc/hunyuan_v1_dense.md b/docs/source/en/model_doc/hunyuan_v1_dense.md index f87ca422c8ed..520c68b7fd9d 100644 --- a/docs/source/en/model_doc/hunyuan_v1_dense.md +++ b/docs/source/en/model_doc/hunyuan_v1_dense.md @@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-08-22.* # HunYuanDenseV1 diff --git a/docs/source/en/model_doc/hunyuan_v1_moe.md b/docs/source/en/model_doc/hunyuan_v1_moe.md index c66846cc0881..36a53742715d 100644 --- a/docs/source/en/model_doc/hunyuan_v1_moe.md +++ b/docs/source/en/model_doc/hunyuan_v1_moe.md @@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-08-22.* # HunYuanMoEV1 diff --git a/docs/source/en/model_doc/longcat_flash.md b/docs/source/en/model_doc/longcat_flash.md index b2c2d7a00646..d9a9a4a7f603 100644 --- a/docs/source/en/model_doc/longcat_flash.md +++ b/docs/source/en/model_doc/longcat_flash.md @@ -16,8 +16,7 @@ limitations under the License. ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer. 
--> -*This model was released on 2025-09-01 and added to Hugging Face Transformers on 2025-09-15.* - +*This model was released on 2025-09-01 and added to Hugging Face Transformers on 2025-09-17.* # LongCatFlash @@ -70,7 +69,7 @@ outputs = model.generate(inputs, max_new_tokens=30) print(tokenizer.batch_decode(outputs)) ``` -To run with TP, you will need torchrun: +To run with TP, you will need torchrun: ```bash torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 | 1 --rdzv-id --rdzv-backend c10d --rdzv-endpoint $NODE_ID:$NODE_PORT --log-dir ./logs_longcat launch_longcat.py diff --git a/docs/source/en/model_doc/ministral.md b/docs/source/en/model_doc/ministral.md index 07692c6163e5..13b6f3d6c04b 100644 --- a/docs/source/en/model_doc/ministral.md +++ b/docs/source/en/model_doc/ministral.md @@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-09-11.*
diff --git a/docs/source/en/model_doc/olmo3.md b/docs/source/en/model_doc/olmo3.md index e320181925ca..8e88a175d463 100644 --- a/docs/source/en/model_doc/olmo3.md +++ b/docs/source/en/model_doc/olmo3.md @@ -16,7 +16,8 @@ limitations under the License. ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer. --> -*This model was released on {release_date} and added to Hugging Face Transformers on 2025-09-08.* +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-09-16.* +
PyTorch @@ -46,7 +47,7 @@ pipe = pipeline( dtype=torch.bfloat16, device=0, ) - + result = pipe("Plants create energy through a process known as") print(result) ``` @@ -119,11 +120,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) ## Notes -- Load specific intermediate checkpoints by adding the `revision` parameter to [`~PreTrainedModel.from_pretrained`]. +- Load specific intermediate checkpoints by adding the `revision` parameter to [`~PreTrainedModel.from_pretrained`]. ```py from transformers import AutoModelForCausalLM - + model = AutoModelForCausalLM.from_pretrained("allenai/TBA", revision="stage1-step140000-tokens294B") ``` @@ -144,4 +145,4 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) ## Olmo3PreTrainedModel [[autodoc]] Olmo3PreTrainedModel - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/ovis2.md b/docs/source/en/model_doc/ovis2.md index ab1d761f19ed..342e34ef7a1b 100644 --- a/docs/source/en/model_doc/ovis2.md +++ b/docs/source/en/model_doc/ovis2.md @@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> +*This model was released on 2024-05-31 and added to Hugging Face Transformers on 2025-08-18.* # Ovis2 diff --git a/docs/source/en/model_doc/qwen3_next.md b/docs/source/en/model_doc/qwen3_next.md index f2e003182ee7..737934136099 100644 --- a/docs/source/en/model_doc/qwen3_next.md +++ b/docs/source/en/model_doc/qwen3_next.md @@ -13,18 +13,20 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-09-10.* + ## Overview -The Qwen3-Next series represents our next-generation foundation models, optimized for extreme context length and large-scale parameter efficiency. 
+The Qwen3-Next series represents our next-generation foundation models, optimized for extreme context length and large-scale parameter efficiency. The series introduces a suite of architectural innovations designed to maximize performance while minimizing computational cost: -- **Hybrid Attention**: Replaces standard attention with the combination of **Gated DeltaNet** and **Gated Attention**, enabling efficient context modeling. +- **Hybrid Attention**: Replaces standard attention with the combination of **Gated DeltaNet** and **Gated Attention**, enabling efficient context modeling. - **High-Sparsity MoE**: Achieves an extreme low activation ratio as 1:50 in MoE layers — drastically reducing FLOPs per token while preserving model capacity. - **Multi-Token Prediction(MTP)**: Boosts pretraining model performance, and accelerates inference. -- **Other Optimizations**: Includes techniques such as **zero-centered and weight-decayed layernorm**, **Gated Attention**, and other stabilizing enhancements for robust training. +- **Other Optimizations**: Includes techniques such as **zero-centered and weight-decayed layernorm**, **Gated Attention**, and other stabilizing enhancements for robust training. Built on this architecture, we trained and open-sourced Qwen3-Next-80B-A3B — 80B total parameters, only 3B active — achieving extreme sparsity and efficiency. -Despite its ultra-efficiency, it outperforms Qwen3-32B on downstream tasks — while requiring **less than 1/10 of the training cost**. +Despite its ultra-efficiency, it outperforms Qwen3-32B on downstream tasks — while requiring **less than 1/10 of the training cost**. Moreover, it delivers over **10x higher inference throughput** than Qwen3-32B when handling contexts longer than 32K tokens. For more details, please visit our blog [Qwen3-Next](qwen3_next) ([blog post](https://qwenlm.github.io/blog/qwen3_next/)). 
@@ -60,7 +62,7 @@ generated_ids = model.generate( **model_inputs, max_new_tokens=512 ) -output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() +output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() content = tokenizer.decode(output_ids, skip_special_tokens=True) diff --git a/docs/source/en/model_doc/qwen3_vl.md b/docs/source/en/model_doc/qwen3_vl.md index 9e90363a1eba..c939d5da3cd9 100644 --- a/docs/source/en/model_doc/qwen3_vl.md +++ b/docs/source/en/model_doc/qwen3_vl.md @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> -*This model was released on None and added to Hugging Face Transformers on 2025-08-16.* +*This model was released on None and added to Hugging Face Transformers on 2025-09-15.*
diff --git a/docs/source/en/model_doc/qwen3_vl_moe.md b/docs/source/en/model_doc/qwen3_vl_moe.md index 76d046efff2d..6e27adf915d3 100644 --- a/docs/source/en/model_doc/qwen3_vl_moe.md +++ b/docs/source/en/model_doc/qwen3_vl_moe.md @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> -*This model was released on None and added to Hugging Face Transformers on 2025-08-17.* +*This model was released on None and added to Hugging Face Transformers on 2025-09-15.*
diff --git a/docs/source/en/model_doc/seed_oss.md b/docs/source/en/model_doc/seed_oss.md index 0f0dacb2be90..dbcddcb5f2c7 100644 --- a/docs/source/en/model_doc/seed_oss.md +++ b/docs/source/en/model_doc/seed_oss.md @@ -1,17 +1,20 @@ - + +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-08-22.* # SeedOss @@ -54,4 +57,4 @@ To be released with the official model launch. ## SeedOssForQuestionAnswering [[autodoc]] SeedOssForQuestionAnswering - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/vaultgemma.md b/docs/source/en/model_doc/vaultgemma.md index c9eb36124fca..94d28cc8afe2 100644 --- a/docs/source/en/model_doc/vaultgemma.md +++ b/docs/source/en/model_doc/vaultgemma.md @@ -16,6 +16,7 @@ limitations under the License. ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer. --> +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-09-12.* # VaultGemma @@ -30,7 +31,7 @@ sequence length. VaultGemma was trained from scratch with sequence-level differential privacy (DP). Its training data includes the same mixture as the [Gemma 2 models](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315), consisting of a number of documents of varying lengths. Additionally, it is trained using -[DP stochastic gradient descent (DP-SGD)](https://arxiv.org/abs/1607.00133) and provides a +[DP stochastic gradient descent (DP-SGD)](https://huggingface.co/papers/1607.00133) and provides a (ε ≤ 2.0, δ ≤ 1.1e-10)-sequence-level DP guarantee, where a sequence consists of 1024 consecutive tokens extracted from heterogeneous data sources. Specifically, the privacy unit of the guarantee is for the sequences after sampling and packing of the mixture. 
From dde0af36ca230de99a0f6184f0ea30c379a8a619 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Thu, 18 Sep 2025 17:51:23 +0000 Subject: [PATCH 2/2] Fix release-date lines in flex_olmo and lfm2_vl model cards --- docs/source/en/model_doc/flex_olmo.md | 2 +- docs/source/en/model_doc/lfm2_vl.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/flex_olmo.md b/docs/source/en/model_doc/flex_olmo.md index b771fe526d06..418a660b6d23 100644 --- a/docs/source/en/model_doc/flex_olmo.md +++ b/docs/source/en/model_doc/flex_olmo.md @@ -16,7 +16,7 @@ limitations under the License. ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer. --> -*This model was released on 2025-07-09 and added to Hugging Face Transformers on 2025-09-15.* +*This model was released on 2025-07-09 and added to Hugging Face Transformers on 2025-09-18.*
PyTorch diff --git a/docs/source/en/model_doc/lfm2_vl.md b/docs/source/en/model_doc/lfm2_vl.md index 1607e3066905..3a93a8189a70 100644 --- a/docs/source/en/model_doc/lfm2_vl.md +++ b/docs/source/en/model_doc/lfm2_vl.md @@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> +*This model was released on {release_date} and added to Hugging Face Transformers on 2025-09-18.*
PyTorch