add model: qwen2-audio #7596
Merged
Changes from all commits (6 commits)
- 194c071 add basic qwen2 audio support (leng-yue)
- 5e0ec34 support qwen2-audio (leng-yue)
- 31a8ae6 Merge branch 'main' into add-qwen2-audio (leng-yue)
- 5bb4cfa cleanup unused code (leng-yue)
- 9c4f886 fix potential bug in feature lens (leng-yue)
- bf8a4be fix import (leng-yue)
94 changes: 94 additions & 0 deletions
python/sglang/srt/managers/multimodal_processors/qwen_audio.py
```python
import re
from typing import List, Union

import torch

from sglang.srt.managers.multimodal_processors.base_processor import (
    BaseMultimodalProcessor,
    MultimodalSpecialTokens,
)
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration


class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
    models = [Qwen2AudioForConditionalGeneration]

    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)
        # One placeholder per audio clip; the regex also matches clips that
        # have already been expanded to multiple <|AUDIO|> tokens.
        self.AUDIO_TOKEN = "<|audio_bos|><|AUDIO|><|audio_eos|>"
        self.AUDIO_TOKEN_REGEX = re.compile(
            r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>"
        )

    async def process_mm_data_async(
        self,
        image_data: List[Union[str, bytes]],
        input_text,
        request_obj,
        max_req_input_len,
        **kwargs,
    ):
        audio_data = request_obj.audio_data
        if not isinstance(audio_data, list):
            audio_data = [audio_data]

        base_output = self.load_mm_data(
            prompt=input_text,
            max_req_input_len=max_req_input_len,
            audio_data=audio_data,
            multimodal_tokens=MultimodalSpecialTokens(
                audio_token=self.AUDIO_TOKEN,
                audio_token_regex=self.AUDIO_TOKEN_REGEX,
            ),
        )
        if base_output is None:
            return None

        res = self.process_mm_data(
            input_text=base_output.input_text,
            audio=base_output.audios,
        )

        # Collect special token ids
        tokenizer = self._processor.tokenizer
        audio_start_id = tokenizer.convert_tokens_to_ids("<|audio_bos|>")
        audio_token_id = tokenizer.convert_tokens_to_ids("<|AUDIO|>")
        audio_end_id = tokenizer.convert_tokens_to_ids("<|audio_eos|>")

        items = []
        input_ids = res["input_ids"].flatten()

        if (
            "input_features" in res
            and res["input_features"] is not None
            and len(res["input_features"]) != 0
        ):
            if audio_start_id is not None and audio_end_id is not None:
                audio_offsets = self.get_mm_items_offset_by_pair(
                    input_ids=input_ids,
                    mm_start_id=audio_start_id,
                    mm_end_id=audio_end_id,
                )
            else:
                audio_offsets = None

            # Map raw mel-frame counts to encoder output lengths: the conv
            # stem halves the sequence, then the pooling stage halves it again.
            input_lengths = res["feature_attention_mask"].sum(dim=-1)
            input_lengths = (input_lengths - 1) // 2 + 1
            output_lengths = (input_lengths - 2) // 2 + 1

            item = MultimodalDataItem(
                audio_features=res["input_features"],
                audio_feature_lens=output_lengths,
                audio_offsets=audio_offsets,
                modality=Modality.AUDIO,
            )
            items += [item]

        return {
            "mm_items": items,
            "input_ids": input_ids.tolist(),
            "audio_start_id": audio_start_id,
            "audio_token_id": audio_token_id,
            "audio_end_id": audio_end_id,
        }
```
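A note on the length arithmetic above: the two formulas follow the Whisper-style audio encoder that Qwen2-Audio uses, where a stride-2 convolution in the stem roughly halves the mel-frame count and a stride-2 pooling stage halves it again, so each clip contributes `output_lengths[i]` embedding vectors. A minimal sketch of the computation, using made-up mel-frame counts rather than values from this PR:

```python
import torch

# Hypothetical per-clip mel-frame counts, as would come from
# feature_attention_mask.sum(dim=-1).
input_lengths = torch.tensor([3000, 1500, 748])

# Stride-2 conv stem (kernel 3, padding 1): floor((l - 1) / 2) + 1.
conv_lengths = (input_lengths - 1) // 2 + 1
# Stride-2 pooling (kernel 2): floor((l - 2) / 2) + 1.
output_lengths = (conv_lengths - 2) // 2 + 1

print(conv_lengths.tolist())    # [1500, 750, 374]
print(output_lengths.tolist())  # [750, 375, 187]
```

Each clip should therefore be expanded to exactly `output_lengths[i]` `<|AUDIO|>` placeholder tokens in the prompt, which is what the offset pairing on `<|audio_bos|>`/`<|audio_eos|>` relies on.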
200 changes: 200 additions & 0 deletions
python/sglang/srt/models/qwen2_audio.py
```python
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/1d45d90e5d1552eccb6d8cc9b7bba283ccefb808/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
import logging
import math
from functools import lru_cache, partial
from typing import Any, Iterable, List, Optional, Tuple, Type, TypedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from transformers import AutoTokenizer, Qwen2AudioEncoderConfig, Qwen2Config
from transformers.activations import ACT2FN
from transformers.models.qwen2_audio.configuration_qwen2_audio import Qwen2AudioConfig
from transformers.models.qwen2_audio.modeling_qwen2_audio import (
    Qwen2AudioEncoder,
    Qwen2AudioMultiModalProjector,
)

from sglang.srt.hf_transformers_utils import get_processor
from sglang.srt.layers.activation import QuickGELU
from sglang.srt.layers.attention.vision import VisionAttention
from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.pooler import Pooler, PoolingType
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.utils import get_layer_id
from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
from sglang.srt.managers.mm_utils import (
    MultiModalityDataPaddingPatternMultimodalTokens,
    general_mm_embed_routine,
)
from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.qwen2 import Qwen2ForCausalLM
from sglang.srt.utils import add_prefix

logger = logging.getLogger(__name__)


class Qwen2AudioForConditionalGeneration(nn.Module):
    # BitsAndBytes-specific attributes
    default_bitsandbytes_target_modules = [
        ".gate_proj.",
        ".down_proj.",
        ".up_proj.",
        ".q_proj.",
        ".k_proj.",
        ".v_proj.",
        ".o_proj.",
    ]
    bitsandbytes_stacked_params_mapping = {
        # shard_name, weight_name, index
        "q_proj": ("qkv_proj", 0),
        "k_proj": ("qkv_proj", 1),
        "v_proj": ("qkv_proj", 2),
        "gate_proj": ("gate_up_proj", 0),
        "up_proj": ("gate_up_proj", 1),
    }

    def __init__(
        self,
        config: Qwen2AudioConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()

        self.config = config

        if getattr(self.config, "audio_config", None) is None:
            self.config.audio_config = Qwen2AudioEncoderConfig(
                self.config._name_or_path
            )

        if getattr(self.config, "text_config", None) is None:
            self.config.text_config = Qwen2Config(self.config._name_or_path)

        self.audio_tower = Qwen2AudioEncoder(
            config.audio_config,
        )
        self.multi_modal_projector = Qwen2AudioMultiModalProjector(config)
        self.language_model = Qwen2ForCausalLM(
            config.text_config, quant_config, prefix=add_prefix("model", prefix)
        )

    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
        # Get the placeholder token id used to mark audio positions
        audio_token_id: int = getattr(
            mm_inputs, "audio_token_id", mm_inputs.im_token_id
        )

        pattern = MultiModalityDataPaddingPatternMultimodalTokens([audio_token_id])
        return pattern.pad_input_tokens(input_ids, mm_inputs)

    def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
        # Extract audio features from input items
        input_features = torch.cat([item.audio_features for item in items], dim=0).type(
            self.audio_tower.dtype
        )

        audio_embeds = self.audio_tower(input_features).last_hidden_state
        audio_embeds = self.multi_modal_projector(audio_embeds)

        # Trim each padded clip back to its true length before concatenating,
        # so the flattened embeddings match the placeholder token layout.
        audio_feature_lens = torch.cat([item.audio_feature_lens for item in items])
        new_embeds = []
        for i, d in zip(audio_feature_lens, audio_embeds):
            new_embeds.append(d[: i.item()])

        return torch.cat(new_embeds, dim=0)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        **kwargs: Any,
    ) -> torch.Tensor:
        hidden_states = general_mm_embed_routine(
            input_ids=input_ids,
            forward_batch=forward_batch,
            language_model=self.language_model,
            audio_data_embedding_func=self.get_audio_feature,
            positions=positions,
        )

        return hidden_states

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters(remove_duplicate=False))

        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                # Models trained using ColossalAI may include these tensors in
                # the checkpoint. Skip them.
                continue

            if self.config.text_config.tie_word_embeddings and "lm_head.weight" in name:
                continue

            for param_name, weight_name, shard_id in stacked_params_mapping:
                # The audio tower keeps HF's unfused layout, so only remap
                # language-model weights onto the fused parameters.
                if weight_name not in name or "audio_tower" in name:
                    continue
                name_tmp = name.replace(weight_name, param_name)

                # Skip loading extra bias for GPTQ models.
                if name_tmp.endswith(".bias") and name_tmp not in params_dict:
                    continue
                param = params_dict[name_tmp]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                try:
                    # Skip loading extra bias for GPTQ models.
                    if name.endswith(".bias") and name not in params_dict:
                        continue
                    param = params_dict[name]
                except KeyError:
                    print(params_dict.keys())
                    raise

                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)


EntryClass = Qwen2AudioForConditionalGeneration
```
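`get_audio_feature` runs the encoder on a padded batch and then trims each clip back to its true length before concatenating, so the flattened embeddings line up one-to-one with the `<|AUDIO|>` placeholders. A self-contained sketch of that trimming step, with made-up shapes rather than real encoder outputs:

```python
import torch

# Hypothetical projected encoder output: 2 clips padded to 750 frames,
# hidden size 4 for readability.
audio_embeds = torch.randn(2, 750, 4)
# True per-clip lengths, as carried in audio_feature_lens.
audio_feature_lens = torch.tensor([750, 187])

# Keep only the valid prefix of each clip, then flatten across clips so the
# result matches the total number of <|AUDIO|> tokens in the batch.
new_embeds = [d[: i.item()] for i, d in zip(audio_feature_lens, audio_embeds)]
flat = torch.cat(new_embeds, dim=0)
print(flat.shape)  # torch.Size([937, 4]); 937 = 750 + 187
```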
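Similarly, the `load_weights` loop folds HuggingFace's separate q/k/v and gate/up projections into SGLang's fused `qkv_proj` and `gate_up_proj` parameters while leaving the audio tower in its unfused layout. A standalone sketch of just the name-remapping logic; the checkpoint names below are illustrative, not taken from a real checkpoint:

```python
stacked_params_mapping = [
    # (param_name, shard_name, shard_id)
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
    ("gate_up_proj", "gate_proj", 0),
    ("gate_up_proj", "up_proj", 1),
]

def remap(name: str):
    """Return (fused_param_name, shard_id), or None if the weight loads as-is."""
    for param_name, shard_name, shard_id in stacked_params_mapping:
        # Audio-tower weights keep HF's unfused layout, so they are excluded.
        if shard_name in name and "audio_tower" not in name:
            return name.replace(shard_name, param_name), shard_id
    return None

# Language-model attention weight: folded into the fused qkv parameter.
print(remap("language_model.model.layers.0.self_attn.q_proj.weight"))
# -> ('language_model.model.layers.0.self_attn.qkv_proj.weight', 'q')

# Audio-tower weight: left untouched and loaded directly.
print(remap("audio_tower.layers.0.self_attn.q_proj.weight"))
# -> None
```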