
vllm.model_executor.models.transformers

Wrapper around transformers models

logger module-attribute

logger = init_logger(__name__)

MultiModalDummyInputsBuilder

Bases: BaseDummyInputsBuilder[MultiModalProcessingInfo]

Source code in vllm/model_executor/models/transformers.py
class MultiModalDummyInputsBuilder(
        BaseDummyInputsBuilder[MultiModalProcessingInfo]):

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_images = mm_counts.get("image", 0)

        processor = self.info.get_hf_processor()
        if "gemma3" in processor.__class__.__name__.lower():
            image_token = processor.boi_token
        else:
            image_token = getattr(processor, "image_token", "")
        return image_token * num_images

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        target_width, target_height = self.info.get_max_image_size()

        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images),
        }

get_dummy_mm_data

get_dummy_mm_data(
    seq_len: int, mm_counts: Mapping[str, int]
) -> MultiModalDataDict
Source code in vllm/model_executor/models/transformers.py
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
    num_images = mm_counts.get("image", 0)

    target_width, target_height = self.info.get_max_image_size()

    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images),
    }

get_dummy_text

get_dummy_text(mm_counts: Mapping[str, int]) -> str
Source code in vllm/model_executor/models/transformers.py
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    num_images = mm_counts.get("image", 0)

    processor = self.info.get_hf_processor()
    if "gemma3" in processor.__class__.__name__.lower():
        image_token = processor.boi_token
    else:
        image_token = getattr(processor, "image_token", "")
    return image_token * num_images

MultiModalProcessingInfo

Bases: BaseProcessingInfo

Source code in vllm/model_executor/models/transformers.py
class MultiModalProcessingInfo(BaseProcessingInfo):

    def get_hf_config(self):
        return self.ctx.model_config.hf_config

    def get_supported_mm_limits(self):
        return {"image": None}

    def get_mm_max_tokens_per_item(self, seq_len, mm_counts):
        return {"image": self.get_max_image_tokens()}

    def get_max_image_tokens(self) -> int:
        width, height = self.get_max_image_size()
        processor = self.get_hf_processor()
        mm_processor_kwargs = self.ctx.model_config.mm_processor_kwargs or {}
        mm_tokens = processor._get_num_multimodal_tokens(
            image_sizes=([height, width], ), **mm_processor_kwargs)
        image_tokens = mm_tokens["num_image_tokens"][0]
        return image_tokens

    def get_max_image_size(self):
        return 10_000, 10_000  # hardcoded to an arbitrarily large size

get_hf_config

get_hf_config()
Source code in vllm/model_executor/models/transformers.py
def get_hf_config(self):
    return self.ctx.model_config.hf_config

get_max_image_size

get_max_image_size()
Source code in vllm/model_executor/models/transformers.py
def get_max_image_size(self):
    return 10_000, 10_000  # hardcoded to an arbitrarily large size

get_max_image_tokens

get_max_image_tokens() -> int
Source code in vllm/model_executor/models/transformers.py
def get_max_image_tokens(self) -> int:
    width, height = self.get_max_image_size()
    processor = self.get_hf_processor()
    mm_processor_kwargs = self.ctx.model_config.mm_processor_kwargs or {}
    mm_tokens = processor._get_num_multimodal_tokens(
        image_sizes=([height, width], ), **mm_processor_kwargs)
    image_tokens = mm_tokens["num_image_tokens"][0]
    return image_tokens
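
The shape of that call, from the outside: a minimal sketch, assuming `processor` is whatever `get_hf_processor()` returns and that it implements the private `_get_num_multimodal_tokens` hook; the returned counts below are made up for illustration.

```python
# Hedged sketch: `processor` is assumed to expose `_get_num_multimodal_tokens`;
# the example return value is illustrative, not a real measurement.
mm_tokens = processor._get_num_multimodal_tokens(image_sizes=([10_000, 10_000], ))
# e.g. {"num_image_tokens": [576], "num_image_patches": [1]}
max_image_tokens = mm_tokens["num_image_tokens"][0]
```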

get_mm_max_tokens_per_item

get_mm_max_tokens_per_item(seq_len, mm_counts)
Source code in vllm/model_executor/models/transformers.py
def get_mm_max_tokens_per_item(self, seq_len, mm_counts):
    return {"image": self.get_max_image_tokens()}

get_supported_mm_limits

get_supported_mm_limits()
Source code in vllm/model_executor/models/transformers.py
def get_supported_mm_limits(self):
    return {"image": None}

MultiModalProcessor

Bases: BaseMultiModalProcessor[MultiModalProcessingInfo]

Source code in vllm/model_executor/models/transformers.py
class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargsItems,
    ):
        """
        Given the original multi-modal items for this modality
        and HF-processed data, output the updates to perform.

        The information returned by this method is used to update token inputs
        which bypass the HF processor. It is also used to update the output of
        HF processor if the HF processor does not apply prompt updates to text
        inputs.

        Moreover, this information is critical to determine the token positions
        in order to construct :class:`~vllm.multimodal.inputs.PlaceholderRange`
        for each multi-modal item.
        """
        return None

    def _get_mm_fields_config(
        self,
        hf_inputs,
        hf_processor_mm_kwargs,
        num_image_patches: Optional[torch.Tensor] = None,
    ):
        # HF Processors always return a mask but vLLM doesn't need it
        hf_inputs.pop("attention_mask", None)
        mm_fields = {
            key: MultiModalFieldConfig.flat_from_sizes("image",
                                                       num_image_patches)
            for key in hf_inputs
        }
        mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes(
            "image", num_image_patches)
        mm_fields["num_image_patches"] = MultiModalFieldConfig.batched("image")
        return mm_fields

    def _apply_hf_processor_text_mm(
        self,
        prompt_text: str,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object],
    ) -> tuple[list[int], BatchFeature, torch.Tensor]:
        """
        Apply the HF processor on the prompt text and multi-modal data
        together.

        In addition, return the multi-modal token type IDs.
        """
        processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
        processor_data["return_mm_token_type_ids"] = True

        processed_data = self._call_hf_processor(
            prompt=prompt_text,
            mm_data=processor_data,
            mm_kwargs=hf_processor_mm_kwargs,
            tok_kwargs=tokenization_kwargs,
        )
        processed_data.update(passthrough_data)

        prompt_ids, = processed_data.pop("input_ids").tolist()
        mm_token_type_ids = processed_data.pop(
            "mm_token_type_ids"
        ) if "mm_token_type_ids" in processed_data else processed_data.pop(
            "token_type_ids")  # for gemma3 only

        return prompt_ids, processed_data, mm_token_type_ids

    def apply(
        self,
        prompt: Union[str, list[int]],
        mm_data: MultiModalDataDict,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Optional[Mapping[str, object]] = None,
    ) -> MultiModalInputs:
        """
        Process multi-modal inputs to be used in vLLM.

        Apply HF Processor on prompt text and multi-modal data together,
        outputting token IDs and processed tensors.
        """
        if tokenization_kwargs is None:
            tokenization_kwargs = {}

        mm_items = self._to_mm_items(mm_data)
        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        if not isinstance(prompt, str):
            # The prompt is a list of token IDs, which the HF processor does
            # not accept, so decode it back into a string first.
            prompt = hf_processor.decode(prompt)

        (prompt_ids, processed_data,
         mm_token_type_ids) = self._apply_hf_processor_text_mm(
             prompt_text=prompt,
             mm_items=mm_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
         )

        # The HF processor will return `mm_token_type_ids`, from which we can
        # infer mm_placeholders. Until then, this is hardcoded to make the code
        # run. Tested on Llava; prompts and `mm_token_type_ids` always have bs=1.
        mm_positions = torch.where(mm_token_type_ids == 1)[1]
        images = mm_items.get_items("image", ImageProcessorItems)
        mm_processor_kwargs = (self.info.ctx.model_config.mm_processor_kwargs
                               or {})
        image_sizes = []
        for item_idx in range(len(images)):
            image_size = images.get_image_size(item_idx)
            image_sizes.append((image_size.height, image_size.width))

        mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens(
            image_sizes=image_sizes, **mm_processor_kwargs)

        mm_placeholders = {}
        split_sizes = mm_tokens_per_modality["num_image_tokens"]
        if split_sizes:
            chunked_mm_positions = torch.split(mm_positions, split_sizes)
            mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0].bool()]
            chunked_mm_tokens = torch.split(mm_tokens, split_sizes)
            ranges = [
                PlaceholderRange(
                    offset=positions[0].item(),
                    length=positions.shape[0],
                    is_embed=(mm_tokens == hf_processor.image_token_id).bool())
                for positions, mm_tokens in zip(chunked_mm_positions,
                                                chunked_mm_tokens)
            ]
            mm_placeholders = {"image": ranges}

        num_image_patches = torch.tensor(
            mm_tokens_per_modality["num_image_patches"]
        ) if "num_image_patches" in mm_tokens_per_modality else None
        processed_data['num_image_patches'] = num_image_patches
        mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
            processed_data,
            self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs,
                                       num_image_patches),
        )

        mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs,
                                        tokenization_kwargs)
        return MultiModalInputs(
            type="multimodal",
            prompt=prompt,
            prompt_token_ids=prompt_ids,
            mm_kwargs=mm_kwargs,
            mm_hashes=mm_hashes,
            mm_placeholders=mm_placeholders,
        )

_apply_hf_processor_text_mm

_apply_hf_processor_text_mm(
    prompt_text: str,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object],
) -> tuple[list[int], BatchFeature, Tensor]

Apply the HF processor on the prompt text and multi-modal data together.

In addition, return the multi-modal token type IDs.

Source code in vllm/model_executor/models/transformers.py
def _apply_hf_processor_text_mm(
    self,
    prompt_text: str,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object],
) -> tuple[list[int], BatchFeature, torch.Tensor]:
    """
    Apply the HF processor on the prompt text and multi-modal data
    together.

    In addition, return the multi-modal token type IDs.
    """
    processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
    processor_data["return_mm_token_type_ids"] = True

    processed_data = self._call_hf_processor(
        prompt=prompt_text,
        mm_data=processor_data,
        mm_kwargs=hf_processor_mm_kwargs,
        tok_kwargs=tokenization_kwargs,
    )
    processed_data.update(passthrough_data)

    prompt_ids, = processed_data.pop("input_ids").tolist()
    mm_token_type_ids = processed_data.pop(
        "mm_token_type_ids"
    ) if "mm_token_type_ids" in processed_data else processed_data.pop(
        "token_type_ids")  # for gemma3 only

    return prompt_ids, processed_data, mm_token_type_ids

_get_mm_fields_config

_get_mm_fields_config(
    hf_inputs,
    hf_processor_mm_kwargs,
    num_image_patches: Optional[Tensor] = None,
)
Source code in vllm/model_executor/models/transformers.py
def _get_mm_fields_config(
    self,
    hf_inputs,
    hf_processor_mm_kwargs,
    num_image_patches: Optional[torch.Tensor] = None,
):
    # HF Processors always return a mask but vLLM doesn't need it
    hf_inputs.pop("attention_mask", None)
    mm_fields = {
        key: MultiModalFieldConfig.flat_from_sizes("image",
                                                   num_image_patches)
        for key in hf_inputs
    }
    mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes(
        "image", num_image_patches)
    mm_fields["num_image_patches"] = MultiModalFieldConfig.batched("image")
    return mm_fields

_get_prompt_updates

_get_prompt_updates(
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargsItems,
)

Given the original multi-modal items for this modality and HF-processed data, output the updates to perform.

The information returned by this method is used to update token inputs which bypass the HF processor. It is also used to update the output of the HF processor if the HF processor does not apply prompt updates to text inputs.

Moreover, this information is critical to determine the token positions in order to construct `vllm.multimodal.inputs.PlaceholderRange` for each multi-modal item.

Source code in vllm/model_executor/models/transformers.py
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargsItems,
):
    """
    Given the original multi-modal items for this modality
    and HF-processed data, output the updates to perform.

    The information returned by this method is used to update token inputs
    which bypass the HF processor. It is also used to update the output of
    HF processor if the HF processor does not apply prompt updates to text
    inputs.

    Moreover, this information is critical to determine the token positions
    in order to construct :class:`~vllm.multimodal.inputs.PlaceholderRange`
    for each multi-modal item.
    """
    return None
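
The placeholder ranges mentioned above are ultimately built in `apply`. As a rough sketch of the metadata produced for a single image (the import path refers to `vllm.multimodal.inputs`; the offset and length values are made up):

```python
# Hedged sketch of one image's placeholder metadata; numbers are illustrative.
import torch

from vllm.multimodal.inputs import PlaceholderRange

image_range = PlaceholderRange(
    offset=5,                                    # position of the first placeholder token
    length=576,                                  # number of placeholder tokens
    is_embed=torch.ones(576, dtype=torch.bool),  # every position receives an embedding
)
```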

apply

apply(
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Optional[
        Mapping[str, object]
    ] = None,
) -> MultiModalInputs

Process multi-modal inputs to be used in vLLM.

Apply HF Processor on prompt text and multi-modal data together, outputting token IDs and processed tensors.

Source code in vllm/model_executor/models/transformers.py
def apply(
    self,
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Optional[Mapping[str, object]] = None,
) -> MultiModalInputs:
    """
    Process multi-modal inputs to be used in vLLM.

    Apply HF Processor on prompt text and multi-modal data together,
    outputting token IDs and processed tensors.
    """
    if tokenization_kwargs is None:
        tokenization_kwargs = {}

    mm_items = self._to_mm_items(mm_data)
    hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
    if not isinstance(prompt, str):
        # The prompt is a list of token IDs, which the HF processor does
        # not accept, so decode it back into a string first.
        prompt = hf_processor.decode(prompt)

    (prompt_ids, processed_data,
     mm_token_type_ids) = self._apply_hf_processor_text_mm(
         prompt_text=prompt,
         mm_items=mm_items,
         hf_processor_mm_kwargs=hf_processor_mm_kwargs,
         tokenization_kwargs=tokenization_kwargs,
     )

    # The HF processor will return `mm_token_type_ids`, from which we can
    # infer mm_placeholders. Until then, this is hardcoded to make the code
    # run. Tested on Llava; prompts and `mm_token_type_ids` always have bs=1.
    mm_positions = torch.where(mm_token_type_ids == 1)[1]
    images = mm_items.get_items("image", ImageProcessorItems)
    mm_processor_kwargs = (self.info.ctx.model_config.mm_processor_kwargs
                           or {})
    image_sizes = []
    for item_idx in range(len(images)):
        image_size = images.get_image_size(item_idx)
        image_sizes.append((image_size.height, image_size.width))

    mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens(
        image_sizes=image_sizes, **mm_processor_kwargs)

    mm_placeholders = {}
    split_sizes = mm_tokens_per_modality["num_image_tokens"]
    if split_sizes:
        chunked_mm_positions = torch.split(mm_positions, split_sizes)
        mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0].bool()]
        chunked_mm_tokens = torch.split(mm_tokens, split_sizes)
        ranges = [
            PlaceholderRange(
                offset=positions[0].item(),
                length=positions.shape[0],
                is_embed=(mm_tokens == hf_processor.image_token_id).bool())
            for positions, mm_tokens in zip(chunked_mm_positions,
                                            chunked_mm_tokens)
        ]
        mm_placeholders = {"image": ranges}

    num_image_patches = torch.tensor(
        mm_tokens_per_modality["num_image_patches"]
    ) if "num_image_patches" in mm_tokens_per_modality else None
    processed_data['num_image_patches'] = num_image_patches
    mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
        processed_data,
        self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs,
                                   num_image_patches),
    )

    mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs,
                                    tokenization_kwargs)
    return MultiModalInputs(
        type="multimodal",
        prompt=prompt,
        prompt_token_ids=prompt_ids,
        mm_kwargs=mm_kwargs,
        mm_hashes=mm_hashes,
        mm_placeholders=mm_placeholders,
    )
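
A rough usage sketch. vLLM normally drives this processor through the multi-modal registry rather than calling it directly; the `processor` instance, prompt text, and image below are placeholders.

```python
# Hedged sketch: `processor` is assumed to be an already-constructed
# MultiModalProcessor; the prompt and image are placeholders.
from PIL import Image

prompt = "USER: <image>\nDescribe the picture. ASSISTANT:"
image = Image.new("RGB", (336, 336))

mm_inputs = processor.apply(
    prompt=prompt,
    mm_data={"image": [image]},
    hf_processor_mm_kwargs={},
)
mm_inputs["prompt_token_ids"]          # token IDs with image placeholders expanded
mm_inputs["mm_placeholders"]["image"]  # one PlaceholderRange per image
```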

TransformersBase

Bases: Module, SupportsQuant, SupportsLoRA, SupportsPP

Source code in vllm/model_executor/models/transformers.py
class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
    embedding_padding_modules = ["lm_head"]
    embedding_modules = ["embed_tokens"
                         ]  # TODO transformers will have a util to get it

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        logger.info("Using Transformers backend.")

        self.config: PretrainedConfig = vllm_config.model_config.hf_config
        self.text_config: PretrainedConfig = self.config.get_text_config()
        self.cache_config: CacheConfig = vllm_config.cache_config
        self.device_config: DeviceConfig = vllm_config.device_config
        self.model_config: ModelConfig = vllm_config.model_config
        self.parallel_config: ParallelConfig = vllm_config.parallel_config
        self.quant_config: QuantizationConfig = vllm_config.quant_config

        self.pp_group = get_pp_group()
        self.pp_size = self.pp_group.world_size
        self.pp_rank = self.pp_group.rank_in_group
        self.tp_size = get_tensor_model_parallel_world_size()

        # To be updated in child classes for use in `load_weights`
        self.skip_prefixes: Optional[list[str]] = None

        # Set correct attn and init on "meta" to delay allocating GPU tensors
        # TODO: @raushan, use the public `model.set_attn_implementation()`
        # method once its checks are fixed in Transformers.
        self.text_config._attn_implementation = "vllm"
        with init_on_device_without_buffers("meta"):
            self.model: PreTrainedModel = AutoModel.from_config(
                self.config,
                torch_dtype=self.model_config.dtype,
                trust_remote_code=self.model_config.trust_remote_code,
            )

        self.pipeline_parallel()
        self.tensor_parallel()

        # Input embeddings
        if not isinstance(self.model.get_input_embeddings(), PPMissingLayer):
            self.model.set_input_embeddings(
                VocabParallelEmbedding(
                    self.text_config.vocab_size,
                    self.text_config.hidden_size,
                    org_num_embeddings=self.text_config.vocab_size,
                    quant_config=self.quant_config,
                ))

        # Attention layers
        self.attention_instances = self.create_attention_instances()

        # Initialize any parameters that have not had their modules replaced
        self.init_parameters(self.model)

        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states"], self.text_config.hidden_size))

    def pipeline_parallel(self):
        """
        Apply the model's pipeline parallelization plan.
        """
        if self.pp_size <= 1:
            return

        if not self.model.supports_pp_plan:
            raise ValueError(
                f"{type(self.model)} does not support pipeline parallel yet!")

        module_lists = []
        module_list_idx = None
        pp_plan = list(self.model._pp_plan.keys())
        for i, name in enumerate(pp_plan):
            if isinstance(getattr(self.model, name), nn.ModuleList):
                module_lists.append(name)
                module_list_idx = i

        if len(module_lists) > 1:
            raise ValueError(
                "Pipeline parallel of models with multiple `ModuleList`s "
                "in the base model are not supported yet!")
        if module_list_idx is None:
            raise ValueError(
                f"Could not find `ModuleList` in {type(self.model)}")

        # Layers before module list
        for name in pp_plan[:module_list_idx]:
            if self.pp_group.is_first_rank or (
                    self.text_config.tie_word_embeddings
                    and self.pp_group.is_last_rank):
                continue
            setattr(self.model, name, PPMissingLayer())

        # Module list
        start_layer, end_layer = get_pp_indices(
            self.text_config.num_hidden_layers, self.pp_rank, self.pp_size)
        layers_name = pp_plan[module_list_idx]
        layers = getattr(self.model, layers_name)
        for i in range(len(layers)):
            if start_layer <= i < end_layer:
                continue
            layers[i] = PPMissingLayer()

        # Layers after module list
        for name in pp_plan[module_list_idx + 1:]:
            # Modules that should be on last rank
            if not self.pp_group.is_last_rank:
                setattr(self.model, name, PPMissingLayer())

    def tensor_parallel(self):
        """
        Apply the model's tensor parallelization plan.
        Currently only supports linear layers.
        """
        # Look for tp plans in all of the PreTrainedModels found in self.model
        is_pretrained_model = lambda m: isinstance(m, PreTrainedModel)
        supports_tp_plan = lambda m: m.config.base_model_tp_plan is not None
        pretrained_models = filter(is_pretrained_model, self.model.modules())
        models_with_tp_plan = filter(supports_tp_plan, pretrained_models)

        if not any(models_with_tp_plan) and self.tp_size > 1:
            raise ValueError(
                f"{type(self.model)} does not support tensor parallel yet!")

        def _tensor_parallel(module: nn.Module,
                             prefix: str = "",
                             tp_plan=None):
            tp_plan = tp_plan or {}

            # If the current module is a PreTrainedModel, set the tp_plan for
            # all of its children
            if isinstance(module, PreTrainedModel):
                tp_plan = module.config.base_model_tp_plan or {}
                tp_plan = {
                    maybe_prefix(prefix, k): v
                    for k, v in tp_plan.items()
                }

            # Some weight loaders expect linear layers to inherit from vLLM's
            # LinearBase class, so we set a default style which causes any
            # unspecified linear layers to be replaced with ReplicatedLinear
            for child_name, child_module in module.named_children():
                qual_name = maybe_prefix(prefix, child_name)
                if isinstance(child_module, nn.Linear):
                    generator = (p for p in tp_plan if re.match(p, qual_name))
                    pattern = next(generator, None)
                    style = tp_plan.get(pattern, "replicate")
                    new_module = replace_linear_class(child_module, style,
                                                      self.quant_config)
                    setattr(module, child_name, new_module)
                    log_replacement(qual_name, child_module, new_module)
                else:
                    _tensor_parallel(child_module,
                                     prefix=qual_name,
                                     tp_plan=tp_plan)

        _tensor_parallel(self.model)

    def create_attention_instances(self) -> dict[int, Attention]:
        """
        Create `Attention` instances to inform KV cache allocation.
        """
        num_heads = self.model_config.get_num_attention_heads(
            self.parallel_config)
        head_size = self.model_config.get_head_size()
        num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config)
        start, end = get_pp_indices(self.text_config.num_hidden_layers,
                                    self.pp_rank, self.pp_size)

        attention_instances = {}
        for i in range(start, end):
            # Handle interleaved sliding window attention
            per_layer_sliding_window = None
            if (hasattr(self.config, "layer_types")
                    and self.config.layer_types[i] == "sliding_attention"):
                per_layer_sliding_window = self.config.sliding_window

            attention_instances[i] = Attention(
                num_heads=num_heads,
                head_size=head_size,
                # NOTE: We use the Llama scale as the default; if Transformers
                # sets a scale, it is updated in vllm_flash_attention_forward
                scale=head_size**-0.5,
                num_kv_heads=num_kv_heads,
                cache_config=self.cache_config,
                quant_config=self.quant_config,
                per_layer_sliding_window=per_layer_sliding_window,
                prefix=f"{i}.attn")
        return attention_instances

    def init_parameters(self, module: nn.Module):
        """
        If a `parameter` is on the `meta` device, then its parent
        `module` is the original module created by:

        ```python
        with torch.device("meta"):
            self.model: PreTrainedModel = AutoModel.from_config(...)
        ```
        """
        for name, param in module.named_parameters(recurse=False):
            if param.device == torch.device("meta"):
                new_param = nn.Parameter(
                    torch.empty_like(param.data,
                                     dtype=self.model_config.dtype,
                                     device=self.device_config.device))
                setattr(module, name, new_param)
        for child in module.children():
            self.init_parameters(child)

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        if not get_pp_group().is_first_rank:
            assert intermediate_tensors is not None
            input_ids = None
            inputs_embeds = intermediate_tensors["hidden_states"]

        if input_ids is not None:
            input_ids = input_ids[None, ...]
        if inputs_embeds is not None:
            inputs_embeds = inputs_embeds[None, ...]

        if self.model_config.uses_mrope:
            position_ids = positions[:, None]
        else:
            position_ids = positions[None, ...]

        hidden_states = self.model(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            use_cache=False,
            position_ids=position_ids,
            attention_instances=self.attention_instances,
            return_dict=False)[0][0, ...]  # we remove batch dimension for now

        if not get_pp_group().is_last_rank:
            return IntermediateTensors({"hidden_states": hidden_states})

        return hidden_states

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(self, skip_prefixes=self.skip_prefixes)
        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)

attention_instances instance-attribute

attention_instances = create_attention_instances()

cache_config instance-attribute

cache_config: CacheConfig = cache_config

config instance-attribute

config: PretrainedConfig = hf_config

device_config instance-attribute

device_config: DeviceConfig = device_config

embedding_modules class-attribute instance-attribute

embedding_modules = ['embed_tokens']

embedding_padding_modules class-attribute instance-attribute

embedding_padding_modules = ['lm_head']

make_empty_intermediate_tensors instance-attribute

make_empty_intermediate_tensors = (
    make_empty_intermediate_tensors_factory(
        ["hidden_states"], hidden_size
    )
)

model instance-attribute

model: PreTrainedModel = from_config(
    config,
    torch_dtype=dtype,
    trust_remote_code=trust_remote_code,
)

model_config instance-attribute

model_config: ModelConfig = model_config

parallel_config instance-attribute

parallel_config: ParallelConfig = parallel_config

pp_group instance-attribute

pp_group = get_pp_group()

pp_rank instance-attribute

pp_rank = rank_in_group

pp_size instance-attribute

pp_size = world_size

quant_config instance-attribute

quant_config: QuantizationConfig = quant_config

skip_prefixes instance-attribute

skip_prefixes: Optional[list[str]] = None

text_config instance-attribute

text_config: PretrainedConfig = get_text_config()

tp_size instance-attribute

tp_size = get_tensor_model_parallel_world_size()

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/transformers.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()
    logger.info("Using Transformers backend.")

    self.config: PretrainedConfig = vllm_config.model_config.hf_config
    self.text_config: PretrainedConfig = self.config.get_text_config()
    self.cache_config: CacheConfig = vllm_config.cache_config
    self.device_config: DeviceConfig = vllm_config.device_config
    self.model_config: ModelConfig = vllm_config.model_config
    self.parallel_config: ParallelConfig = vllm_config.parallel_config
    self.quant_config: QuantizationConfig = vllm_config.quant_config

    self.pp_group = get_pp_group()
    self.pp_size = self.pp_group.world_size
    self.pp_rank = self.pp_group.rank_in_group
    self.tp_size = get_tensor_model_parallel_world_size()

    # To be updated in child classes for use in `load_weights`
    self.skip_prefixes: Optional[list[str]] = None

    # Set correct attn and init on "meta" to delay allocating GPU tensors
    # TODO: @raushan, use the public `model.set_attn_implementation()`
    # method once its checks are fixed in Transformers.
    self.text_config._attn_implementation = "vllm"
    with init_on_device_without_buffers("meta"):
        self.model: PreTrainedModel = AutoModel.from_config(
            self.config,
            torch_dtype=self.model_config.dtype,
            trust_remote_code=self.model_config.trust_remote_code,
        )

    self.pipeline_parallel()
    self.tensor_parallel()

    # Input embeddings
    if not isinstance(self.model.get_input_embeddings(), PPMissingLayer):
        self.model.set_input_embeddings(
            VocabParallelEmbedding(
                self.text_config.vocab_size,
                self.text_config.hidden_size,
                org_num_embeddings=self.text_config.vocab_size,
                quant_config=self.quant_config,
            ))

    # Attention layers
    self.attention_instances = self.create_attention_instances()

    # Initialize any parameters that have not had their modules replaced
    self.init_parameters(self.model)

    self.make_empty_intermediate_tensors = (
        make_empty_intermediate_tensors_factory(
            ["hidden_states"], self.text_config.hidden_size))

create_attention_instances

create_attention_instances() -> dict[int, Attention]

Create Attention instances to inform KV cache allocation.

Source code in vllm/model_executor/models/transformers.py
def create_attention_instances(self) -> dict[int, Attention]:
    """
    Create `Attention` instances to inform KV cache allocation.
    """
    num_heads = self.model_config.get_num_attention_heads(
        self.parallel_config)
    head_size = self.model_config.get_head_size()
    num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config)
    start, end = get_pp_indices(self.text_config.num_hidden_layers,
                                self.pp_rank, self.pp_size)

    attention_instances = {}
    for i in range(start, end):
        # Handle interleaved sliding window attention
        per_layer_sliding_window = None
        if (hasattr(self.config, "layer_types")
                and self.config.layer_types[i] == "sliding_attention"):
            per_layer_sliding_window = self.config.sliding_window

        attention_instances[i] = Attention(
            num_heads=num_heads,
            head_size=head_size,
            # NOTE: We use the Llama scale as the default; if Transformers
            # sets a scale, it is updated in vllm_flash_attention_forward
            scale=head_size**-0.5,
            num_kv_heads=num_kv_heads,
            cache_config=self.cache_config,
            quant_config=self.quant_config,
            per_layer_sliding_window=per_layer_sliding_window,
            prefix=f"{i}.attn")
    return attention_instances
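
The per-layer sliding window selection above can be pictured with a standalone sketch; the `layer_types` list and window size imitate a Gemma-3-style text config and are made-up values.

```python
# Hedged sketch of how `layer_types` drives per-layer sliding windows.
layer_types = ["sliding_attention", "sliding_attention", "full_attention"]
sliding_window = 512

per_layer_sliding_window = [
    sliding_window if layer_type == "sliding_attention" else None
    for layer_type in layer_types
]
print(per_layer_sliding_window)  # [512, 512, None]
```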

forward

forward(
    input_ids: Optional[Tensor],
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    inputs_embeds: Optional[Tensor] = None,
) -> Union[Tensor, IntermediateTensors]
Source code in vllm/model_executor/models/transformers.py
def forward(
    self,
    input_ids: Optional[torch.Tensor],
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
    if not get_pp_group().is_first_rank:
        assert intermediate_tensors is not None
        input_ids = None
        inputs_embeds = intermediate_tensors["hidden_states"]

    if input_ids is not None:
        input_ids = input_ids[None, ...]
    if inputs_embeds is not None:
        inputs_embeds = inputs_embeds[None, ...]

    if self.model_config.uses_mrope:
        position_ids = positions[:, None]
    else:
        position_ids = positions[None, ...]

    hidden_states = self.model(
        input_ids=input_ids,
        inputs_embeds=inputs_embeds,
        use_cache=False,
        position_ids=position_ids,
        attention_instances=self.attention_instances,
        return_dict=False)[0][0, ...]  # we remove batch dimension for now

    if not get_pp_group().is_last_rank:
        return IntermediateTensors({"hidden_states": hidden_states})

    return hidden_states

init_parameters

init_parameters(module: Module)

If a parameter is on the meta device, then its parent module is the original module created by:

with torch.device("meta"):
    self.model: PreTrainedModel = AutoModel.from_config(...)
Source code in vllm/model_executor/models/transformers.py
def init_parameters(self, module: nn.Module):
    """
    If a `parameter` is on the `meta` device, then its parent
    `module` is the original module created by:

    ```python
    with torch.device("meta"):
        self.model: PreTrainedModel = AutoModel.from_config(...)
    ```
    """
    for name, param in module.named_parameters(recurse=False):
        if param.device == torch.device("meta"):
            new_param = nn.Parameter(
                torch.empty_like(param.data,
                                 dtype=self.model_config.dtype,
                                 device=self.device_config.device))
            setattr(module, name, new_param)
    for child in module.children():
        self.init_parameters(child)

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/transformers.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    loader = AutoWeightsLoader(self, skip_prefixes=self.skip_prefixes)
    return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)

pipeline_parallel

pipeline_parallel()

Apply the model's pipeline parallelization plan.

Source code in vllm/model_executor/models/transformers.py
def pipeline_parallel(self):
    """
    Apply the model's pipeline parallelization plan.
    """
    if self.pp_size <= 1:
        return

    if not self.model.supports_pp_plan:
        raise ValueError(
            f"{type(self.model)} does not support pipeline parallel yet!")

    module_lists = []
    module_list_idx = None
    pp_plan = list(self.model._pp_plan.keys())
    for i, name in enumerate(pp_plan):
        if isinstance(getattr(self.model, name), nn.ModuleList):
            module_lists.append(name)
            module_list_idx = i

    if len(module_lists) > 1:
        raise ValueError(
            "Pipeline parallel of models with multiple `ModuleList`s "
            "in the base model are not supported yet!")
    if module_list_idx is None:
        raise ValueError(
            f"Could not find `ModuleList` in {type(self.model)}")

    # Layers before module list
    for name in pp_plan[:module_list_idx]:
        if self.pp_group.is_first_rank or (
                self.text_config.tie_word_embeddings
                and self.pp_group.is_last_rank):
            continue
        setattr(self.model, name, PPMissingLayer())

    # Module list
    start_layer, end_layer = get_pp_indices(
        self.text_config.num_hidden_layers, self.pp_rank, self.pp_size)
    layers_name = pp_plan[module_list_idx]
    layers = getattr(self.model, layers_name)
    for i in range(len(layers)):
        if start_layer <= i < end_layer:
            continue
        layers[i] = PPMissingLayer()

    # Layers after module list
    for name in pp_plan[module_list_idx + 1:]:
        # Modules that should be on last rank
        if not self.pp_group.is_last_rank:
            setattr(self.model, name, PPMissingLayer())
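
`get_pp_indices` decides which contiguous slice of decoder layers each pipeline rank keeps. A naive stand-in behaves roughly like this (vLLM's real helper also supports custom layer partitions, which this sketch ignores):

```python
# Hedged sketch, not vLLM's actual get_pp_indices: an even contiguous split,
# with the remainder going to the last rank.
def naive_pp_indices(num_layers: int, pp_rank: int, pp_size: int) -> tuple[int, int]:
    per_rank = num_layers // pp_size
    start = pp_rank * per_rank
    end = num_layers if pp_rank == pp_size - 1 else start + per_rank
    return start, end

print(naive_pp_indices(32, 1, 4))  # (8, 16): rank 1 keeps layers 8..15
```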

tensor_parallel

tensor_parallel()

Apply the model's tensor parallelization plan. Currently only supports linear layers.

Source code in vllm/model_executor/models/transformers.py
def tensor_parallel(self):
    """
    Apply the model's tensor parallelization plan.
    Currently only supports linear layers.
    """
    # Look for tp plans in all of the PreTrainedModels found in self.model
    is_pretrained_model = lambda m: isinstance(m, PreTrainedModel)
    supports_tp_plan = lambda m: m.config.base_model_tp_plan is not None
    pretrained_models = filter(is_pretrained_model, self.model.modules())
    models_with_tp_plan = filter(supports_tp_plan, pretrained_models)

    if not any(models_with_tp_plan) and self.tp_size > 1:
        raise ValueError(
            f"{type(self.model)} does not support tensor parallel yet!")

    def _tensor_parallel(module: nn.Module,
                         prefix: str = "",
                         tp_plan=None):
        tp_plan = tp_plan or {}

        # If the current module is a PreTrainedModel, set the tp_plan for
        # all of its children
        if isinstance(module, PreTrainedModel):
            tp_plan = module.config.base_model_tp_plan or {}
            tp_plan = {
                maybe_prefix(prefix, k): v
                for k, v in tp_plan.items()
            }

        # Some weight loaders expect linear layers to inherit from vLLM's
        # LinearBase class, so we set a default style which causes any
        # unspecified linear layers to be replaced with ReplicatedLinear
        for child_name, child_module in module.named_children():
            qual_name = maybe_prefix(prefix, child_name)
            if isinstance(child_module, nn.Linear):
                generator = (p for p in tp_plan if re.match(p, qual_name))
                pattern = next(generator, None)
                style = tp_plan.get(pattern, "replicate")
                new_module = replace_linear_class(child_module, style,
                                                  self.quant_config)
                setattr(module, child_name, new_module)
                log_replacement(qual_name, child_module, new_module)
            else:
                _tensor_parallel(child_module,
                                 prefix=qual_name,
                                 tp_plan=tp_plan)

    _tensor_parallel(self.model)
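
The style lookup performed for each `nn.Linear` child can be illustrated in isolation. The plan below imitates a Llama-style `base_model_tp_plan`; the patterns and module names are placeholders.

```python
# Hedged sketch of the regex-based style lookup used above.
import re

tp_plan = {
    r"layers.*self_attn.q_proj": "colwise",
    r"layers.*self_attn.o_proj": "rowwise",
}

def style_for(qual_name: str) -> str:
    pattern = next((p for p in tp_plan if re.match(p, qual_name)), None)
    return tp_plan.get(pattern, "replicate")

print(style_for("layers.0.self_attn.q_proj"))  # colwise
print(style_for("layers.0.mlp.gate_proj"))     # replicate -> ReplicatedLinear
```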

TransformersForCausalLM

Bases: TransformersBase

Source code in vllm/model_executor/models/transformers.py
@support_torch_compile
class TransformersForCausalLM(TransformersBase):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        # Tell `TransformersBase.load_weights` to skip
        # `lm_head` if the model has tied word embeddings
        if self.text_config.tie_word_embeddings:
            self.skip_prefixes = ["lm_head."]

        if get_pp_group().is_last_rank:
            self.unpadded_vocab_size = self.text_config.vocab_size
            self.lm_head = ParallelLMHead(
                self.text_config.vocab_size,
                self.text_config.hidden_size,
                quant_config=self.quant_config,
                prefix=maybe_prefix(prefix, "lm_head"),
            )
            if self.text_config.tie_word_embeddings:
                self.lm_head = self.lm_head.tie_weights(
                    self.model.get_input_embeddings())

            logit_scale = getattr(self.text_config, "logit_scale", 1.0)
            self.logits_processor = LogitsProcessor(
                self.unpadded_vocab_size, self.text_config.vocab_size,
                logit_scale)
        else:
            self.lm_head = PPMissingLayer()

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

lm_head instance-attribute

lm_head = ParallelLMHead(
    vocab_size,
    hidden_size,
    quant_config=quant_config,
    prefix=maybe_prefix(prefix, "lm_head"),
)

logits_processor instance-attribute

logits_processor = LogitsProcessor(
    unpadded_vocab_size, vocab_size, logit_scale
)

skip_prefixes instance-attribute

skip_prefixes = ['lm_head.']

unpadded_vocab_size instance-attribute

unpadded_vocab_size = vocab_size

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/transformers.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__(vllm_config=vllm_config, prefix=prefix)

    # Tell `TransformersBase.load_weights` to skip
    # `lm_head` if the model has tied word embeddings
    if self.text_config.tie_word_embeddings:
        self.skip_prefixes = ["lm_head."]

    if get_pp_group().is_last_rank:
        self.unpadded_vocab_size = self.text_config.vocab_size
        self.lm_head = ParallelLMHead(
            self.text_config.vocab_size,
            self.text_config.hidden_size,
            quant_config=self.quant_config,
            prefix=maybe_prefix(prefix, "lm_head"),
        )
        if self.text_config.tie_word_embeddings:
            self.lm_head = self.lm_head.tie_weights(
                self.model.get_input_embeddings())

        logit_scale = getattr(self.text_config, "logit_scale", 1.0)
        self.logits_processor = LogitsProcessor(
            self.unpadded_vocab_size, self.text_config.vocab_size,
            logit_scale)
    else:
        self.lm_head = PPMissingLayer()

compute_logits

compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[Tensor]
Source code in vllm/model_executor/models/transformers.py
def compute_logits(
    self,
    hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
    logits = self.logits_processor(self.lm_head, hidden_states,
                                   sampling_metadata)
    return logits

TransformersForMultimodalLM

Bases: TransformersForCausalLM, SupportsMultiModal

Source code in vllm/model_executor/models/transformers.py
@MULTIMODAL_REGISTRY.register_processor(
    MultiModalProcessor,
    info=MultiModalProcessingInfo,
    dummy_inputs=MultiModalDummyInputsBuilder)
@support_torch_compile(
    dynamic_arg_dims={
        "input_ids": 0,
        "positions": -1,
        "intermediate_tensors": 0,
        "inputs_embeds": 0,
    })  # set `positions` to last dim to support Qwen-mrope
class TransformersForMultimodalLM(TransformersForCausalLM, SupportsMultiModal):
    # Backwards compatibility for previously released models. Their state dicts
    # had different formats and cannot be loaded with the `AutoModel` mapping as-is
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "language_model.model": "model.language_model",
            "text_model.model": "model.text_model",
            "vision_tower": "model.vision_tower",
            "vqmodel": "model.vqmodel",
            "visual": "model.visual",
            "vision_model": "model.vision_model",
            "vision_embed_tokens": "model.vision_embed_tokens",
            "image_newline": "model.image_newline",
            "multi_modal_projector": "model.multi_modal_projector",
            "text_model.lm_head": "lm_head",
            "language_model.lm_head": "lm_head",
            # Qwen models used "model" as the name for the language model.
            # Therefore, we must map each submodule explicitly to avoid
            # conflicts with newer models that use "model.language_model".
            "model.embed_tokens": "model.language_model.embed_tokens",
            "model.layers": "model.language_model.layers",
            "model.norm": "model.language_model.norm",
        })

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        self.dtype = vllm_config.model_config.dtype

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        # NOTE: In v1, inputs_embeds is always generated in the model runner
        # from `get_multimodal_embeddings` and `get_input_embeddings`; this
        # condition is only for v0 compatibility.
        if inputs_embeds is None:
            multimodal_embeds = self.get_multimodal_embeddings(**kwargs)
            if multimodal_embeds is not None:
                inputs_embeds = self.get_input_embeddings(
                    input_ids, multimodal_embeds)
                input_ids = None

        model_output = super().forward(input_ids, positions,
                                       intermediate_tensors, inputs_embeds)
        return model_output

    def get_multimodal_embeddings(self, **kwargs):
        pixel_values = kwargs.pop("pixel_values", None)
        pixel_values = pixel_values if pixel_values is not None else kwargs.pop(
            "image_patches", None)
        image_embeds = kwargs.pop("image_embeds", None)

        if image_embeds is not None:
            return image_embeds

        if pixel_values is None and image_embeds is None:
            return None

        num_image_patches = kwargs.pop("num_image_patches")
        if pixel_values is not None:
            if isinstance(pixel_values, torch.Tensor):
                pixel_values = flatten_bn(pixel_values).to(self.dtype)
            elif is_list_of(pixel_values, torch.Tensor):
                pixel_values = flatten_and_concat(pixel_values).to(self.dtype)
            else:
                raise ValueError(
                    f"Unsupported pixel_values type {type(pixel_values)}. "
                    "Expected `torch.Tensor` or list of `torch.Tensor`.")

            if isinstance(num_image_patches, list):
                num_image_patches = torch.cat(num_image_patches)

            vision_embeddings = self.model.get_image_features(
                pixel_values,
                **{
                    k: v.flatten(0, 1)
                    for k, v in kwargs.items()
                },
            )

            if isinstance(vision_embeddings, torch.Tensor):
                if vision_embeddings.ndim == 2:
                    vision_embeddings = vision_embeddings.unsqueeze(0)

                # Embeddings have to be 2D tensors of length `num_images`,
                # but Transformers returns concatenated tensors if the patches
                # have different sizes. We split them back to make vLLM happy
                vision_embeddings = torch.split(
                    vision_embeddings,
                    num_image_patches.flatten().tolist())
                vision_embeddings = [
                    embed.flatten(start_dim=0, end_dim=-2)
                    for embed in vision_embeddings
                ]

            return vision_embeddings

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings=None,
    ) -> torch.Tensor:
        inputs_embeds = self.model.get_input_embeddings()(input_ids)
        if (multimodal_embeddings is not None
                and len(multimodal_embeddings) != 0):
            mask = (input_ids == self.config.image_token_id)
            mask = mask.unsqueeze(-1).expand_as(inputs_embeds)
            multimodal_embeddings = torch.cat(multimodal_embeddings)

            inputs_embeds = inputs_embeds.masked_scatter(
                mask, multimodal_embeddings)
        return inputs_embeds

dtype instance-attribute

dtype = dtype

hf_to_vllm_mapper class-attribute instance-attribute

hf_to_vllm_mapper = WeightsMapper(
    orig_to_new_prefix={
        "language_model.model": "model.language_model",
        "text_model.model": "model.text_model",
        "vision_tower": "model.vision_tower",
        "vqmodel": "model.vqmodel",
        "visual": "model.visual",
        "vision_model": "model.vision_model",
        "vision_embed_tokens": "model.vision_embed_tokens",
        "image_newline": "model.image_newline",
        "multi_modal_projector": "model.multi_modal_projector",
        "text_model.lm_head": "lm_head",
        "language_model.lm_head": "lm_head",
        "model.embed_tokens": "model.language_model.embed_tokens",
        "model.layers": "model.language_model.layers",
        "model.norm": "model.language_model.norm",
    }
)

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/transformers.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__(vllm_config=vllm_config, prefix=prefix)

    self.dtype = vllm_config.model_config.dtype

forward

forward(
    input_ids: Optional[Tensor],
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    inputs_embeds: Optional[Tensor] = None,
    **kwargs: object,
) -> Union[Tensor, IntermediateTensors]
Source code in vllm/model_executor/models/transformers.py
def forward(
    self,
    input_ids: Optional[torch.Tensor],
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    **kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]:
    # NOTE: In v1, inputs_embeds is always generated in the model runner
    # from `get_multimodal_embeddings` and `get_input_embeddings`; this
    # condition is only for v0 compatibility.
    if inputs_embeds is None:
        multimodal_embeds = self.get_multimodal_embeddings(**kwargs)
        if multimodal_embeds is not None:
            inputs_embeds = self.get_input_embeddings(
                input_ids, multimodal_embeds)
            input_ids = None

    model_output = super().forward(input_ids, positions,
                                   intermediate_tensors, inputs_embeds)
    return model_output

get_input_embeddings

get_input_embeddings(
    input_ids: Tensor, multimodal_embeddings=None
) -> Tensor
Source code in vllm/model_executor/models/transformers.py
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings=None,
) -> torch.Tensor:
    inputs_embeds = self.model.get_input_embeddings()(input_ids)
    if (multimodal_embeddings is not None
            and len(multimodal_embeddings) != 0):
        mask = (input_ids == self.config.image_token_id)
        mask = mask.unsqueeze(-1).expand_as(inputs_embeds)
        multimodal_embeddings = torch.cat(multimodal_embeddings)

        inputs_embeds = inputs_embeds.masked_scatter(
            mask, multimodal_embeddings)
    return inputs_embeds

get_multimodal_embeddings

get_multimodal_embeddings(**kwargs)
Source code in vllm/model_executor/models/transformers.py
def get_multimodal_embeddings(self, **kwargs):
    pixel_values = kwargs.pop("pixel_values", None)
    pixel_values = pixel_values if pixel_values is not None else kwargs.pop(
        "image_patches", None)
    image_embeds = kwargs.pop("image_embeds", None)

    if image_embeds is not None:
        return image_embeds

    if pixel_values is None and image_embeds is None:
        return None

    num_image_patches = kwargs.pop("num_image_patches")
    if pixel_values is not None:
        if isinstance(pixel_values, torch.Tensor):
            pixel_values = flatten_bn(pixel_values).to(self.dtype)
        elif is_list_of(pixel_values, torch.Tensor):
            pixel_values = flatten_and_concat(pixel_values).to(self.dtype)
        else:
            raise ValueError(
                f"Unsupported pixel_values type {type(pixel_values)}. "
                "Expected `torch.Tensor` or list of `torch.Tensor`.")

        if isinstance(num_image_patches, list):
            num_image_patches = torch.cat(num_image_patches)

        vision_embeddings = self.model.get_image_features(
            pixel_values,
            **{
                k: v.flatten(0, 1)
                for k, v in kwargs.items()
            },
        )

        if isinstance(vision_embeddings, torch.Tensor):
            if vision_embeddings.ndim == 2:
                vision_embeddings = vision_embeddings.unsqueeze(0)

            # Embeddings have to be 2D tensors of length `num_images`,
            # but Transformers returns concatenated tensors if the patches
            # have different sizes. We split them back to make vLLM happy
            vision_embeddings = torch.split(
                vision_embeddings,
                num_image_patches.flatten().tolist())
            vision_embeddings = [
                embed.flatten(start_dim=0, end_dim=-2)
                for embed in vision_embeddings
            ]

        return vision_embeddings

TransformersModel

Bases: TransformersBase

Source code in vllm/model_executor/models/transformers.py
@support_torch_compile
class TransformersModel(TransformersBase):
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            # Add `model.` prefix for base model checkpoints
            "": "model.",
            # Remove `model.` from places it should not be
            "model.model.": "model.",
            "model.score": "score",
        })

hf_to_vllm_mapper class-attribute instance-attribute

hf_to_vllm_mapper = WeightsMapper(
    orig_to_new_prefix={
        "": "model.",
        "model.model.": "model.",
        "model.score": "score",
    }
)

flatten_and_concat

flatten_and_concat(x: list[Tensor]) -> Tensor

Flatten until a list of tensors can be concatenated, then concatenate

Source code in vllm/model_executor/models/transformers.py
def flatten_and_concat(x: list[torch.Tensor]) -> torch.Tensor:
    """Flatten until a list of tensors can be concatenated then do concat"""

    def _can_concat(x: list[torch.Tensor]):
        return len(set(map(lambda _x: _x.shape[1:], x))) == 1

    if _can_concat(x):
        return torch.concat(x)
    return flatten_and_concat(flatten_bn(x))
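
A quick illustration of the base case, with made-up shapes: the trailing dimensions already agree, so a plain concatenation along dim 0 suffices.

```python
# Hedged sketch with placeholder shapes; trailing dims (5, 8) match across the
# list, so the tensors are concatenated directly.
import torch

x = [torch.randn(3, 5, 8), torch.randn(2, 5, 8)]
out = flatten_and_concat(x)
print(out.shape)  # torch.Size([5, 5, 8])
```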

init_on_device_without_buffers

init_on_device_without_buffers(device: device)

A context manager under which models are initialized with all parameters on the specified device. However, buffers are not initialized on the specified device.

Parameters:

device (`torch.device`): Device to initialize all parameters on. Required.
Source code in vllm/model_executor/models/transformers.py
@contextmanager
def init_on_device_without_buffers(device: torch.device):
    """
    A context manager under which models are initialized with all
    parameters on the specified device. However, buffers are not
    initialized on the specified device.

    Args:
        device (`torch.device`):
            Device to initialize all parameters on.
    """

    old_register_parameter = nn.Module.register_parameter

    def register_empty_parameter(module, name, param):
        old_register_parameter(module, name, param)
        if param is not None:
            param_cls = type(module._parameters[name])
            kwargs = module._parameters[name].__dict__
            kwargs["requires_grad"] = param.requires_grad
            module._parameters[name] = param_cls(
                module._parameters[name].to(device), **kwargs)

    tensor_constructors_to_patch = {}

    def patch_tensor_constructor(fn):

        def wrapper(*args, **kwargs):
            kwargs["device"] = device
            return fn(*args, **kwargs)

        return wrapper

    try:
        nn.Module.register_parameter = register_empty_parameter
        for torch_function_name in tensor_constructors_to_patch:
            setattr(
                torch, torch_function_name,
                patch_tensor_constructor(getattr(torch, torch_function_name)))
        yield
    finally:
        nn.Module.register_parameter = old_register_parameter
        for torch_function_name, old_torch_function in (
                tensor_constructors_to_patch.items()):
            setattr(torch, torch_function_name, old_torch_function)
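
A minimal usage sketch, mirroring how `TransformersBase.__init__` uses this context manager; `config` is assumed to be an already-loaded `PretrainedConfig`.

```python
# Hedged sketch: parameters are registered on the "meta" device, so no real
# memory is allocated until they are materialized later.
import torch
from transformers import AutoModel

with init_on_device_without_buffers(torch.device("meta")):
    model = AutoModel.from_config(config)

next(model.parameters()).device  # device(type='meta')
```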

log_replacement

log_replacement(
    name: str, old_module: Module, new_module: Module
)
Source code in vllm/model_executor/models/transformers.py
def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):
    logger.debug("%s: %s -> %s", name, old_module, new_module)

replace_linear_class

replace_linear_class(
    linear: Linear,
    style: Literal["colwise", "rowwise"],
    quant_config: QuantizationConfig,
) -> Union[
    ColumnParallelLinear,
    RowParallelLinear,
    ReplicatedLinear,
]

Replace nn.Linear with one of vLLM's tensor parallel linear classes.

Parameters:

linear (`nn.Linear`): `nn.Linear` to be replaced. Required.

style (`str`): Tensor parallel style of the new linear, e.g. "colwise". Required.

quant_config (`QuantizationConfig`): Quantization config for the new linear. Required.

Returns:

Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]: The new linear.

Source code in vllm/model_executor/models/transformers.py
def replace_linear_class(
    linear: nn.Linear, style: Literal["colwise", "rowwise"],
    quant_config: QuantizationConfig
) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]:
    """
    Replace nn.Linear with one of vLLM's tensor parallel linear classes.

    Args:
        linear (nn.Linear): `nn.Linear` to be replaced.
        style (str): Tensor parallel style of the new linear, e.g. "colwise".
        quant_config (QuantizationConfig): Quantization config for the new linear.
    Returns:
        Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]: The new linear.
    """

    if not isinstance(style, str):
        raise ValueError(
            f"Unsupported parallel style type {type(style)}, expected str")

    vllm_linear_cls, vllm_linear_kwargs = {
        "colwise": (ColumnParallelLinear, {}),
        "colwise_rep": (ColumnParallelLinear, {
            "gather_output": True
        }),
        "rowwise": (RowParallelLinear, {}),
        "rowwise_rep": (RowParallelLinear, {
            "input_is_parallel": False
        }),
        "replicate": (ReplicatedLinear, {}),
    }.get(style, (ReplicatedLinear, {}))

    return vllm_linear_cls(
        input_size=linear.in_features,
        output_size=linear.out_features,
        bias=linear.bias is not None,
        quant_config=quant_config,
        return_bias=False,
        **vllm_linear_kwargs,
    )
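
A brief usage sketch, assuming vLLM's distributed environment has already been initialized (the parallel linear classes query the tensor-parallel group); the layer sizes are placeholders and `quant_config=None` means no quantization.

```python
# Hedged sketch with placeholder sizes.
import torch.nn as nn

linear = nn.Linear(1024, 4096, bias=False)
tp_linear = replace_linear_class(linear, "colwise", quant_config=None)
# tp_linear is a ColumnParallelLinear whose output dimension is sharded
# across tensor-parallel ranks
```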

vllm_flash_attention_forward

vllm_flash_attention_forward(
    module: Module,
    query: Tensor,
    key: Tensor,
    value: Tensor,
    attention_mask: Tensor,
    scaling: Optional[float] = None,
    attention_instances: Optional[dict[int, Attention]] = None,
    **kwargs,
)
Source code in vllm/model_executor/models/transformers.py
def vllm_flash_attention_forward(
        # Transformers args
        module: torch.nn.Module,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attention_mask: torch.Tensor,
        # Transformers kwargs
        scaling: Optional[float] = None,
        # vLLM kwargs
        attention_instances: Optional[dict[int, Attention]] = None,
        **kwargs):
    self_attn = attention_instances[module.layer_idx]
    if scaling is not None:
        self_attn.impl.scale = float(scaling)
    hidden = query.shape[-2]
    query, key, value = (x.transpose(1, 2) for x in (query, key, value))
    query, key, value = (x.reshape(hidden, -1) for x in (query, key, value))
    return self_attn.forward(query, key, value), None
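
The reshapes above convert Transformers' attention layout into the flattened layout vLLM's `Attention.forward` expects. A standalone sketch with made-up sizes:

```python
# Hedged sketch: Transformers passes (batch=1, num_heads, seq_len, head_dim);
# vLLM's attention expects (num_tokens, num_heads * head_dim).
import torch

query = torch.randn(1, 8, 16, 64)   # (1, heads, seq, head_dim)
seq_len = query.shape[-2]
query = query.transpose(1, 2)        # (1, seq, heads, head_dim)
query = query.reshape(seq_len, -1)   # (seq, heads * head_dim)
print(query.shape)                   # torch.Size([16, 512])
```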