
vllm.transformers_utils.configs

Model configs may be defined in this directory for the following reasons:

  • There is no configuration file defined on the HF Hub or in the Transformers library.
  • There is a need to override the existing config to support vLLM.

Modules:

Name             Description
arctic           Arctic model configuration
chatglm
deepseek_vl2
eagle
falcon           Falcon configuration
jais             JAIS configuration
kimi_vl
medusa
mistral
mlp_speculator
moonvit
nemotron         Nemotron model configuration
nemotron_h       NemotronH model configuration
nemotron_vl
ovis
speculators
step3_vl
ultravox

__all__ module-attribute

__all__ = [
    "ChatGLMConfig",
    "DeepseekVLV2Config",
    "EAGLEConfig",
    "RWConfig",
    "JAISConfig",
    "MedusaConfig",
    "MLPSpeculatorConfig",
    "MoonViTConfig",
    "KimiVLConfig",
    "NemotronConfig",
    "NemotronHConfig",
    "Nemotron_Nano_VL_Config",
    "OvisConfig",
    "SpeculatorsConfig",
    "UltravoxConfig",
    "Step3VLConfig",
    "Step3VisionEncoderConfig",
    "Step3TextConfig",
]
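
Everything listed in __all__ can be imported directly from vllm.transformers_utils.configs. A minimal import sketch (not part of the original page; assumes vLLM is installed):

>>> from vllm.transformers_utils.configs import ChatGLMConfig, JAISConfig
>>> ChatGLMConfig().model_type
'chatglm'
>>> JAISConfig().model_type
'jais'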

ChatGLMConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/chatglm.py
class ChatGLMConfig(PretrainedConfig):
    model_type = "chatglm"
    attribute_map = {
        "num_hidden_layers": "num_layers",
        "n_head_kv": "multi_query_group_num",
    }

    def __init__(self,
                 num_layers=28,
                 padded_vocab_size=65024,
                 hidden_size=4096,
                 ffn_hidden_size=13696,
                 kv_channels=128,
                 num_attention_heads=32,
                 seq_length=2048,
                 hidden_dropout=0.0,
                 attention_dropout=0.0,
                 layernorm_epsilon=1e-5,
                 rmsnorm=True,
                 apply_residual_connection_post_layernorm=False,
                 post_layer_norm=True,
                 add_bias_linear=False,
                 add_qkv_bias=False,
                 interleaved_qkv=False,
                 bias_dropout_fusion=True,
                 multi_query_attention=False,
                 multi_query_group_num=1,
                 apply_query_key_layer_scaling=True,
                 attention_softmax_in_fp32=True,
                 fp32_residual_connection=False,
                 quantization_bit=0,
                 pre_seq_len=None,
                 prefix_projection=False,
                 **kwargs):
        self.num_layers = num_layers
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        # It is to be compatible with long lora.
        self.max_position_embeddings = seq_length
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        self.interleaved_qkv = interleaved_qkv
        super().__init__(**kwargs)

add_bias_linear instance-attribute

add_bias_linear = add_bias_linear

add_qkv_bias instance-attribute

add_qkv_bias = add_qkv_bias

apply_query_key_layer_scaling instance-attribute

apply_query_key_layer_scaling = (
    apply_query_key_layer_scaling
)

apply_residual_connection_post_layernorm instance-attribute

apply_residual_connection_post_layernorm = (
    apply_residual_connection_post_layernorm
)

attention_dropout instance-attribute

attention_dropout = attention_dropout

attention_softmax_in_fp32 instance-attribute

attention_softmax_in_fp32 = attention_softmax_in_fp32

attribute_map class-attribute instance-attribute

attribute_map = {
    "num_hidden_layers": "num_layers",
    "n_head_kv": "multi_query_group_num",
}

bias_dropout_fusion instance-attribute

bias_dropout_fusion = bias_dropout_fusion

ffn_hidden_size instance-attribute

ffn_hidden_size = ffn_hidden_size

fp32_residual_connection instance-attribute

fp32_residual_connection = fp32_residual_connection

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size

interleaved_qkv instance-attribute

interleaved_qkv = interleaved_qkv

kv_channels instance-attribute

kv_channels = kv_channels

layernorm_epsilon instance-attribute

layernorm_epsilon = layernorm_epsilon

max_position_embeddings instance-attribute

max_position_embeddings = seq_length

model_type class-attribute instance-attribute

model_type = 'chatglm'

multi_query_attention instance-attribute

multi_query_attention = multi_query_attention

multi_query_group_num instance-attribute

multi_query_group_num = multi_query_group_num

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_layers instance-attribute

num_layers = num_layers

padded_vocab_size instance-attribute

padded_vocab_size = padded_vocab_size

post_layer_norm instance-attribute

post_layer_norm = post_layer_norm

pre_seq_len instance-attribute

pre_seq_len = pre_seq_len

prefix_projection instance-attribute

prefix_projection = prefix_projection

quantization_bit instance-attribute

quantization_bit = quantization_bit

rmsnorm instance-attribute

rmsnorm = rmsnorm

seq_length instance-attribute

seq_length = seq_length

vocab_size instance-attribute

vocab_size = padded_vocab_size

__init__

__init__(
    num_layers=28,
    padded_vocab_size=65024,
    hidden_size=4096,
    ffn_hidden_size=13696,
    kv_channels=128,
    num_attention_heads=32,
    seq_length=2048,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    layernorm_epsilon=1e-05,
    rmsnorm=True,
    apply_residual_connection_post_layernorm=False,
    post_layer_norm=True,
    add_bias_linear=False,
    add_qkv_bias=False,
    interleaved_qkv=False,
    bias_dropout_fusion=True,
    multi_query_attention=False,
    multi_query_group_num=1,
    apply_query_key_layer_scaling=True,
    attention_softmax_in_fp32=True,
    fp32_residual_connection=False,
    quantization_bit=0,
    pre_seq_len=None,
    prefix_projection=False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/chatglm.py
def __init__(self,
             num_layers=28,
             padded_vocab_size=65024,
             hidden_size=4096,
             ffn_hidden_size=13696,
             kv_channels=128,
             num_attention_heads=32,
             seq_length=2048,
             hidden_dropout=0.0,
             attention_dropout=0.0,
             layernorm_epsilon=1e-5,
             rmsnorm=True,
             apply_residual_connection_post_layernorm=False,
             post_layer_norm=True,
             add_bias_linear=False,
             add_qkv_bias=False,
             interleaved_qkv=False,
             bias_dropout_fusion=True,
             multi_query_attention=False,
             multi_query_group_num=1,
             apply_query_key_layer_scaling=True,
             attention_softmax_in_fp32=True,
             fp32_residual_connection=False,
             quantization_bit=0,
             pre_seq_len=None,
             prefix_projection=False,
             **kwargs):
    self.num_layers = num_layers
    self.vocab_size = padded_vocab_size
    self.padded_vocab_size = padded_vocab_size
    self.hidden_size = hidden_size
    self.ffn_hidden_size = ffn_hidden_size
    self.kv_channels = kv_channels
    self.num_attention_heads = num_attention_heads
    self.seq_length = seq_length
    # It is to be compatible with long lora.
    self.max_position_embeddings = seq_length
    self.hidden_dropout = hidden_dropout
    self.attention_dropout = attention_dropout
    self.layernorm_epsilon = layernorm_epsilon
    self.rmsnorm = rmsnorm
    self.apply_residual_connection_post_layernorm = (
        apply_residual_connection_post_layernorm)
    self.post_layer_norm = post_layer_norm
    self.add_bias_linear = add_bias_linear
    self.add_qkv_bias = add_qkv_bias
    self.bias_dropout_fusion = bias_dropout_fusion
    self.multi_query_attention = multi_query_attention
    self.multi_query_group_num = multi_query_group_num
    self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
    self.attention_softmax_in_fp32 = attention_softmax_in_fp32
    self.fp32_residual_connection = fp32_residual_connection
    self.quantization_bit = quantization_bit
    self.pre_seq_len = pre_seq_len
    self.prefix_projection = prefix_projection
    self.interleaved_qkv = interleaved_qkv
    super().__init__(**kwargs)
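
Example (a hedged usage sketch, not from the vLLM source): the attribute_map above makes the HF-style names resolve to ChatGLM's native fields, and max_position_embeddings mirrors seq_length.

>>> from vllm.transformers_utils.configs import ChatGLMConfig
>>> config = ChatGLMConfig(num_layers=2, multi_query_group_num=4)
>>> config.num_hidden_layers   # alias for num_layers via attribute_map
2
>>> config.n_head_kv           # alias for multi_query_group_num
4
>>> config.max_position_embeddings == config.seq_length
True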

DeepseekVLV2Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/deepseek_vl2.py
class DeepseekVLV2Config(PretrainedConfig):
    model_type = "deepseek_vl_v2"
    vision_config: VisionEncoderConfig
    projector_config: MlpProjectorConfig

    tile_tag: str = "2D"
    global_view_pos: str = "head"
    candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), )

    def __init__(self,
                 tile_tag: str = "tile_tag",
                 global_view_pos: str = "head",
                 candidate_resolutions: tuple[tuple[int,
                                                    int]] = ((384, 384), ),
                 **kwargs):
        super().__init__(**kwargs)

        vision_config = kwargs.get("vision_config", {})
        self.vision_config = VisionEncoderConfig(**vision_config)

        projector_config = kwargs.get("projector_config", {})
        self.projector_config = MlpProjectorConfig(**projector_config)

        language_config = kwargs.get("language_config", {})
        self.text_config = DeepseekV2Config(**language_config)

        self.tile_tag = tile_tag
        self.global_view_pos = global_view_pos
        self.candidate_resolutions = candidate_resolutions
        self.vocab_size = self.text_config.vocab_size

candidate_resolutions class-attribute instance-attribute

candidate_resolutions: tuple[tuple[int, int]] = (
    candidate_resolutions
)

global_view_pos class-attribute instance-attribute

global_view_pos: str = global_view_pos

model_type class-attribute instance-attribute

model_type = 'deepseek_vl_v2'

projector_config instance-attribute

projector_config: MlpProjectorConfig = MlpProjectorConfig(
    **projector_config
)

text_config instance-attribute

text_config = DeepseekV2Config(**language_config)

tile_tag class-attribute instance-attribute

tile_tag: str = tile_tag

vision_config instance-attribute

vision_config: VisionEncoderConfig = VisionEncoderConfig(
    **vision_config
)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    tile_tag: str = "tile_tag",
    global_view_pos: str = "head",
    candidate_resolutions: tuple[tuple[int, int]] = (
        (384, 384),
    ),
    **kwargs,
)
Source code in vllm/transformers_utils/configs/deepseek_vl2.py
def __init__(self,
             tile_tag: str = "tile_tag",
             global_view_pos: str = "head",
             candidate_resolutions: tuple[tuple[int,
                                                int]] = ((384, 384), ),
             **kwargs):
    super().__init__(**kwargs)

    vision_config = kwargs.get("vision_config", {})
    self.vision_config = VisionEncoderConfig(**vision_config)

    projector_config = kwargs.get("projector_config", {})
    self.projector_config = MlpProjectorConfig(**projector_config)

    language_config = kwargs.get("language_config", {})
    self.text_config = DeepseekV2Config(**language_config)

    self.tile_tag = tile_tag
    self.global_view_pos = global_view_pos
    self.candidate_resolutions = candidate_resolutions
    self.vocab_size = self.text_config.vocab_size
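
Example (a hedged sketch, not from the vLLM docs): the nested sub-configs are pulled back out of **kwargs as plain dicts rather than from named parameters; empty dicts fall back to each sub-config's defaults.

>>> from vllm.transformers_utils.configs import DeepseekVLV2Config
>>> cfg = DeepseekVLV2Config(tile_tag="2D",
...                          vision_config={},
...                          projector_config={},
...                          language_config={})
>>> cfg.vocab_size == cfg.text_config.vocab_size
True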

EAGLEConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/eagle.py
class EAGLEConfig(PretrainedConfig):
    model_type = "eagle"

    def __init__(self,
                 model: Union[PretrainedConfig, dict, None] = None,
                 truncated_vocab_size: Optional[int] = None,
                 method: Optional[str] = 'eagle',
                 **kwargs):

        model_config: Union[PretrainedConfig, DeepseekV2Config, None]
        if isinstance(model, dict):
            archs = model.get("architectures", [])
            target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
            if any(target_arch in archs for target_arch in target_archs):
                # AutoConfig does not support DeepSeek MoE models yet
                model_config = DeepseekV2Config(**model)
            else:
                model_config = AutoConfig.for_model(**model)
        else:
            model_config = model

        for k, v in kwargs.items():
            if k != "architectures" and k != "model_type" and hasattr(
                    model_config, k):
                setattr(model_config, k, v)

        self.model = model_config

        if self.model is None:
            self.truncated_vocab_size = None
        else:
            self.truncated_vocab_size = self.model.vocab_size if \
                truncated_vocab_size is None else truncated_vocab_size

        # Eagle model name should follow naming convention of
        # LlamaForCausalLM -> EagleLlamaForCausalLM
        # LlamaForCausalLM -> Eagle3LlamaForCausalLM
        if method == "eagle":
            assert self.model is not None, \
                "model should not be None when method is eagle"
            kwargs["architectures"] = [
                f"Eagle{arch}" if not arch.startswith("Eagle") \
                    else arch for arch in self.model.architectures
            ]
        elif method == "eagle3":
            assert self.model is not None, \
                "model should not be None when method is eagle3"
            kwargs["architectures"] = [
                arch if arch.startswith("Eagle3") or arch.endswith("Eagle3")
                else f"Eagle3{arch}" for arch in self.model.architectures
            ]
        else:
            raise ValueError(f"Invalid method {method}. "
                             "Supported methods are eagle and eagle3.")

        super().__init__(**kwargs)

        if self.model is not None:
            for k, v in self.model.to_dict().items():
                if k not in kwargs:
                    setattr(self, k, v)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "EAGLEConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)

model instance-attribute

model = model_config

model_type class-attribute instance-attribute

model_type = 'eagle'

truncated_vocab_size instance-attribute

truncated_vocab_size = None

__init__

__init__(
    model: Union[PretrainedConfig, dict, None] = None,
    truncated_vocab_size: Optional[int] = None,
    method: Optional[str] = "eagle",
    **kwargs,
)
Source code in vllm/transformers_utils/configs/eagle.py
def __init__(self,
             model: Union[PretrainedConfig, dict, None] = None,
             truncated_vocab_size: Optional[int] = None,
             method: Optional[str] = 'eagle',
             **kwargs):

    model_config: Union[PretrainedConfig, DeepseekV2Config, None]
    if isinstance(model, dict):
        archs = model.get("architectures", [])
        target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
        if any(target_arch in archs for target_arch in target_archs):
            # AutoConfig does not support DeepSeek MoE models yet
            model_config = DeepseekV2Config(**model)
        else:
            model_config = AutoConfig.for_model(**model)
    else:
        model_config = model

    for k, v in kwargs.items():
        if k != "architectures" and k != "model_type" and hasattr(
                model_config, k):
            setattr(model_config, k, v)

    self.model = model_config

    if self.model is None:
        self.truncated_vocab_size = None
    else:
        self.truncated_vocab_size = self.model.vocab_size if \
            truncated_vocab_size is None else truncated_vocab_size

    # Eagle model name should follow naming convention of
    # LlamaForCausalLM -> EagleLlamaForCausalLM
    # LlamaForCausalLM -> Eagle3LlamaForCausalLM
    if method == "eagle":
        assert self.model is not None, \
            "model should not be None when method is eagle"
        kwargs["architectures"] = [
            f"Eagle{arch}" if not arch.startswith("Eagle") \
                else arch for arch in self.model.architectures
        ]
    elif method == "eagle3":
        assert self.model is not None, \
            "model should not be None when method is eagle3"
        kwargs["architectures"] = [
            arch if arch.startswith("Eagle3") or arch.endswith("Eagle3")
            else f"Eagle3{arch}" for arch in self.model.architectures
        ]
    else:
        raise ValueError(f"Invalid method {method}. "
                         "Supported methods are eagle and eagle3.")

    super().__init__(**kwargs)

    if self.model is not None:
        for k, v in self.model.to_dict().items():
            if k not in kwargs:
                setattr(self, k, v)

from_pretrained classmethod

from_pretrained(
    pretrained_model_name_or_path: Union[str, PathLike],
    **kwargs,
) -> EAGLEConfig
Source code in vllm/transformers_utils/configs/eagle.py
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    **kwargs,
) -> "EAGLEConfig":
    config_dict, kwargs = cls.get_config_dict(
        pretrained_model_name_or_path, **kwargs)
    return cls.from_dict(config_dict, **kwargs)
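
Example (a hedged sketch, not from the vLLM docs): when the draft model's config is given as a dict, its architectures are rewritten to the Eagle naming convention described in the comments above.

>>> from vllm.transformers_utils.configs import EAGLEConfig
>>> EAGLEConfig(model={"model_type": "llama",
...                    "architectures": ["LlamaForCausalLM"]}).architectures
['EagleLlamaForCausalLM']
>>> EAGLEConfig(model={"model_type": "llama",
...                    "architectures": ["LlamaForCausalLM"]},
...             method="eagle3").architectures
['Eagle3LlamaForCausalLM']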

JAISConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [JAISModel]. It is used to instantiate a JAIS model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

vocab_size (`int`, *optional*, defaults to 50257):
    Vocabulary size of the JAIS model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [JAISModel].
n_positions (`int`, *optional*, defaults to 1024):
    The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_embd (`int`, *optional*, defaults to 768):
    Dimensionality of the embeddings and hidden states.
n_layer (`int`, *optional*, defaults to 12):
    Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 12):
    Number of attention heads for each attention layer in the Transformer encoder.
n_inner (`int`, *optional*, defaults to None):
    Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd.
activation_function (`str`, *optional*, defaults to `"gelu_new"`):
    Activation function, to be selected in the list ["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"].
resid_pdrop (`float`, *optional*, defaults to 0.1):
    The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (`float`, *optional*, defaults to 0.1):
    The dropout ratio for the embeddings.
attn_pdrop (`float`, *optional*, defaults to 0.1):
    The dropout ratio for the attention.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
    The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
    The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
scale_attn_weights (`bool`, *optional*, defaults to `True`):
    Scale attention weights by dividing by sqrt(hidden_size).
use_cache (`bool`, *optional*, defaults to `True`):
    Whether or not the model should return the last key/values attentions (not used by all models).
scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
    Whether to additionally scale attention weights by `1 / layer_idx + 1`.
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
    Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention dot-product/softmax to float() when training with mixed precision.
position_embedding_type (`str`, *optional*, defaults to `"learned"`):
    Positional embedding can be either "alibi" or "learned".
mup_width_scale (`float`, *optional*, defaults to 1.0):
    muP parameter to scale learning rate and initializers. Calculated as (`d_model,0 / d_model`), where `d_model` is the model's width and `d_model,0` is the proxy model's width.
mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
    muP parameter to scale token and position embeddings.
mup_output_alpha (`float`, *optional*, defaults to 1.0):
    muP parameter to scale output logits (`output_logits_scale = mup_output_alpha * mup_width_scale`).
mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
    Scale attention weights by dividing by hidden_size instead of sqrt(hidden_size). Requires `scale_attn_weights` to be `True` as well.
alibi_scaling (`dict`, *optional*, defaults to None):
    Dictionary containing the scaling configuration for ALiBi embeddings. Currently only supports the linear scaling strategy. Can specify either the scaling `factor` (must be a float greater than 1) for fixed scaling, or `train_seq_len` for dynamic scaling on input samples with sequence length > `train_seq_len`. The expected formats are `{"type": strategy name, "factor": scaling factor}` or `{"type": strategy name, "train_seq_len": training sequence length}`.
architectures (`list`, *optional*, defaults to `['JAISLMHeadModel']`):
    Architecture names for JAIS.

Example:

>>> from transformers import JAISConfig, JAISModel

>>> # Initializing a JAIS configuration
>>> configuration = JAISConfig()

>>> # Initializing a model (with random weights) from the configuration
>>> model = JAISModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
Source code in vllm/transformers_utils/configs/jais.py
class JAISConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a
    [`JAISModel`]. It is used to instantiate a JAIS model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the JAIS model. Defines the number of different
            tokens that can be represented by the
            `inputs_ids` passed when calling [`JAISModel`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used
            with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set
            it to 4 times n_embd
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Activation function, to be selected in the list
            `["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in
            the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size)..
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*,
            defaults to `False`):
            Whether to additionally scale attention weights by
            `1 / layer_idx + 1`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention
            (dot-product)
            and upcast attention dot-product/softmax to float() when training
            with mixed precision.
        position_embedding_type (`str`, *optional*, defaults to `"learned"`):
            Positional embedding can be either `"alibi"` or `"learned"`.
        mup_width_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale learning rate and initializers. Calculated
            as (`d_model,0 / d_model`), where
            `d_model` is the model's width and `d_model,0` is the proxy
            model's width.
        mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale token and position embeddings.
        mup_output_alpha (`float`, *optional*, defaults to 1.0):
            muP parameter to scale output logits
            (`output_logits_scale = mup_output_alpha * mup_width_scale`).
        mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
            Scale attention weights by dividing by hidden_size instead of
            sqrt(hidden_size). Need to set scale_attn_weights to `True` as
            well.
        alibi_scaling (`dict`, *optional*):
            Dictionary containing the scaling configuration for ALiBi
            embeddings. Currently only supports linear
            scaling strategy. Can specify either the scaling `factor` (must be
            a float greater than 1) for fixed scaling
            or `train_seq_len` for dynamic scaling on input samples with
            sequence length > `train_seq_len`. The expected
            formats are `{"type": strategy name, "factor": scaling factor}` or
            `{"type": strategy name,
            "train_seq_len": training sequence length}`.
        architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
            architecture names for Jais.

    Example:

    ```python
    >>> from transformers import JAISConfig, JAISModel

    >>> # Initializing a JAIS configuration
    >>> configuration = JAISConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = JAISModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "jais"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "hidden_size": "n_embd",
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=50257,
        n_positions=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        scale_attn_weights=True,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        scale_attn_by_inverse_layer_idx=False,
        reorder_and_upcast_attn=False,
        position_embedding_type="learned",
        mup_width_scale=1.0,
        mup_embeddings_scale=1.0,
        mup_output_alpha=1.0,
        mup_scale_qk_dot_by_d=False,
        alibi_scaling=None,
        architectures=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.reorder_and_upcast_attn = reorder_and_upcast_attn

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        self.position_embedding_type = position_embedding_type
        self.mup_width_scale = mup_width_scale
        self.mup_embeddings_scale = mup_embeddings_scale
        self.mup_output_alpha = mup_output_alpha
        self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

        self.alibi_scaling = alibi_scaling
        self._alibi_scaling_validation()
        if architectures is None:
            architectures = ["JAISLMHeadModel"]

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            architectures=architectures,
            **kwargs,
        )

    def _alibi_scaling_validation(self):
        """
        Validate the `alibi_scaling` configuration.
        """
        if self.alibi_scaling is None:
            return

        if (not isinstance(self.alibi_scaling, dict)
                or len(self.alibi_scaling) != 2):
            raise ValueError(
                "`alibi_scaling` must be a dictionary with two fields, "
                "`type` and `factor` or `type` and `train_seq_len`, "
                f"got {self.alibi_scaling}")
        alibi_scaling_type = self.alibi_scaling.get("type", None)
        alibi_scaling_factor = self.alibi_scaling.get("factor", None)
        alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
        if alibi_scaling_type is None or alibi_scaling_type != "linear":
            raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
                             f"got {alibi_scaling_type}")
        if (alibi_scaling_factor is not None
                and not isinstance(alibi_scaling_factor, float)
                or (alibi_scaling_factor is not None
                    and alibi_scaling_factor <= 1.0)):
            raise ValueError(
                f"`alibi_scaling`'s factor field must be a float > 1.0, "
                f"got {alibi_scaling_factor}")
        if (alibi_dynamic_scaling is not None
                and not isinstance(alibi_dynamic_scaling, int)
                or (alibi_dynamic_scaling is not None
                    and alibi_dynamic_scaling <= 1)):
            raise ValueError(
                f"`alibi_scaling`'s `train_seq_len` field must be an "
                f"integer > 1, got {alibi_dynamic_scaling}")

activation_function instance-attribute

activation_function = activation_function

alibi_scaling instance-attribute

alibi_scaling = alibi_scaling

attn_pdrop instance-attribute

attn_pdrop = attn_pdrop

attribute_map class-attribute instance-attribute

attribute_map = {
    "hidden_size": "n_embd",
    "max_position_embeddings": "n_positions",
    "num_attention_heads": "n_head",
    "num_hidden_layers": "n_layer",
}

bos_token_id instance-attribute

bos_token_id = bos_token_id

embd_pdrop instance-attribute

embd_pdrop = embd_pdrop

eos_token_id instance-attribute

eos_token_id = eos_token_id

initializer_range instance-attribute

initializer_range = initializer_range

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

model_type class-attribute instance-attribute

model_type = 'jais'

mup_embeddings_scale instance-attribute

mup_embeddings_scale = mup_embeddings_scale

mup_output_alpha instance-attribute

mup_output_alpha = mup_output_alpha

mup_scale_qk_dot_by_d instance-attribute

mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

mup_width_scale instance-attribute

mup_width_scale = mup_width_scale

n_embd instance-attribute

n_embd = n_embd

n_head instance-attribute

n_head = n_head

n_inner instance-attribute

n_inner = n_inner

n_layer instance-attribute

n_layer = n_layer

n_positions instance-attribute

n_positions = n_positions

position_embedding_type instance-attribute

position_embedding_type = position_embedding_type

reorder_and_upcast_attn instance-attribute

reorder_and_upcast_attn = reorder_and_upcast_attn

resid_pdrop instance-attribute

resid_pdrop = resid_pdrop

scale_attn_by_inverse_layer_idx instance-attribute

scale_attn_by_inverse_layer_idx = (
    scale_attn_by_inverse_layer_idx
)

scale_attn_weights instance-attribute

scale_attn_weights = scale_attn_weights

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=None,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-05,
    initializer_range=0.02,
    scale_attn_weights=True,
    use_cache=True,
    bos_token_id=50256,
    eos_token_id=50256,
    scale_attn_by_inverse_layer_idx=False,
    reorder_and_upcast_attn=False,
    position_embedding_type="learned",
    mup_width_scale=1.0,
    mup_embeddings_scale=1.0,
    mup_output_alpha=1.0,
    mup_scale_qk_dot_by_d=False,
    alibi_scaling=None,
    architectures=None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/jais.py
def __init__(
    self,
    vocab_size=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=None,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    scale_attn_weights=True,
    use_cache=True,
    bos_token_id=50256,
    eos_token_id=50256,
    scale_attn_by_inverse_layer_idx=False,
    reorder_and_upcast_attn=False,
    position_embedding_type="learned",
    mup_width_scale=1.0,
    mup_embeddings_scale=1.0,
    mup_output_alpha=1.0,
    mup_scale_qk_dot_by_d=False,
    alibi_scaling=None,
    architectures=None,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.n_positions = n_positions
    self.n_embd = n_embd
    self.n_layer = n_layer
    self.n_head = n_head
    self.n_inner = n_inner
    self.activation_function = activation_function
    self.resid_pdrop = resid_pdrop
    self.embd_pdrop = embd_pdrop
    self.attn_pdrop = attn_pdrop
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range
    self.scale_attn_weights = scale_attn_weights
    self.use_cache = use_cache
    self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
    self.reorder_and_upcast_attn = reorder_and_upcast_attn

    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id

    self.position_embedding_type = position_embedding_type
    self.mup_width_scale = mup_width_scale
    self.mup_embeddings_scale = mup_embeddings_scale
    self.mup_output_alpha = mup_output_alpha
    self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

    self.alibi_scaling = alibi_scaling
    self._alibi_scaling_validation()
    if architectures is None:
        architectures = ["JAISLMHeadModel"]

    super().__init__(
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        architectures=architectures,
        **kwargs,
    )

_alibi_scaling_validation

_alibi_scaling_validation()

Validate the alibi_scaling configuration.

Source code in vllm/transformers_utils/configs/jais.py
def _alibi_scaling_validation(self):
    """
    Validate the `alibi_scaling` configuration.
    """
    if self.alibi_scaling is None:
        return

    if (not isinstance(self.alibi_scaling, dict)
            or len(self.alibi_scaling) != 2):
        raise ValueError(
            "`alibi_scaling` must be a dictionary with two fields, "
            "`type` and `factor` or `type` and `train_seq_len`, "
            f"got {self.alibi_scaling}")
    alibi_scaling_type = self.alibi_scaling.get("type", None)
    alibi_scaling_factor = self.alibi_scaling.get("factor", None)
    alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
    if alibi_scaling_type is None or alibi_scaling_type != "linear":
        raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
                         f"got {alibi_scaling_type}")
    if (alibi_scaling_factor is not None
            and not isinstance(alibi_scaling_factor, float)
            or (alibi_scaling_factor is not None
                and alibi_scaling_factor <= 1.0)):
        raise ValueError(
            f"`alibi_scaling`'s factor field must be a float > 1.0, "
            f"got {alibi_scaling_factor}")
    if (alibi_dynamic_scaling is not None
            and not isinstance(alibi_dynamic_scaling, int)
            or (alibi_dynamic_scaling is not None
                and alibi_dynamic_scaling <= 1)):
        raise ValueError(
            f"`alibi_scaling`'s `train_seq_len` field must be an "
            f"integer > 1, got {alibi_dynamic_scaling}")

KimiVLConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/kimi_vl.py
class KimiVLConfig(PretrainedConfig):
    model_type = "kimi_vl"

    def __init__(self,
                 vision_config: Optional[Union[dict, MoonViTConfig]] = None,
                 text_config: Optional[Union[dict, DeepseekV2Config]] = None,
                 ignore_index: int = -100,
                 media_placeholder_token_id: int = 163605,
                 pad_token_id: int = 0,
                 **kwargs):
        if vision_config is None:
            vision_config = MoonViTConfig()
        elif isinstance(vision_config, dict):
            vision_config = MoonViTConfig(**vision_config)
        self.vision_config = vision_config

        if text_config is None:
            text_config = DeepseekV2Config()
        elif isinstance(text_config, dict):
            text_config = DeepseekV2Config(**text_config)
        self.text_config = text_config

        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id

        super().__init__(pad_token_id=pad_token_id, **kwargs)

ignore_index instance-attribute

ignore_index = ignore_index

media_placeholder_token_id instance-attribute

media_placeholder_token_id = media_placeholder_token_id

model_type class-attribute instance-attribute

model_type = 'kimi_vl'

text_config instance-attribute

text_config = text_config

vision_config instance-attribute

vision_config = vision_config

__init__

__init__(
    vision_config: Optional[
        Union[dict, MoonViTConfig]
    ] = None,
    text_config: Optional[
        Union[dict, DeepseekV2Config]
    ] = None,
    ignore_index: int = -100,
    media_placeholder_token_id: int = 163605,
    pad_token_id: int = 0,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/kimi_vl.py
def __init__(self,
             vision_config: Optional[Union[dict, MoonViTConfig]] = None,
             text_config: Optional[Union[dict, DeepseekV2Config]] = None,
             ignore_index: int = -100,
             media_placeholder_token_id: int = 163605,
             pad_token_id: int = 0,
             **kwargs):
    if vision_config is None:
        vision_config = MoonViTConfig()
    elif isinstance(vision_config, dict):
        vision_config = MoonViTConfig(**vision_config)
    self.vision_config = vision_config

    if text_config is None:
        text_config = DeepseekV2Config()
    elif isinstance(text_config, dict):
        text_config = DeepseekV2Config(**text_config)
    self.text_config = text_config

    self.ignore_index = ignore_index
    self.media_placeholder_token_id = media_placeholder_token_id

    super().__init__(pad_token_id=pad_token_id, **kwargs)
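
Example (a hedged sketch, not from the vLLM docs): both sub-configs accept either an already-built config object or a plain dict.

>>> from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
>>> cfg = KimiVLConfig(vision_config={"patch_size": 14},
...                    text_config={"vocab_size": 163840})
>>> isinstance(cfg.vision_config, MoonViTConfig)
True
>>> cfg.media_placeholder_token_id
163605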

MLPSpeculatorConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/mlp_speculator.py
class MLPSpeculatorConfig(PretrainedConfig):
    model_type = "mlp_speculator"

    attribute_map = {
        "hidden_size": "emb_dim",
    }

    def __init__(self,
                 vocab_size: int = 32000,
                 emb_dim: int = 4096,
                 inner_dim: int = 0,
                 n_predict: int = 3,
                 top_k_tokens_per_head: Optional[list[int]] = None,
                 n_candidates: int = 5,
                 tie_weights: bool = False,
                 scale_input: bool = False,
                 **kwargs):
        """
        Initialize an MLPSpeculatorConfig

        Args:
            vocab_size: int
                the model vocab size
            emb_dim: int
                the model embedding dimension
            inner_dim: int
                the inner dimension of the model. If 0, will be the emb_dim.
            n_predict: int
                the number of lookaheads for the speculator
            top_k_tokens_per_head: list[int]
                Number of tokens to consider from each head when forming the
                candidate tree.
                For each candidate branch in the tree, head n produces topk[n]
                additional sub-branches.
                NOTE: This parameter is currently unused.
            n_candidates: int
                number of child candidates to create per sequence
            tie_weights: bool
                If true, use a single set of weights for every model
                head/stage after the first. The initial projection
                from the base model may have a different size, so that
                stays separate.
            scale_input: bool
                if True, will scale the initial hidden states from
                the base model.
        """
        if top_k_tokens_per_head is None:
            top_k_tokens_per_head = [5, 4, 3]
        assert len(top_k_tokens_per_head) == n_predict
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.inner_dim = inner_dim
        self.n_predict = n_predict
        self.top_k_tokens_per_head = top_k_tokens_per_head
        self.n_candidates = n_candidates
        self.num_lookahead_tokens = n_predict
        self.tie_weights = tie_weights
        self.scale_input = scale_input

        super().__init__(**kwargs)

attribute_map class-attribute instance-attribute

attribute_map = {'hidden_size': 'emb_dim'}

emb_dim instance-attribute

emb_dim = emb_dim

inner_dim instance-attribute

inner_dim = inner_dim

model_type class-attribute instance-attribute

model_type = 'mlp_speculator'

n_candidates instance-attribute

n_candidates = n_candidates

n_predict instance-attribute

n_predict = n_predict

num_lookahead_tokens instance-attribute

num_lookahead_tokens = n_predict

scale_input instance-attribute

scale_input = scale_input

tie_weights instance-attribute

tie_weights = tie_weights

top_k_tokens_per_head instance-attribute

top_k_tokens_per_head = top_k_tokens_per_head

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size: int = 32000,
    emb_dim: int = 4096,
    inner_dim: int = 0,
    n_predict: int = 3,
    top_k_tokens_per_head: Optional[list[int]] = None,
    n_candidates: int = 5,
    tie_weights: bool = False,
    scale_input: bool = False,
    **kwargs,
)

Initialize an MLPSpeculatorConfig

Parameters:

vocab_size (`int`, defaults to 32000):
    The model vocab size.
emb_dim (`int`, defaults to 4096):
    The model embedding dimension.
inner_dim (`int`, defaults to 0):
    The inner dimension of the model. If 0, will be the emb_dim.
n_predict (`int`, defaults to 3):
    The number of lookaheads for the speculator.
top_k_tokens_per_head (`Optional[list[int]]`, defaults to None):
    Number of tokens to consider from each head when forming the candidate tree. For each candidate branch in the tree, head n produces topk[n] additional sub-branches. NOTE: This parameter is currently unused.
n_candidates (`int`, defaults to 5):
    Number of child candidates to create per sequence.
tie_weights (`bool`, defaults to False):
    If true, use a single set of weights for every model head/stage after the first. The initial projection from the base model may have a different size, so that stays separate.
scale_input (`bool`, defaults to False):
    If True, will scale the initial hidden states from the base model.
Source code in vllm/transformers_utils/configs/mlp_speculator.py
def __init__(self,
             vocab_size: int = 32000,
             emb_dim: int = 4096,
             inner_dim: int = 0,
             n_predict: int = 3,
             top_k_tokens_per_head: Optional[list[int]] = None,
             n_candidates: int = 5,
             tie_weights: bool = False,
             scale_input: bool = False,
             **kwargs):
    """
    Initialize an MLPSpeculatorConfig

    Args:
        vocab_size: int
            the model vocab size
        emb_dim: int
            the model embedding dimension
        inner_dim: int
            the inner dimension of the model. If 0, will be the emb_dim.
        n_predict: int
            the number of lookaheads for the speculator
        top_k_tokens_per_head: list[int]
            Number of tokens to consider from each head when forming the
            candidate tree.
            For each candidate branch in the tree, head n produces topk[n]
            additional sub-branches.
            NOTE: This parameter is currently unused.
        n_candidates: int
            number of child candidates to create per sequence
        tie_weights: bool
            If true, use a single set of weights for every model
            head/stage after the first. The initial projection
            from the base model may have a different size, so that
            stays separate.
        scale_input: bool
            if True, will scale the initial hidden states from
            the base model.
    """
    if top_k_tokens_per_head is None:
        top_k_tokens_per_head = [5, 4, 3]
    assert len(top_k_tokens_per_head) == n_predict
    self.vocab_size = vocab_size
    self.emb_dim = emb_dim
    self.inner_dim = inner_dim
    self.n_predict = n_predict
    self.top_k_tokens_per_head = top_k_tokens_per_head
    self.n_candidates = n_candidates
    self.num_lookahead_tokens = n_predict
    self.tie_weights = tie_weights
    self.scale_input = scale_input

    super().__init__(**kwargs)
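
Example (a hedged sketch, not from the vLLM docs): `top_k_tokens_per_head` must contain exactly `n_predict` entries, and `hidden_size` is aliased to `emb_dim` through `attribute_map`.

>>> from vllm.transformers_utils.configs import MLPSpeculatorConfig
>>> cfg = MLPSpeculatorConfig(n_predict=4, top_k_tokens_per_head=[5, 4, 3, 2])
>>> cfg.num_lookahead_tokens
4
>>> cfg.hidden_size    # alias for emb_dim
4096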

MedusaConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/medusa.py
class MedusaConfig(PretrainedConfig):
    model_type = "medusa"

    def __init__(self,
                 hidden_size: int = 4096,
                 vocab_size: int = 32001,
                 num_heads: int = 5,
                 num_hidden_layers: int = 1,
                 max_paths: int = 64,
                 topk: int = 10,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):

        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_hidden_layers = num_hidden_layers
        self.max_paths = max_paths
        self.topk = topk
        self.max_seq_len = int(2**20)
        self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
            else truncated_vocab_size
        if "architectures" not in kwargs:
            kwargs["architectures"] = ["MedusaModel"]

        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "MedusaConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        for k in list(config_dict.keys()):
            if 'num' in k:
                if 'heads' in k:
                    config_dict["num_heads"] = config_dict.pop(k)
                elif 'layers' in k:
                    config_dict["num_hidden_layers"] = config_dict.pop(k)
        return cls.from_dict(config_dict, **kwargs)

    @property
    def num_attention_heads(self):
        return 0

    @property
    def num_lookahead_tokens(self):
        return self.num_heads

    @num_lookahead_tokens.setter
    def num_lookahead_tokens(self, num_lookahead_tokens: int):
        self.num_heads = num_lookahead_tokens

hidden_size instance-attribute

hidden_size = hidden_size

max_paths instance-attribute

max_paths = max_paths

max_seq_len instance-attribute

max_seq_len = int(2 ** 20)

model_type class-attribute instance-attribute

model_type = 'medusa'

num_attention_heads property

num_attention_heads

num_heads instance-attribute

num_heads = num_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_lookahead_tokens property writable

num_lookahead_tokens

topk instance-attribute

topk = topk

truncated_vocab_size instance-attribute

truncated_vocab_size = (
    vocab_size
    if truncated_vocab_size is None
    else truncated_vocab_size
)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    hidden_size: int = 4096,
    vocab_size: int = 32001,
    num_heads: int = 5,
    num_hidden_layers: int = 1,
    max_paths: int = 64,
    topk: int = 10,
    truncated_vocab_size: Optional[int] = None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/medusa.py
def __init__(self,
             hidden_size: int = 4096,
             vocab_size: int = 32001,
             num_heads: int = 5,
             num_hidden_layers: int = 1,
             max_paths: int = 64,
             topk: int = 10,
             truncated_vocab_size: Optional[int] = None,
             **kwargs):

    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.num_heads = num_heads
    self.num_hidden_layers = num_hidden_layers
    self.max_paths = max_paths
    self.topk = topk
    self.max_seq_len = int(2**20)
    self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
        else truncated_vocab_size
    if "architectures" not in kwargs:
        kwargs["architectures"] = ["MedusaModel"]

    super().__init__(**kwargs)

from_pretrained classmethod

from_pretrained(
    pretrained_model_name_or_path: Union[str, PathLike],
    **kwargs,
) -> MedusaConfig
Source code in vllm/transformers_utils/configs/medusa.py
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    **kwargs,
) -> "MedusaConfig":
    config_dict, kwargs = cls.get_config_dict(
        pretrained_model_name_or_path, **kwargs)
    for k in list(config_dict.keys()):
        if 'num' in k:
            if 'heads' in k:
                config_dict["num_heads"] = config_dict.pop(k)
            elif 'layers' in k:
                config_dict["num_hidden_layers"] = config_dict.pop(k)
    return cls.from_dict(config_dict, **kwargs)
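
Example (a hedged sketch, not from the vLLM docs): `num_lookahead_tokens` is a read/write proxy for `num_heads`.

>>> from vllm.transformers_utils.configs import MedusaConfig
>>> cfg = MedusaConfig(num_heads=4)
>>> cfg.num_lookahead_tokens
4
>>> cfg.num_lookahead_tokens = 6
>>> cfg.num_heads
6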

MoonViTConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/moonvit.py
class MoonViTConfig(PretrainedConfig):
    model_type = "moonvit"

    def __init__(
            self,
            patch_size: int = 14,
            init_pos_emb_height: int = 64,
            init_pos_emb_width: int = 64,
            num_attention_heads: int = 16,
            num_hidden_layers: int = 27,
            hidden_size: int = 1152,
            intermediate_size: int = 4304,
            merge_kernel_size: tuple[int, int] = (2, 2),
            **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        # Positional embedding config
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width
        # Transformer config
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        # Patch merger config
        self.merge_kernel_size = merge_kernel_size

hidden_size instance-attribute

hidden_size = hidden_size

init_pos_emb_height instance-attribute

init_pos_emb_height = init_pos_emb_height

init_pos_emb_width instance-attribute

init_pos_emb_width = init_pos_emb_width

intermediate_size instance-attribute

intermediate_size = intermediate_size

merge_kernel_size instance-attribute

merge_kernel_size = merge_kernel_size

model_type class-attribute instance-attribute

model_type = 'moonvit'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

patch_size instance-attribute

patch_size = patch_size

__init__

__init__(
    patch_size: int = 14,
    init_pos_emb_height: int = 64,
    init_pos_emb_width: int = 64,
    num_attention_heads: int = 16,
    num_hidden_layers: int = 27,
    hidden_size: int = 1152,
    intermediate_size: int = 4304,
    merge_kernel_size: tuple[int, int] = (2, 2),
    **kwargs,
)
Source code in vllm/transformers_utils/configs/moonvit.py
def __init__(
        self,
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        num_attention_heads: int = 16,
        num_hidden_layers: int = 27,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        merge_kernel_size: tuple[int, int] = (2, 2),
        **kwargs,
):
    super().__init__(**kwargs)
    self.patch_size = patch_size
    # Positional embedding config
    self.init_pos_emb_height = init_pos_emb_height
    self.init_pos_emb_width = init_pos_emb_width
    # Transformer config
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    # Patch merger config
    self.merge_kernel_size = merge_kernel_size
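
Example (a hedged sketch, not from the vLLM docs): the defaults mirror the values shown in __init__ above.

>>> from vllm.transformers_utils.configs import MoonViTConfig
>>> vit = MoonViTConfig()
>>> vit.hidden_size, vit.num_hidden_layers, vit.merge_kernel_size
(1152, 27, (2, 2))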

NemotronConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [NemotronModel]. It is used to instantiate a Nemotron model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a configuration similar to that of Nemotron-8B.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

vocab_size (`int`, *optional*, defaults to 256000):
    Vocabulary size of the Nemotron model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [NemotronModel].
hidden_size (`int`, *optional*, defaults to 6144):
    Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 24576):
    Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
    Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 48):
    Number of attention heads for each attention layer in the Transformer decoder.
head_dim (`int`, *optional*, defaults to None):
    Projection weights dimension in multi-head attention. Set to `hidden_size // num_attention_heads` if None.
num_key_value_heads (`int`, *optional*, defaults to None):
    This is the number of key_value heads that should be used to implement Grouped Query Attention. If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group. For more details check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If not specified, defaults to `num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
    The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 4096):
    The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.0134):
    The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
norm_eps (`float`, *optional*, defaults to 1e-05):
    The epsilon used by the normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
    Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to None):
    Padding token id.
bos_token_id (`int`, *optional*, defaults to 2):
    Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 3):
    End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
    Whether to tie weight embeddings.
rope_theta (`float`, *optional*, defaults to 10000.0):
    The base period of the RoPE embeddings.
partial_rotary_factor (`float`, *optional*, defaults to 0.5):
    Percentage of the query and keys which will have rotary embedding.
attention_bias (`bool`, *optional*, defaults to `False`):
    Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
    The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
    Whether to use a bias in up_proj and down_proj layers in the MLP layers.

Example:
>>> from transformers import NemotronModel, NemotronConfig
>>> # Initializing a Nemotron nemotron-15b style configuration
>>> configuration = NemotronConfig()
>>> # Initializing a model from the nemotron-15b style configuration
>>> model = NemotronModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
Source code in vllm/transformers_utils/configs/nemotron.py
class NemotronConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`NemotronModel`]. It is used to instantiate an Nemotron model
    according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the Nemotron-8B.

    Configuration objects inherit from [`PretrainedConfig`] and can be
    used to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Nemotron model. Defines the number of
            different tokens that can be represented by the
            `inputs_ids` passed when calling [`NemotronModel`]
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        head_dim (`int`, *optional*):
            Projection weights dimension in multi-head attention. Set to
            hidden_size // num_attention_heads if None
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention
            (MQA) otherwise GQA is used. When converting a multi-head
            checkpoint to a GQA checkpoint, each group key and value
            head should be constructed by meanpooling all the original
            heads within that group. For more details check out
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
            is not specified, will default to `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
            The non-linear activation function (function or string) in the
            decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        initializer_range (`float`, *optional*, defaults to 0.0134):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 3):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
            Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output
            projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj and down_proj layers in the MLP
            layers.

    ```python
    >>> from transformers import NemotronModel, NemotronConfig
    >>> # Initializing a Nemotron nemotron-15b style configuration
    >>> configuration = NemotronConfig()
    >>> # Initializing a model from the nemotron-15b style configuration
    >>> model = NemotronModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "nemotron"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=6144,
        intermediate_size=24576,
        num_hidden_layers=32,
        num_attention_heads=48,
        head_dim=None,
        num_key_value_heads=None,
        hidden_act="relu2",
        max_position_embeddings=4096,
        initializer_range=0.0134,
        norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=2,
        eos_token_id=3,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.5,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        head_dim = head_dim or kwargs.get("kv_channels")
        self.head_dim = head_dim if head_dim is not None else (
            hidden_size // num_attention_heads)

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # for backward compatibility
        partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
            "rope_percentage") or partial_rotary_factor
        self.partial_rotary_factor = partial_rotary_factor
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(
                self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, "
                f"`type` and `factor`, got {self.rope_scaling}")
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in [
                "linear", "dynamic"
        ]:
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', "
                f"'dynamic'], got {rope_scaling_type}")
        if rope_scaling_factor is None or not isinstance(
                rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1, got "
                f"{rope_scaling_factor}")

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

head_dim instance-attribute

head_dim = (
    head_dim
    if head_dim is not None
    else hidden_size // num_attention_heads
)

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

mlp_bias instance-attribute

mlp_bias = mlp_bias

model_type class-attribute instance-attribute

model_type = 'nemotron'

norm_eps instance-attribute

norm_eps = norm_eps

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

partial_rotary_factor instance-attribute

partial_rotary_factor = partial_rotary_factor

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=256000,
    hidden_size=6144,
    intermediate_size=24576,
    num_hidden_layers=32,
    num_attention_heads=48,
    head_dim=None,
    num_key_value_heads=None,
    hidden_act="relu2",
    max_position_embeddings=4096,
    initializer_range=0.0134,
    norm_eps=1e-05,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=2,
    eos_token_id=3,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    partial_rotary_factor=0.5,
    attention_bias=False,
    attention_dropout=0.0,
    mlp_bias=False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/nemotron.py
def __init__(
    self,
    vocab_size=256000,
    hidden_size=6144,
    intermediate_size=24576,
    num_hidden_layers=32,
    num_attention_heads=48,
    head_dim=None,
    num_key_value_heads=None,
    hidden_act="relu2",
    max_position_embeddings=4096,
    initializer_range=0.0134,
    norm_eps=1e-5,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=2,
    eos_token_id=3,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    partial_rotary_factor=0.5,
    attention_bias=False,
    attention_dropout=0.0,
    mlp_bias=False,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    head_dim = head_dim or kwargs.get("kv_channels")
    self.head_dim = head_dim if head_dim is not None else (
        hidden_size // num_attention_heads)

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.norm_eps = norm_eps
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    # for backward compatibility
    partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
        "rope_percentage") or partial_rotary_factor
    self.partial_rotary_factor = partial_rotary_factor
    self._rope_scaling_validation()
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    self.mlp_bias = mlp_bias

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
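
The constructor above derives `head_dim` from `hidden_size // num_attention_heads` when it is not given, and also accepts the legacy `kv_channels` keyword as an alias. A minimal sketch of that behavior:

```python
from vllm.transformers_utils.configs import NemotronConfig

cfg = NemotronConfig()                        # defaults: hidden_size=6144, num_attention_heads=48
print(cfg.head_dim)                           # 128 == 6144 // 48

cfg_legacy = NemotronConfig(kv_channels=160)  # backward-compatible alias for head_dim
print(cfg_legacy.head_dim)                    # 160
```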

_rope_scaling_validation

_rope_scaling_validation()

Validate the rope_scaling configuration.

Source code in vllm/transformers_utils/configs/nemotron.py
def _rope_scaling_validation(self):
    """
    Validate the `rope_scaling` configuration.
    """
    if self.rope_scaling is None:
        return

    if not isinstance(self.rope_scaling, dict) or len(
            self.rope_scaling) != 2:
        raise ValueError(
            "`rope_scaling` must be a dictionary with two fields, "
            f"`type` and `factor`, got {self.rope_scaling}")
    rope_scaling_type = self.rope_scaling.get("type", None)
    rope_scaling_factor = self.rope_scaling.get("factor", None)
    if rope_scaling_type is None or rope_scaling_type not in [
            "linear", "dynamic"
    ]:
        raise ValueError(
            "`rope_scaling`'s type field must be one of ['linear', "
            f"'dynamic'], got {rope_scaling_type}")
    if rope_scaling_factor is None or not isinstance(
            rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
        raise ValueError(
            "`rope_scaling`'s factor field must be a float > 1, got "
            f"{rope_scaling_factor}")

NemotronHConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [NemotronHModel]. It is used to instantiate a NemotronH model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the NemotronH-v0.1 model.

Parameters:

vocab_size (`int`, *optional*, defaults to 131072): Vocabulary size of the NemotronH model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [NemotronHModel].
tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has an output word embedding layer.
hidden_size (`int`, *optional*, defaults to 4096): Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 21504): Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 52): Number of hidden layers in the Transformer encoder.
hybrid_override_pattern (`str`, *optional*, defaults to `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`): The pattern of the hybrid model. The pattern is a string of characters where each character represents M: Mamba2, *: Attention, -: MLP.
num_attention_heads (`int`, *optional*, defaults to 32): Number of attention heads for each attention layer in the Transformer encoder.
attention_head_dim (`int`, *optional*, defaults to 128): Dimension of each attention head.
num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used.
mlp_hidden_act (`str`, *optional*, defaults to `"relu2"`): The non-linear activation function in the MLP layers.
attention_bias (`bool`, *optional*, defaults to `False`): Whether to use bias in attention layers.
mlp_bias (`bool`, *optional*, defaults to `False`): Whether to use bias in MLP layers.
use_bias (`bool`, *optional*, defaults to `False`): Whether to use bias in the model.
initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): The epsilon used by the layer normalization layers.
residual_in_fp32 (`bool`, *optional*, defaults to `False`): Whether or not residuals should be in `float32`. If set to `False`, residuals will keep the same `dtype` as the rest of the model.
use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`.
num_logits_to_keep (`int` or `None`, *optional*, defaults to 1): Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an integer value, only the last `num_logits_to_keep` logits will be calculated.
pad_token_id (`int`, *optional*, defaults to 0): The id of the padding token.
bos_token_id (`int`, *optional*, defaults to 1): The id of the "beginning-of-sequence" token.
eos_token_id (`int`, *optional*, defaults to 2): The id of the "end-of-sequence" token.
sliding_window (`int`, *optional*, defaults to `None`): Sliding window attention window size.
max_position_embeddings (`int`, *optional*, defaults to 4096): The maximum sequence length that this model might ever be used with.
attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities.
hidden_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the hidden states.
use_mamba_kernels (`bool`, *optional*, defaults to `True`): Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and `causal-conv1d` are installed, and the mamba modules are running on a CUDA device.
ssm_state_size (`int`, *optional*, defaults to 128): The dimension of the mamba state space latents.
mamba_num_heads (`int`, *optional*, defaults to 128): Number of heads in Mamba layers.
mamba_n_groups (`int`, *optional*, defaults to 8): Number of groups in Mamba layers.
mamba_head_dim (`int`, *optional*, defaults to 64): Dimension of each Mamba head.
mamba_d_conv (`int`, *optional*, defaults to 4): The size of the mamba convolution kernel.
mamba_expand (`int`, *optional*, defaults to 2): Expanding factor used to determine the mamba intermediate size.
mamba_hidden_act (`str`, *optional*, defaults to `"silu"`): The non-linear activation function in the Mamba layers.
mamba_dt_min (`float`, *optional*, defaults to 0.001): Minimum value for the time step in Mamba.
mamba_dt_max (`float`, *optional*, defaults to 0.1): Maximum value for the time step in Mamba.
mamba_dt_limit (`tuple`, *optional*, defaults to `(0.0, float("inf"))`): Limits for the time step in Mamba.
mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4): Floor value for time step initialization in Mamba.
mamba_conv_bias (`bool`, *optional*, defaults to `True`): Whether to use bias in the convolution layer of the mamba mixer block.
mamba_proj_bias (`bool`, *optional*, defaults to `False`): Whether to use bias in the input and output projections of the mamba mixer block.
mamba_chunk_size (`int`, *optional*, defaults to 256): Size of chunks for Mamba processing.
rescale_prenorm_residual (`bool`, *optional*, defaults to `True`): Whether to rescale the pre-normalization residual connections.

Source code in vllm/transformers_utils/configs/nemotron_h.py
class NemotronHConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
    to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to
    that of the NemotronH-v0.1 model.
    Args:
        vocab_size (`int`, *optional*, defaults to 131072):
            Vocabulary size of the NemotronH model. Defines the number of
            different tokens that can be represented by the `inputs_ids`
            passed when calling [`NemotronHModel`]
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be
            tied. Note that this is only relevant if the model has a output
            word embedding layer.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 52):
            Number of hidden layers in the Transformer encoder.
        hybrid_override_pattern (`str`, *optional*, defaults to
            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
            The pattern of the hybrid model. The pattern is a string of
            characters where each character represents
            M: Mamba2, *: Attention, -: MLP
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        attention_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each attention head.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
            will use Multi Query Attention (MQA) otherwise GQA is used.
        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
            The non-linear activation function in the MLP layers.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in attention layers.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in MLP layers.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the model.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
            Whether or not residuals should be in `float32`. If set to `False`
            residuals will keep the same `dtype` as the rest of the model.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
            Number of prompt logits to calculate during generation. If `None`,
            all logits will be calculated. If an integer value, only last
            `num_logits_to_keep` logits will be calculated.
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        sliding_window (`int`, *optional*, defaults to None):
            Sliding window attention window size.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the hidden states.
        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
            Flag indicating whether or not to use the fast mamba kernels.
            These are available only if `mamba-ssm` and `causal-conv1d`
            are installed, and the mamba modules are running on a CUDA device.
        ssm_state_size (`int`, *optional*, defaults to 128):
            The dimension of the mamba state space latents.
        mamba_num_heads (`int`, *optional*, defaults to 128):
            Number of heads in Mamba layers.
        mamba_n_groups (`int`, *optional*, defaults to 8):
            Number of groups in Mamba layers.
        mamba_head_dim (`int`, *optional*, defaults to 64):
            Dimension of each Mamba head.
        mamba_d_conv (`int`, *optional*, defaults to 4):
            The size of the mamba convolution kernel.
        mamba_expand (`int`, *optional*, defaults to 2):
            Expanding factor used to determine the mamba intermediate size.
        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
            The non-linear activation function in the Mamba layers.
        mamba_dt_min (`float`, *optional*, defaults to 0.001):
            Minimum value for the time step in Mamba.
        mamba_dt_max (`float`, *optional*, defaults to 0.1):
            Maximum value for the time step in Mamba.
        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
            Limits for the time step in Mamba.
        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
            Floor value for time step initialization in Mamba.
        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the convolution layer of the mamba mixer
            block.
        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the input and output projections of the
            mamba mixer block.
        mamba_chunk_size (`int`, *optional*, defaults to 256):
            Size of chunks for Mamba processing.
        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
            Whether to rescale the pre-normalization residual connections.
    """

    model_type = "nemotron_h"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=131072,
        tie_word_embeddings=False,
        hidden_size=4096,
        intermediate_size=21504,
        num_hidden_layers=52,
        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
        num_attention_heads=32,
        head_dim=128,
        num_key_value_heads=8,  # nemo: num_query_groups
        mlp_hidden_act="relu2",
        attention_bias=False,
        mlp_bias=False,
        use_bias=False,
        initializer_range=0.02,  # nemo: init_method_std
        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
        residual_in_fp32=False,  #  Megatron Core default value
        use_cache=True,
        num_logits_to_keep=1,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        sliding_window=None,
        max_position_embeddings=4096,
        attention_dropout=0.0,
        hidden_dropout=0.0,  # * ADDED
        use_mamba_kernels=True,
        ssm_state_size=128,  # mamba_state_size
        mamba_num_heads=128,
        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
        mamba_head_dim=64,
        mamba_d_conv=4,
        mamba_expand=2,
        mamba_hidden_act="silu",
        mamba_dt_min=0.001,
        mamba_dt_max=0.1,
        mamba_dt_limit=(0.0, float("inf")),
        mamba_dt_init_floor=1e-4,
        mamba_conv_bias=True,
        mamba_proj_bias=False,
        mamba_chunk_size=256,
        rescale_prenorm_residual=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.tie_word_embeddings = tie_word_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.hybrid_override_pattern = hybrid_override_pattern
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.sliding_window = sliding_window
        self.max_position_embeddings = max_position_embeddings
        self.attention_dropout = attention_dropout
        self.hidden_dropout = hidden_dropout

        # Validate hybrid_override_pattern
        # M: Mamba2, *: Attention, -: MLP
        assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
            "hybrid_override_pattern must have same length as "
            "num_hidden_layers")
        assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
            "hybrid_override_pattern must only contain characters "
            "'M', '*', or '-'")

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.mlp_hidden_act = mlp_hidden_act
        self.attention_bias = attention_bias
        self.mlp_bias = mlp_bias
        self.use_bias = use_bias
        self.initializer_range = initializer_range
        self.layer_norm_epsilon = layer_norm_epsilon
        self.residual_in_fp32 = residual_in_fp32

        self.use_cache = use_cache
        self.num_logits_to_keep = num_logits_to_keep

        self.use_mamba_kernels = use_mamba_kernels
        self.n_groups = mamba_n_groups
        self.mamba_head_dim = mamba_head_dim
        self.ssm_state_size = ssm_state_size
        self.mamba_num_heads = mamba_num_heads
        self.conv_kernel = mamba_d_conv
        self.expand = mamba_expand
        self.mamba_hidden_act = mamba_hidden_act
        self.time_step_min = mamba_dt_min
        self.time_step_max = mamba_dt_max
        self.time_step_limit = mamba_dt_limit
        self.time_step_floor = mamba_dt_init_floor
        self.use_conv_bias = mamba_conv_bias
        self.mamba_proj_bias = mamba_proj_bias
        self.chunk_size = mamba_chunk_size
        self.rescale_prenorm_residual = rescale_prenorm_residual

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def layers_block_type(self):
        return [
            "mamba" if self.hybrid_override_pattern[i] == "M" else
            "attention" if self.hybrid_override_pattern[i] == "*" else "mlp"
            for i in range(self.num_hidden_layers)
        ]
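
The `layers_block_type` property above maps each character of `hybrid_override_pattern` to a layer kind (`M` → Mamba2, `*` → attention, `-` → MLP). A minimal sketch using a short pattern whose length matches `num_hidden_layers`, so the constructor's validation passes:

```python
from vllm.transformers_utils.configs import NemotronHConfig

cfg = NemotronHConfig(num_hidden_layers=4, hybrid_override_pattern="M*M-")
print(cfg.layers_block_type)  # ['mamba', 'attention', 'mamba', 'mlp']
```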

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

chunk_size instance-attribute

chunk_size = mamba_chunk_size

conv_kernel instance-attribute

conv_kernel = mamba_d_conv

expand instance-attribute

expand = mamba_expand

head_dim instance-attribute

head_dim = head_dim

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size

hybrid_override_pattern instance-attribute

hybrid_override_pattern = hybrid_override_pattern

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

layers_block_type property

layers_block_type

mamba_head_dim instance-attribute

mamba_head_dim = mamba_head_dim

mamba_hidden_act instance-attribute

mamba_hidden_act = mamba_hidden_act

mamba_num_heads instance-attribute

mamba_num_heads = mamba_num_heads

mamba_proj_bias instance-attribute

mamba_proj_bias = mamba_proj_bias

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

mlp_bias instance-attribute

mlp_bias = mlp_bias

mlp_hidden_act instance-attribute

mlp_hidden_act = mlp_hidden_act

model_type class-attribute instance-attribute

model_type = 'nemotron_h'

n_groups instance-attribute

n_groups = mamba_n_groups

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

num_logits_to_keep instance-attribute

num_logits_to_keep = num_logits_to_keep

rescale_prenorm_residual instance-attribute

rescale_prenorm_residual = rescale_prenorm_residual

residual_in_fp32 instance-attribute

residual_in_fp32 = residual_in_fp32

sliding_window instance-attribute

sliding_window = sliding_window

ssm_state_size instance-attribute

ssm_state_size = ssm_state_size

tie_word_embeddings instance-attribute

tie_word_embeddings = tie_word_embeddings

time_step_floor instance-attribute

time_step_floor = mamba_dt_init_floor

time_step_limit instance-attribute

time_step_limit = mamba_dt_limit

time_step_max instance-attribute

time_step_max = mamba_dt_max

time_step_min instance-attribute

time_step_min = mamba_dt_min

use_bias instance-attribute

use_bias = use_bias

use_cache instance-attribute

use_cache = use_cache

use_conv_bias instance-attribute

use_conv_bias = mamba_conv_bias

use_mamba_kernels instance-attribute

use_mamba_kernels = use_mamba_kernels

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=131072,
    tie_word_embeddings=False,
    hidden_size=4096,
    intermediate_size=21504,
    num_hidden_layers=52,
    hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
    num_attention_heads=32,
    head_dim=128,
    num_key_value_heads=8,
    mlp_hidden_act="relu2",
    attention_bias=False,
    mlp_bias=False,
    use_bias=False,
    initializer_range=0.02,
    layer_norm_epsilon=1e-05,
    residual_in_fp32=False,
    use_cache=True,
    num_logits_to_keep=1,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    sliding_window=None,
    max_position_embeddings=4096,
    attention_dropout=0.0,
    hidden_dropout=0.0,
    use_mamba_kernels=True,
    ssm_state_size=128,
    mamba_num_heads=128,
    mamba_n_groups=8,
    mamba_head_dim=64,
    mamba_d_conv=4,
    mamba_expand=2,
    mamba_hidden_act="silu",
    mamba_dt_min=0.001,
    mamba_dt_max=0.1,
    mamba_dt_limit=(0.0, float("inf")),
    mamba_dt_init_floor=0.0001,
    mamba_conv_bias=True,
    mamba_proj_bias=False,
    mamba_chunk_size=256,
    rescale_prenorm_residual=True,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/nemotron_h.py
def __init__(
    self,
    vocab_size=131072,
    tie_word_embeddings=False,
    hidden_size=4096,
    intermediate_size=21504,
    num_hidden_layers=52,
    hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
    num_attention_heads=32,
    head_dim=128,
    num_key_value_heads=8,  # nemo: num_query_groups
    mlp_hidden_act="relu2",
    attention_bias=False,
    mlp_bias=False,
    use_bias=False,
    initializer_range=0.02,  # nemo: init_method_std
    layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
    residual_in_fp32=False,  #  Megatron Core default value
    use_cache=True,
    num_logits_to_keep=1,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    sliding_window=None,
    max_position_embeddings=4096,
    attention_dropout=0.0,
    hidden_dropout=0.0,  # * ADDED
    use_mamba_kernels=True,
    ssm_state_size=128,  # mamba_state_size
    mamba_num_heads=128,
    mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
    mamba_head_dim=64,
    mamba_d_conv=4,
    mamba_expand=2,
    mamba_hidden_act="silu",
    mamba_dt_min=0.001,
    mamba_dt_max=0.1,
    mamba_dt_limit=(0.0, float("inf")),
    mamba_dt_init_floor=1e-4,
    mamba_conv_bias=True,
    mamba_proj_bias=False,
    mamba_chunk_size=256,
    rescale_prenorm_residual=True,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.tie_word_embeddings = tie_word_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.hybrid_override_pattern = hybrid_override_pattern
    self.num_attention_heads = num_attention_heads
    self.head_dim = head_dim
    self.sliding_window = sliding_window
    self.max_position_embeddings = max_position_embeddings
    self.attention_dropout = attention_dropout
    self.hidden_dropout = hidden_dropout

    # Validate hybrid_override_pattern
    # M: Mamba2, *: Attention, -: MLP
    assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
        "hybrid_override_pattern must have same length as "
        "num_hidden_layers")
    assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
        "hybrid_override_pattern must only contain characters "
        "'M', '*', or '-'")

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.mlp_hidden_act = mlp_hidden_act
    self.attention_bias = attention_bias
    self.mlp_bias = mlp_bias
    self.use_bias = use_bias
    self.initializer_range = initializer_range
    self.layer_norm_epsilon = layer_norm_epsilon
    self.residual_in_fp32 = residual_in_fp32

    self.use_cache = use_cache
    self.num_logits_to_keep = num_logits_to_keep

    self.use_mamba_kernels = use_mamba_kernels
    self.n_groups = mamba_n_groups
    self.mamba_head_dim = mamba_head_dim
    self.ssm_state_size = ssm_state_size
    self.mamba_num_heads = mamba_num_heads
    self.conv_kernel = mamba_d_conv
    self.expand = mamba_expand
    self.mamba_hidden_act = mamba_hidden_act
    self.time_step_min = mamba_dt_min
    self.time_step_max = mamba_dt_max
    self.time_step_limit = mamba_dt_limit
    self.time_step_floor = mamba_dt_init_floor
    self.use_conv_bias = mamba_conv_bias
    self.mamba_proj_bias = mamba_proj_bias
    self.chunk_size = mamba_chunk_size
    self.rescale_prenorm_residual = rescale_prenorm_residual

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
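
Note that several `mamba_*` constructor arguments are stored under Mamba2-style attribute names, as shown in the assignments above. A quick sketch:

```python
from vllm.transformers_utils.configs import NemotronHConfig

cfg = NemotronHConfig(mamba_d_conv=4, mamba_expand=2, mamba_chunk_size=256)
print(cfg.conv_kernel)  # 4   (stored from mamba_d_conv)
print(cfg.expand)       # 2   (stored from mamba_expand)
print(cfg.chunk_size)   # 256 (stored from mamba_chunk_size)
print(cfg.n_groups)     # 8   (stored from the mamba_n_groups default)
```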

Nemotron_Nano_VL_Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/nemotron_vl.py
class Nemotron_Nano_VL_Config(PretrainedConfig):
    model_type = 'Llama_Nemotron_Nano_VL'
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        ps_version='v1',
        image_tag_type="internvl",
        projector_hidden_size=4096,
        vit_hidden_size=1280,
        **kwargs
    ):
        super().__init__(**kwargs)

        if vision_config is not None:
            assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
            vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
            self.vision_config = vision_auto_config(**vision_config)
        else:
            self.vision_config = PretrainedConfig()

        if llm_config is None:
            self.text_config = LlamaConfig()
        else:
            self.text_config = LlamaConfig(**llm_config)

        # Assign configuration values
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template  # TODO move out of here and into the tokenizer
        self.ps_version = ps_version  # Pixel shuffle version
        self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
        self.projector_hidden_size = projector_hidden_size
        self.vit_hidden_size = vit_hidden_size

downsample_ratio instance-attribute

downsample_ratio = downsample_ratio

force_image_size instance-attribute

force_image_size = force_image_size

image_tag_type instance-attribute

image_tag_type = image_tag_type

is_composition class-attribute instance-attribute

is_composition = True

model_type class-attribute instance-attribute

model_type = 'Llama_Nemotron_Nano_VL'

projector_hidden_size instance-attribute

projector_hidden_size = projector_hidden_size

ps_version instance-attribute

ps_version = ps_version

template instance-attribute

template = template

text_config instance-attribute

text_config = LlamaConfig()

vision_config instance-attribute

vision_config = vision_auto_config(**vision_config)

vit_hidden_size instance-attribute

vit_hidden_size = vit_hidden_size

__init__

__init__(
    vision_config=None,
    llm_config=None,
    force_image_size=None,
    downsample_ratio=0.5,
    template=None,
    ps_version="v1",
    image_tag_type="internvl",
    projector_hidden_size=4096,
    vit_hidden_size=1280,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/nemotron_vl.py
def __init__(
    self,
    vision_config=None,
    llm_config=None,
    force_image_size=None,
    downsample_ratio=0.5,
    template=None,
    ps_version='v1',
    image_tag_type="internvl",
    projector_hidden_size=4096,
    vit_hidden_size=1280,
    **kwargs
):
    super().__init__(**kwargs)

    if vision_config is not None:
        assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
        vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
        self.vision_config = vision_auto_config(**vision_config)
    else:
        self.vision_config = PretrainedConfig()

    if llm_config is None:
        self.text_config = LlamaConfig()
    else:
        self.text_config = LlamaConfig(**llm_config)

    # Assign configuration values
    self.force_image_size = force_image_size
    self.downsample_ratio = downsample_ratio
    self.template = template  # TODO move out of here and into the tokenizer
    self.ps_version = ps_version  # Pixel shuffle version
    self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
    self.projector_hidden_size = projector_hidden_size
    self.vit_hidden_size = vit_hidden_size
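
When no sub-configs are given, the constructor above falls back to an empty `PretrainedConfig` for vision and a default `LlamaConfig` for text. A minimal sketch:

```python
from transformers import LlamaConfig, PretrainedConfig
from vllm.transformers_utils.configs import Nemotron_Nano_VL_Config

cfg = Nemotron_Nano_VL_Config()
print(isinstance(cfg.text_config, LlamaConfig))         # True (default text config)
print(isinstance(cfg.vision_config, PretrainedConfig))  # True (empty placeholder config)
print(cfg.downsample_ratio, cfg.ps_version)             # 0.5 v1
```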

OvisConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/ovis.py
class OvisConfig(PretrainedConfig):
    model_type = "ovis"

    def __init__(self,
                 llm_config: Optional[Union[PretrainedConfig, dict]] = None,
                 visual_tokenizer_config: Optional[Union[PretrainedConfig,
                                                         dict]] = None,
                 multimodal_max_length=8192,
                 hidden_size=None,
                 conversation_formatter_class=None,
                 llm_attn_implementation=None,
                 disable_tie_weight=False,
                 **kwargs):
        super().__init__(**kwargs)
        if llm_config is not None:
            assert isinstance(llm_config, (PretrainedConfig, dict)), \
                f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
            if not isinstance(llm_config, PretrainedConfig):
                model_type = llm_config['model_type']
                llm_config.pop('model_type')
                llm_config = AutoConfig.for_model(model_type, **llm_config)

        # map llm_config to text_config
        self.text_config = llm_config
        if visual_tokenizer_config is not None:
            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
                f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
            if not isinstance(visual_tokenizer_config, PretrainedConfig):
                model_type = visual_tokenizer_config['model_type']
                visual_tokenizer_config.pop('model_type')
                visual_tokenizer_config = AutoConfig.for_model(
                    model_type, **visual_tokenizer_config)

        self.visual_tokenizer_config = visual_tokenizer_config
        self.multimodal_max_length = multimodal_max_length
        self.hidden_size = hidden_size
        self.conversation_formatter_class = conversation_formatter_class
        self.llm_attn_implementation = llm_attn_implementation
        self.disable_tie_weight = disable_tie_weight

conversation_formatter_class instance-attribute

conversation_formatter_class = conversation_formatter_class

disable_tie_weight instance-attribute

disable_tie_weight = disable_tie_weight

hidden_size instance-attribute

hidden_size = hidden_size

llm_attn_implementation instance-attribute

llm_attn_implementation = llm_attn_implementation

model_type class-attribute instance-attribute

model_type = 'ovis'

multimodal_max_length instance-attribute

multimodal_max_length = multimodal_max_length

text_config instance-attribute

text_config = llm_config

visual_tokenizer_config instance-attribute

visual_tokenizer_config = visual_tokenizer_config

__init__

__init__(
    llm_config: Optional[
        Union[PretrainedConfig, dict]
    ] = None,
    visual_tokenizer_config: Optional[
        Union[PretrainedConfig, dict]
    ] = None,
    multimodal_max_length=8192,
    hidden_size=None,
    conversation_formatter_class=None,
    llm_attn_implementation=None,
    disable_tie_weight=False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/ovis.py
def __init__(self,
             llm_config: Optional[Union[PretrainedConfig, dict]] = None,
             visual_tokenizer_config: Optional[Union[PretrainedConfig,
                                                     dict]] = None,
             multimodal_max_length=8192,
             hidden_size=None,
             conversation_formatter_class=None,
             llm_attn_implementation=None,
             disable_tie_weight=False,
             **kwargs):
    super().__init__(**kwargs)
    if llm_config is not None:
        assert isinstance(llm_config, (PretrainedConfig, dict)), \
            f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
        if not isinstance(llm_config, PretrainedConfig):
            model_type = llm_config['model_type']
            llm_config.pop('model_type')
            llm_config = AutoConfig.for_model(model_type, **llm_config)

    # map llm_config to text_config
    self.text_config = llm_config
    if visual_tokenizer_config is not None:
        assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
            f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
        if not isinstance(visual_tokenizer_config, PretrainedConfig):
            model_type = visual_tokenizer_config['model_type']
            visual_tokenizer_config.pop('model_type')
            visual_tokenizer_config = AutoConfig.for_model(
                model_type, **visual_tokenizer_config)

    self.visual_tokenizer_config = visual_tokenizer_config
    self.multimodal_max_length = multimodal_max_length
    self.hidden_size = hidden_size
    self.conversation_formatter_class = conversation_formatter_class
    self.llm_attn_implementation = llm_attn_implementation
    self.disable_tie_weight = disable_tie_weight
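
When `llm_config` is passed as a plain dict, the constructor above rebuilds it with `AutoConfig.for_model` and exposes it as `text_config`. A minimal sketch (the dict values are illustrative only):

```python
from vllm.transformers_utils.configs import OvisConfig

cfg = OvisConfig(llm_config={"model_type": "llama", "hidden_size": 1024})
print(type(cfg.text_config).__name__)  # 'LlamaConfig', built via AutoConfig.for_model
print(cfg.text_config.hidden_size)     # 1024
print(cfg.multimodal_max_length)       # 8192 (default)
```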

RWConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/falcon.py
class RWConfig(PretrainedConfig):
    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query
        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        if self.hidden_size == 8192:
            # Hack for falcon-40b
            self.new_decoder_architecture = True

        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    @property
    def head_dim(self):
        return self.hidden_size // self.n_head

    @property
    def rotary(self):
        return not self.alibi

alibi instance-attribute

alibi = alibi

attention_dropout instance-attribute

attention_dropout = attention_dropout

attribute_map class-attribute instance-attribute

attribute_map = {
    "num_hidden_layers": "n_layer",
    "num_attention_heads": "n_head",
    "num_kv_heads": "n_head_kv",
}

bias instance-attribute

bias = bias

bos_token_id instance-attribute

bos_token_id = bos_token_id

eos_token_id instance-attribute

eos_token_id = eos_token_id

head_dim property

head_dim

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size if n_embed is None else n_embed

initializer_range instance-attribute

initializer_range = initializer_range

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

model_type class-attribute instance-attribute

model_type = 'falcon'

multi_query instance-attribute

multi_query = multi_query

n_head instance-attribute

n_head = n_head

n_head_kv instance-attribute

n_head_kv = 1 if n_head_kv is None else n_head_kv

n_layer instance-attribute

n_layer = n_layer

new_decoder_architecture instance-attribute

new_decoder_architecture = new_decoder_architecture

parallel_attn instance-attribute

parallel_attn = parallel_attn

rotary property

rotary

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=250880,
    hidden_size=64,
    n_layer=2,
    n_head=8,
    layer_norm_epsilon=1e-05,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=1,
    eos_token_id=2,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    multi_query=True,
    n_head_kv=None,
    alibi=False,
    bias=False,
    parallel_attn=False,
    new_decoder_architecture=False,
    **kwargs,
) -> None
Source code in vllm/transformers_utils/configs/falcon.py
def __init__(
    self,
    vocab_size=250880,
    hidden_size=64,
    n_layer=2,
    n_head=8,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=1,
    eos_token_id=2,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    multi_query=True,
    n_head_kv=None,
    alibi=False,
    bias=False,
    parallel_attn=False,
    new_decoder_architecture=False,
    **kwargs,
) -> None:
    self.vocab_size = vocab_size
    # Backward compatibility with n_embed kwarg
    n_embed = kwargs.pop("n_embed", None)
    self.hidden_size = hidden_size if n_embed is None else n_embed
    self.n_layer = n_layer
    self.n_head = n_head
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range
    self.use_cache = use_cache
    self.hidden_dropout = hidden_dropout
    self.attention_dropout = attention_dropout

    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id
    self.multi_query = multi_query
    self.n_head_kv = 1 if n_head_kv is None else n_head_kv
    self.alibi = alibi
    self.bias = bias
    self.parallel_attn = parallel_attn
    self.new_decoder_architecture = new_decoder_architecture

    if self.hidden_size == 8192:
        # Hack for falcon-40b
        self.new_decoder_architecture = True

    super().__init__(bos_token_id=bos_token_id,
                     eos_token_id=eos_token_id,
                     **kwargs)
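
The `attribute_map` above lets the Falcon-style names be read through the HF-standard ones, and `hidden_size == 8192` force-enables the new decoder architecture (the falcon-40b hack in the code). A minimal sketch:

```python
from vllm.transformers_utils.configs import RWConfig

cfg = RWConfig(n_layer=60, n_head=128, hidden_size=8192)
print(cfg.num_hidden_layers)         # 60, read through attribute_map -> n_layer
print(cfg.head_dim)                  # 64 == 8192 // 128
print(cfg.rotary)                    # True, because alibi=False by default
print(cfg.new_decoder_architecture)  # True, forced by hidden_size == 8192
```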

SpeculatorsConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/speculators/base.py
class SpeculatorsConfig(PretrainedConfig):
    model_type = "speculators"

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "SpeculatorsConfig":
        """Load speculators Eagle config and convert to vLLM format."""
        config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path,
                                             **kwargs)

        speculators_model_type = config_dict.get("speculators_model_type")
        if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
            raise ValueError(
                f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
                "Please ensure you're loading a speculators-format model.")

        # validate fields
        # TODO: @dsikka - use speculators pydantic model to validate
        cls.validate_speculators_config(config_dict=config_dict)
        # Convert from speculators config -> format that can be ingested by vLLM
        vllm_config = cls.convert_speculators_to_vllm(config_dict=config_dict)
        # Apply anything specific to the supported algorithm
        algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
        algo_updater(config_dict=config_dict, vllm_config=vllm_config)
        return cls(**vllm_config)

    @classmethod
    def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
        try:
            spec_config = config_dict["speculators_config"]
            methods = spec_config["proposal_methods"]
            first_method = methods[0]
            _ = first_method["speculative_tokens"]
            _ = spec_config["verifier"]["name_or_path"]
            _ = config_dict["speculators_model_type"]
        except (KeyError, IndexError, TypeError) as e:
            raise ValueError("Invalid speculators config structure") from e

        if "transformer_layer_config" not in config_dict:
            raise ValueError("Must provide transformer_layer_config")

        if not isinstance(config_dict["transformer_layer_config"], dict):
            raise TypeError(
                "'transformer_layer_config' must be a dictionary if provided")

    @classmethod
    def convert_speculators_to_vllm(
            cls, config_dict: dict[str, Any]) -> dict[str, Any]:
        """
        Convert speculators config format to vLLM format.

        This method handles the translation of field names and structure
        between speculators and vLLM formats.

        Returns:
            Dictionary with vLLM-compatible configuration
        """
        # Currently we only support one proposal method
        spec_config = config_dict["speculators_config"]
        first_method = spec_config.get("proposal_methods")[0]
        num_lookahead_tokens = first_method.get("speculative_tokens")

        if num_lookahead_tokens is None:
            raise ValueError(
                "Missing 'speculative_tokens' in proposal method. "
                f"Got: {first_method}")

        # Build base vLLM config
        vllm_config = {
            "method": config_dict.get("speculators_model_type"),
            "num_lookahead_tokens": num_lookahead_tokens,
            "target_model": spec_config.get("verifier")["name_or_path"]
        }
        vllm_config.update(config_dict["transformer_layer_config"])
        return vllm_config

model_type class-attribute instance-attribute

model_type = 'speculators'

convert_speculators_to_vllm classmethod

convert_speculators_to_vllm(
    config_dict: dict[str, Any],
) -> dict[str, Any]

Convert speculators config format to vLLM format.

This method handles the translation of field names and structure between speculators and vLLM formats.

Returns:

| Type | Description |
|---|---|
| `dict[str, Any]` | Dictionary with vLLM-compatible configuration |

Source code in vllm/transformers_utils/configs/speculators/base.py
@classmethod
def convert_speculators_to_vllm(
        cls, config_dict: dict[str, Any]) -> dict[str, Any]:
    """
    Convert speculators config format to vLLM format.

    This method handles the translation of field names and structure
    between speculators and vLLM formats.

    Returns:
        Dictionary with vLLM-compatible configuration
    """
    # Currently we only support one proposal method
    spec_config = config_dict["speculators_config"]
    first_method = spec_config.get("proposal_methods")[0]
    num_lookahead_tokens = first_method.get("speculative_tokens")

    if num_lookahead_tokens is None:
        raise ValueError(
            "Missing 'speculative_tokens' in proposal method. "
            f"Got: {first_method}")

    # Build base vLLM config
    vllm_config = {
        "method": config_dict.get("speculators_model_type"),
        "num_lookahead_tokens": num_lookahead_tokens,
        "target_model": spec_config.get("verifier")["name_or_path"]
    }
    vllm_config.update(config_dict["transformer_layer_config"])
    return vllm_config
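
Example (an illustrative sketch of the field translation above; the dict mirrors only the keys this method reads, and the "eagle" model type plus the Llama-style layer values are placeholders):

from vllm.transformers_utils.configs import SpeculatorsConfig

speculators_format = {
    "speculators_model_type": "eagle",
    "speculators_config": {
        "proposal_methods": [{"speculative_tokens": 5}],
        "verifier": {"name_or_path": "meta-llama/Llama-3.1-8B-Instruct"},
    },
    # Merged verbatim into the returned vLLM config.
    "transformer_layer_config": {"hidden_size": 4096, "num_hidden_layers": 1},
}

vllm_format = SpeculatorsConfig.convert_speculators_to_vllm(speculators_format)
print(vllm_format["method"])                # eagle
print(vllm_format["num_lookahead_tokens"])  # 5
print(vllm_format["target_model"])          # meta-llama/Llama-3.1-8B-Instruct
print(vllm_format["hidden_size"])           # 4096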

from_pretrained classmethod

from_pretrained(
    pretrained_model_name_or_path: Union[str, PathLike],
    **kwargs,
) -> SpeculatorsConfig

Load speculators Eagle config and convert to vLLM format.

Source code in vllm/transformers_utils/configs/speculators/base.py
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    **kwargs,
) -> "SpeculatorsConfig":
    """Load speculators Eagle config and convert to vLLM format."""
    config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path,
                                         **kwargs)

    speculators_model_type = config_dict.get("speculators_model_type")
    if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
        raise ValueError(
            f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
            "Please ensure you're loading a speculators-format model.")

    # validate fields
    # TODO: @dsikka - use speculators pydantic model to validate
    cls.validate_speculators_config(config_dict=config_dict)
    # Convert from speculators config -> format that can be ingested by vLLM
    vllm_config = cls.convert_speculators_to_vllm(config_dict=config_dict)
    # Apply anything specific to the supported algorithm
    algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
    algo_updater(config_dict=config_dict, vllm_config=vllm_config)
    return cls(**vllm_config)

validate_speculators_config classmethod

validate_speculators_config(
    config_dict: dict[str, Any],
) -> None
Source code in vllm/transformers_utils/configs/speculators/base.py
@classmethod
def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
    try:
        spec_config = config_dict["speculators_config"]
        methods = spec_config["proposal_methods"]
        first_method = methods[0]
        _ = first_method["speculative_tokens"]
        _ = spec_config["verifier"]["name_or_path"]
        _ = config_dict["speculators_model_type"]
    except (KeyError, IndexError, TypeError) as e:
        raise ValueError("Invalid speculators config structure") from e

    if "transformer_layer_config" not in config_dict:
        raise ValueError("Must provide transformer_layer_config")

    if not isinstance(config_dict["transformer_layer_config"], dict):
        raise TypeError(
            "'transformer_layer_config' must be a dictionary if provided")

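Example (a sketch of the failure mode: a config missing the required speculators_config block is rejected with a ValueError before any conversion is attempted):

from vllm.transformers_utils.configs import SpeculatorsConfig

broken = {"speculators_model_type": "eagle"}  # no "speculators_config" block
try:
    SpeculatorsConfig.validate_speculators_config(config_dict=broken)
except ValueError as err:
    print(err)  # Invalid speculators config structure
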
Step3TextConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/step3_vl.py
class Step3TextConfig(PretrainedConfig):
    model_type = "step3_text"
    architectures = ["Step3TextForCausalLM"]

    def __init__(
        self,
        hidden_size: int = 7168,
        intermediate_size: int = 18432,
        num_attention_heads: int = 64,
        num_attention_groups: int = 1,
        num_hidden_layers: int = 61,
        max_seq_len: int = 65536,
        vocab_size: int = 128815,
        rms_norm_eps: float = 1e-5,
        moe_intermediate_size: int = 5120,
        moe_num_experts: int = 48,
        moe_top_k: int = 3,
        rope_theta: float = 500000,
        rope_scaling: Optional[dict[str, Any]] = None,
        max_position_embedding: int = 65536,
        share_expert_dim: int = 5120,
        share_q_dim: int = 2048,
        head_dim: int = 256,
        norm_expert_weight: bool = False,
        moe_layers_enum: tuple[int,
                               ...] = (4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                       15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                       25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
                                       35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
                                       45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
                                       55, 56, 57, 58, 59),
        **kwargs,
    ) -> None:
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_attention_groups = num_attention_groups
        self.num_hidden_layers = num_hidden_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.rms_norm_eps = rms_norm_eps
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.max_position_embedding = max_position_embedding
        self.share_expert_dim = share_expert_dim
        self.share_q_dim = share_q_dim
        self.head_dim = head_dim
        self.norm_expert_weight = norm_expert_weight
        self.moe_layers_enum = moe_layers_enum

        super().__init__(**kwargs)

architectures class-attribute instance-attribute

architectures = ['Step3TextForCausalLM']

head_dim instance-attribute

head_dim = head_dim

hidden_size instance-attribute

hidden_size = hidden_size

intermediate_size instance-attribute

intermediate_size = intermediate_size

max_position_embedding instance-attribute

max_position_embedding = max_position_embedding

max_seq_len instance-attribute

max_seq_len = max_seq_len

model_type class-attribute instance-attribute

model_type = 'step3_text'

moe_intermediate_size instance-attribute

moe_intermediate_size = moe_intermediate_size

moe_layers_enum instance-attribute

moe_layers_enum = moe_layers_enum

moe_num_experts instance-attribute

moe_num_experts = moe_num_experts

moe_top_k instance-attribute

moe_top_k = moe_top_k

norm_expert_weight instance-attribute

norm_expert_weight = norm_expert_weight

num_attention_groups instance-attribute

num_attention_groups = num_attention_groups

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

share_expert_dim instance-attribute

share_expert_dim = share_expert_dim

share_q_dim instance-attribute

share_q_dim = share_q_dim

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    hidden_size: int = 7168,
    intermediate_size: int = 18432,
    num_attention_heads: int = 64,
    num_attention_groups: int = 1,
    num_hidden_layers: int = 61,
    max_seq_len: int = 65536,
    vocab_size: int = 128815,
    rms_norm_eps: float = 1e-05,
    moe_intermediate_size: int = 5120,
    moe_num_experts: int = 48,
    moe_top_k: int = 3,
    rope_theta: float = 500000,
    rope_scaling: Optional[dict[str, Any]] = None,
    max_position_embedding: int = 65536,
    share_expert_dim: int = 5120,
    share_q_dim: int = 2048,
    head_dim: int = 256,
    norm_expert_weight: bool = False,
    moe_layers_enum: tuple[int, ...] = (
        4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
        52, 53, 54, 55, 56, 57, 58, 59,
    ),
    **kwargs,
) -> None
Source code in vllm/transformers_utils/configs/step3_vl.py
def __init__(
    self,
    hidden_size: int = 7168,
    intermediate_size: int = 18432,
    num_attention_heads: int = 64,
    num_attention_groups: int = 1,
    num_hidden_layers: int = 61,
    max_seq_len: int = 65536,
    vocab_size: int = 128815,
    rms_norm_eps: float = 1e-5,
    moe_intermediate_size: int = 5120,
    moe_num_experts: int = 48,
    moe_top_k: int = 3,
    rope_theta: float = 500000,
    rope_scaling: Optional[dict[str, Any]] = None,
    max_position_embedding: int = 65536,
    share_expert_dim: int = 5120,
    share_q_dim: int = 2048,
    head_dim: int = 256,
    norm_expert_weight: bool = False,
    moe_layers_enum: tuple[int,
                           ...] = (4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                   15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                   25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
                                   35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
                                   45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
                                   55, 56, 57, 58, 59),
    **kwargs,
) -> None:
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_attention_heads = num_attention_heads
    self.num_attention_groups = num_attention_groups
    self.num_hidden_layers = num_hidden_layers
    self.max_seq_len = max_seq_len
    self.vocab_size = vocab_size
    self.rms_norm_eps = rms_norm_eps
    self.moe_intermediate_size = moe_intermediate_size
    self.moe_num_experts = moe_num_experts
    self.moe_top_k = moe_top_k
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self.max_position_embedding = max_position_embedding
    self.share_expert_dim = share_expert_dim
    self.share_q_dim = share_q_dim
    self.head_dim = head_dim
    self.norm_expert_weight = norm_expert_weight
    self.moe_layers_enum = moe_layers_enum

    super().__init__(**kwargs)
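
Example (a sketch: every argument becomes an attribute verbatim, so a scaled-down variant for local experimentation only needs to override the fields of interest; the values below are illustrative, not a real Step3 checkpoint):

from vllm.transformers_utils.configs import Step3TextConfig

tiny = Step3TextConfig(
    hidden_size=512,
    intermediate_size=1024,
    num_hidden_layers=4,
    moe_num_experts=8,
    moe_top_k=2,
    moe_layers_enum=(1, 2, 3),  # assumed to enumerate which layers use MoE
)
print(tiny.moe_top_k, tiny.moe_layers_enum)  # 2 (1, 2, 3)
print(tiny.rope_theta)                       # 500000 (defaults are kept)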

Step3VLConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/step3_vl.py
class Step3VLConfig(PretrainedConfig):
    model_type = "step3_vl"

    def __init__(
        self,
        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
        text_config: Optional[Union[dict, Step3TextConfig]] = None,
        understand_projector_stride: int = 1,
        projector_bias: bool = True,
        image_token_id: int = 128001,
        **kwargs,
    ) -> None:
        if vision_config is None:
            vision_config = Step3VisionEncoderConfig()
        elif isinstance(vision_config, dict):
            vision_config = Step3VisionEncoderConfig(**vision_config)
        self.vision_config = vision_config

        if text_config is None:
            text_config = Step3TextConfig()
        elif isinstance(text_config, dict):
            text_config = Step3TextConfig(**text_config)
        self.text_config = text_config

        self.understand_projector_stride = understand_projector_stride
        self.projector_bias = projector_bias
        self.hidden_size = text_config.hidden_size
        self.image_token_id = image_token_id

        super().__init__(**kwargs)

hidden_size instance-attribute

hidden_size = hidden_size

image_token_id instance-attribute

image_token_id = image_token_id

model_type class-attribute instance-attribute

model_type = 'step3_vl'

projector_bias instance-attribute

projector_bias = projector_bias

text_config instance-attribute

text_config = text_config

understand_projector_stride instance-attribute

understand_projector_stride = understand_projector_stride

vision_config instance-attribute

vision_config = vision_config

__init__

__init__(
    vision_config: Optional[
        Union[dict, Step3VisionEncoderConfig]
    ] = None,
    text_config: Optional[
        Union[dict, Step3TextConfig]
    ] = None,
    understand_projector_stride: int = 1,
    projector_bias: bool = True,
    image_token_id: int = 128001,
    **kwargs,
) -> None
Source code in vllm/transformers_utils/configs/step3_vl.py
def __init__(
    self,
    vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
    text_config: Optional[Union[dict, Step3TextConfig]] = None,
    understand_projector_stride: int = 1,
    projector_bias: bool = True,
    image_token_id: int = 128001,
    **kwargs,
) -> None:
    if vision_config is None:
        vision_config = Step3VisionEncoderConfig()
    elif isinstance(vision_config, dict):
        vision_config = Step3VisionEncoderConfig(**vision_config)
    self.vision_config = vision_config

    if text_config is None:
        text_config = Step3TextConfig()
    elif isinstance(text_config, dict):
        text_config = Step3TextConfig(**text_config)
    self.text_config = text_config

    self.understand_projector_stride = understand_projector_stride
    self.projector_bias = projector_bias
    self.hidden_size = text_config.hidden_size
    self.image_token_id = image_token_id

    super().__init__(**kwargs)
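
Example (a minimal sketch: the nested configs may be passed as plain dicts and are promoted to Step3VisionEncoderConfig / Step3TextConfig instances, and the top-level hidden_size mirrors the text config; the sizes are arbitrary):

from vllm.transformers_utils.configs import Step3VLConfig

cfg = Step3VLConfig(
    vision_config={"hidden_size": 1792, "num_hidden_layers": 4},
    text_config={"hidden_size": 1024, "num_hidden_layers": 2},
    understand_projector_stride=2,
)
print(type(cfg.vision_config).__name__)  # Step3VisionEncoderConfig
print(type(cfg.text_config).__name__)    # Step3TextConfig
print(cfg.hidden_size)                   # 1024, taken from the text config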

Step3VisionEncoderConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/step3_vl.py
class Step3VisionEncoderConfig(PretrainedConfig):
    model_type = "step3_vision_encoder"

    def __init__(
        self,
        hidden_size=1792,
        intermediate_size=3072,
        output_hidden_size=4096,
        num_hidden_layers=63,
        num_attention_heads=16,
        num_channels=3,
        image_size=728,
        patch_size=14,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        **kwargs,
    ):
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.output_hidden_size = output_hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        super().__init__(**kwargs)

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

image_size instance-attribute

image_size = image_size

intermediate_size instance-attribute

intermediate_size = intermediate_size

layer_norm_eps instance-attribute

layer_norm_eps = layer_norm_eps

model_type class-attribute instance-attribute

model_type = 'step3_vision_encoder'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_channels instance-attribute

num_channels = num_channels

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

output_hidden_size instance-attribute

output_hidden_size = output_hidden_size

patch_size instance-attribute

patch_size = patch_size

__init__

__init__(
    hidden_size=1792,
    intermediate_size=3072,
    output_hidden_size=4096,
    num_hidden_layers=63,
    num_attention_heads=16,
    num_channels=3,
    image_size=728,
    patch_size=14,
    hidden_act="quick_gelu",
    layer_norm_eps=1e-05,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/step3_vl.py
def __init__(
    self,
    hidden_size=1792,
    intermediate_size=3072,
    output_hidden_size=4096,
    num_hidden_layers=63,
    num_attention_heads=16,
    num_channels=3,
    image_size=728,
    patch_size=14,
    hidden_act="quick_gelu",
    layer_norm_eps=1e-5,
    **kwargs,
):
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.output_hidden_size = output_hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.num_channels = num_channels
    self.patch_size = patch_size
    self.image_size = image_size
    self.layer_norm_eps = layer_norm_eps
    self.hidden_act = hidden_act
    super().__init__(**kwargs)
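
Example (a sketch of the derived patch grid, assuming a standard ViT-style patch embedding consumes image_size and patch_size; that assumption comes from the field names, not from this file):

from vllm.transformers_utils.configs import Step3VisionEncoderConfig

vision = Step3VisionEncoderConfig()  # defaults: image_size=728, patch_size=14
patches_per_side = vision.image_size // vision.patch_size
print(patches_per_side, patches_per_side ** 2)  # 52 2704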

UltravoxConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [UltravoxForConditionalGeneration]. It is used to instantiate an Ultravox model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| audio_config | `Union[AutoConfig, dict]`, *optional* | Custom audio config or dict | `None` |
| text_config | `Union[AutoConfig, dict]`, *optional* | The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`. | `None` |
| ignore_index | `int`, *optional*, defaults to -100 | The ignore index for the loss function. | `-100` |
| audio_token_index | `int`, *optional*, defaults to 32000 | The audio token index to encode the audio prompt. | `32000` |
| stack_factor | `int`, *optional*, defaults to 8 | Audio downsampling factor for the multimodal projector. | `8` |
| norm_init | `float`, *optional*, defaults to 0.4 | The initialization value for the layer normalization. | `0.4` |
| projector_act | `str`, *optional*, defaults to `"swiglu"` | The activation function used by the multimodal projector. | `'swiglu'` |
| text_model_lora_config | `LoraConfigSimplified`, *optional* | The LoRA configuration for finetuning the text model. | `None` |
| audio_model_lora_config | `LoraConfigSimplified`, *optional* | The LoRA configuration for finetuning the audio model. | `None` |
| projector_ln_mid | `bool`, *optional*, defaults to `False` | Whether to apply layer normalization at the middle of the projector or at the end. Versions v0.4.1 and below use `False`, but v0.5 and above use `True`. | `False` |
Source code in vllm/transformers_utils/configs/ultravox.py
class UltravoxConfig(transformers.PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
    Ultravox model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom audio config or dict
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone. Can be any of `LlamaConfig`
            or `MistralConfig`.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        audio_token_index (`int`, *optional*, defaults to 32000):
            The audio token index to encode the audio prompt.
        stack_factor (`int`, *optional*, defaults to 8):
            Audio downsampling factor for the multimodal projector.
        norm_init (`float`, *optional*, defaults to 0.4):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
        text_model_lora_config (`LoraConfigSimplified`, *optional*):
            The LoRA configuration for finetuning the text model.
        audio_model_lora_config (`LoraConfigSimplified`, *optional*):
            The LoRA configuration for finetuning the audio model.
        projector_ln_mid (`bool`, *optional*, defaults to `False`):
            Whether to apply layer normalization at the middle of the
            projector or at the end. Versions v0.4.1 and below
            use `False`, but v0.5 and above use `True`.
    """

    model_type = "ultravox"
    audio_token = "<|audio|>"
    is_composition = False

    def __init__(
        self,
        audio_config: Optional[dict[str, Any]] = None,
        text_config: Optional[dict[str, Any]] = None,
        audio_model_id: Optional[str] = None,
        text_model_id: Optional[str] = None,
        ignore_index: int = -100,
        audio_token_index: int = 32000,
        hidden_size: int = 4096,
        stack_factor: int = 8,
        norm_init: float = 0.4,
        projector_act: str = "swiglu",
        text_model_lora_config: Optional[dict[str, Any]] = None,
        audio_model_lora_config: Optional[dict[str, Any]] = None,
        projector_ln_mid: bool = False,
        **kwargs,
    ):
        self.ignore_index = ignore_index

        self.audio_model_id = audio_model_id
        self.text_model_id = text_model_id
        self.audio_token_index = audio_token_index

        self.hidden_size = hidden_size
        self.stack_factor = stack_factor
        self.norm_init = norm_init
        self.projector_act = projector_act
        self.projector_ln_mid = projector_ln_mid

        if text_model_id is not None:
            # Avoid circular import
            from vllm.transformers_utils.config import get_config

            text_config_obj = get_config(text_model_id,
                                         trust_remote_code=False)
        else:
            text_config = text_config or {}
            text_config_obj = transformers.CONFIG_MAPPING[text_config.get(
                "model_type", "llama")](**text_config)

        inner_text_config = text_config_obj.get_text_config()

        if audio_model_id is not None:
            # Avoid circular import
            from vllm.transformers_utils.config import get_config

            audio_config = get_config(audio_model_id, trust_remote_code=False)
        else:
            audio_config = audio_config or {}
            audio_config = transformers.CONFIG_MAPPING[audio_config.get(
                "model_type", "whisper")](**audio_config)

        self.text_config = text_config_obj
        self.audio_config = audio_config
        self.text_model_lora_config = text_model_lora_config or {}
        self.audio_model_lora_config = audio_model_lora_config or {}

        self.vocab_size = inner_text_config.vocab_size
        self.initializer_range = inner_text_config.initializer_range
        self.text_hidden_size = inner_text_config.hidden_size

        super().__init__(**kwargs)

audio_config instance-attribute

audio_config = audio_config

audio_model_id instance-attribute

audio_model_id = audio_model_id

audio_model_lora_config instance-attribute

audio_model_lora_config = audio_model_lora_config or {}

audio_token class-attribute instance-attribute

audio_token = '<|audio|>'

audio_token_index instance-attribute

audio_token_index = audio_token_index

hidden_size instance-attribute

hidden_size = hidden_size

ignore_index instance-attribute

ignore_index = ignore_index

initializer_range instance-attribute

initializer_range = inner_text_config.initializer_range

is_composition class-attribute instance-attribute

is_composition = False

model_type class-attribute instance-attribute

model_type = 'ultravox'

norm_init instance-attribute

norm_init = norm_init

projector_act instance-attribute

projector_act = projector_act

projector_ln_mid instance-attribute

projector_ln_mid = projector_ln_mid

stack_factor instance-attribute

stack_factor = stack_factor

text_config instance-attribute

text_config = text_config_obj

text_hidden_size instance-attribute

text_hidden_size = inner_text_config.hidden_size

text_model_id instance-attribute

text_model_id = text_model_id

text_model_lora_config instance-attribute

text_model_lora_config = text_model_lora_config or {}

vocab_size instance-attribute

vocab_size = inner_text_config.vocab_size

__init__

__init__(
    audio_config: Optional[dict[str, Any]] = None,
    text_config: Optional[dict[str, Any]] = None,
    audio_model_id: Optional[str] = None,
    text_model_id: Optional[str] = None,
    ignore_index: int = -100,
    audio_token_index: int = 32000,
    hidden_size: int = 4096,
    stack_factor: int = 8,
    norm_init: float = 0.4,
    projector_act: str = "swiglu",
    text_model_lora_config: Optional[dict[str, Any]] = None,
    audio_model_lora_config: Optional[
        dict[str, Any]
    ] = None,
    projector_ln_mid: bool = False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/ultravox.py
def __init__(
    self,
    audio_config: Optional[dict[str, Any]] = None,
    text_config: Optional[dict[str, Any]] = None,
    audio_model_id: Optional[str] = None,
    text_model_id: Optional[str] = None,
    ignore_index: int = -100,
    audio_token_index: int = 32000,
    hidden_size: int = 4096,
    stack_factor: int = 8,
    norm_init: float = 0.4,
    projector_act: str = "swiglu",
    text_model_lora_config: Optional[dict[str, Any]] = None,
    audio_model_lora_config: Optional[dict[str, Any]] = None,
    projector_ln_mid: bool = False,
    **kwargs,
):
    self.ignore_index = ignore_index

    self.audio_model_id = audio_model_id
    self.text_model_id = text_model_id
    self.audio_token_index = audio_token_index

    self.hidden_size = hidden_size
    self.stack_factor = stack_factor
    self.norm_init = norm_init
    self.projector_act = projector_act
    self.projector_ln_mid = projector_ln_mid

    if text_model_id is not None:
        # Avoid circular import
        from vllm.transformers_utils.config import get_config

        text_config_obj = get_config(text_model_id,
                                     trust_remote_code=False)
    else:
        text_config = text_config or {}
        text_config_obj = transformers.CONFIG_MAPPING[text_config.get(
            "model_type", "llama")](**text_config)

    inner_text_config = text_config_obj.get_text_config()

    if audio_model_id is not None:
        # Avoid circular import
        from vllm.transformers_utils.config import get_config

        audio_config = get_config(audio_model_id, trust_remote_code=False)
    else:
        audio_config = audio_config or {}
        audio_config = transformers.CONFIG_MAPPING[audio_config.get(
            "model_type", "whisper")](**audio_config)

    self.text_config = text_config_obj
    self.audio_config = audio_config
    self.text_model_lora_config = text_model_lora_config or {}
    self.audio_model_lora_config = audio_model_lora_config or {}

    self.vocab_size = inner_text_config.vocab_size
    self.initializer_range = inner_text_config.initializer_range
    self.text_hidden_size = inner_text_config.hidden_size

    super().__init__(**kwargs)
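
Example (a sketch: with no model IDs given, the nested dicts are instantiated through transformers.CONFIG_MAPPING, defaulting to "llama" and "whisper", and vocab_size / text_hidden_size are then copied from the resulting text config; the sizes below are arbitrary):

from vllm.transformers_utils.configs import UltravoxConfig

cfg = UltravoxConfig(
    text_config={"model_type": "llama", "hidden_size": 512, "vocab_size": 1000},
    audio_config={"model_type": "whisper"},
)
print(type(cfg.text_config).__name__)        # LlamaConfig
print(type(cfg.audio_config).__name__)       # WhisperConfig
print(cfg.text_hidden_size, cfg.vocab_size)  # 512 1000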