vllm.model_executor.layers.quantization.kernels.mixed_precision.dynamic_4bit

Dynamic4bitLinearKernel

Bases: MPLinearKernel
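
A CPU-only mixed-precision linear kernel for 4-bit (int4) weights with dynamically quantized activations. Loaded weights are repacked with torch.ops.aten._dyn_quant_pack_4bit_weight and the matmul is dispatched to torch.ops.aten._dyn_quant_matmul_4bit; on Arm the kernel requires float32 activations, and with the group-wise scheme the KleidiAI kernel requires bfloat16 scales.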

Source code in vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py
class Dynamic4bitLinearKernel(MPLinearKernel):
    SUPPORTED_QUANT_TYPES = [scalar_types.int4]

    @classmethod
    def get_min_capability(cls) -> int:
        return 1

    @classmethod
    def can_implement(cls,
                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
        if not current_platform.is_cpu():
            return False, "Only CPU is supported"
        if c.weight_type not in cls.SUPPORTED_QUANT_TYPES:
            return False, f"Unsupported quant type {c.weight_type}"
        if current_platform.get_cpu_architecture(
        ) == CpuArchEnum.ARM and c.act_type not in [
                torch.float32,
        ]:
            return False, "Dynamic4bitLinearKernel on Arm requires"\
                " Float32 activations"
        if c.full_weight_shape[0] % c.group_size != 0:
            return False, f"Group size ({c.group_size}) does not evenly divide"\
                " the number of input features "\
                f"({c.full_weight_shape[0]})"
        if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
            try:
                # Attempt to retrieve the operation
                _ = torch.ops.aten._dyn_quant_matmul_4bit
            except AttributeError:
                return False, f"PyTorch {torch.__version__} does not support"\
                    " _dyn_quant_matmul_4bit. Install a newer version"
        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module):
        c = self.config
        packed_weight = getattr(layer, self.w_q_name)
        packed_weight = packed_weight.add(8)
        uint8_packed = (packed_weight[::, 1::2] << 4
                        | packed_weight[::, ::2]).to(torch.uint8)

        scales = getattr(layer, self.w_s_name)
        block_size = c.group_size

        # Handle scaling factors for partitioned weights
        if block_size == c.partition_weight_shape[0]:
            scales = scales.to(
                torch.float32
            )  # Float32 & Bfloat16 variants require float32 scales
            scales = scales.view(-1, 1)  # Channel-wise scales
            if layer.bias is not None:
                layer.bias = layer.bias.to(
                    torch.float32
                )  # Float32 & Bfloat16 variants require float32 bias
        else:
            # KleidiAI kernel requires bfloat16 scales with groupwise scheme
            scales = scales.to(torch.bfloat16)

        # Repack weights as per kernel requirement
        w = torch.ops.aten._dyn_quant_pack_4bit_weight(
            uint8_packed, scales, layer.bias, block_size,
            c.partition_weight_shape[0], c.partition_weight_shape[1])
        replace_parameter(layer, self.w_q_name,
                          torch.nn.Parameter(w, requires_grad=False))
        setattr(layer, self.w_s_name, None)

    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        c = self.config
        x_2d = x.reshape(-1, x.shape[-1])
        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )

        w_q = getattr(layer, self.w_q_name)
        output = torch.ops.aten._dyn_quant_matmul_4bit(
            x_2d, w_q, c.group_size, c.partition_weight_shape[0],
            c.partition_weight_shape[1])
        return output.reshape(out_shape)
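
Whether these aten ops exist depends on the installed PyTorch build. A minimal standalone sketch (not part of vLLM) that probes for the two ops this kernel relies on:

import torch

def has_dyn_quant_4bit_ops() -> bool:
    # True if this PyTorch build exposes the 4-bit dynamic-quantization aten ops
    # used by Dynamic4bitLinearKernel.
    try:
        _ = torch.ops.aten._dyn_quant_pack_4bit_weight
        _ = torch.ops.aten._dyn_quant_matmul_4bit
    except AttributeError:
        return False
    return True

print(f"PyTorch {torch.__version__} dyn-quant 4-bit ops available:",
      has_dyn_quant_4bit_ops())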

SUPPORTED_QUANT_TYPES class-attribute instance-attribute

SUPPORTED_QUANT_TYPES = [int4]

apply_weights

apply_weights(
    layer: Module, x: Tensor, bias: Optional[Tensor] = None
) -> Tensor
Source code in vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py
def apply_weights(self,
                  layer: torch.nn.Module,
                  x: torch.Tensor,
                  bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    c = self.config
    x_2d = x.reshape(-1, x.shape[-1])
    out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )

    w_q = getattr(layer, self.w_q_name)
    output = torch.ops.aten._dyn_quant_matmul_4bit(
        x_2d, w_q, c.group_size, c.partition_weight_shape[0],
        c.partition_weight_shape[1])
    return output.reshape(out_shape)
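
For intuition only, the sketch below mirrors the flatten-to-2D / restore-shape pattern used above, with a plain torch.nn.functional.linear standing in for _dyn_quant_matmul_4bit (illustrative shapes, not vLLM code):

import torch
import torch.nn.functional as F

x = torch.randn(2, 5, 64)           # (batch, seq, in_features)
w = torch.randn(128, 64)            # (out_features, in_features)

x_2d = x.reshape(-1, x.shape[-1])   # collapse leading dims: (batch * seq, in_features)
out_2d = F.linear(x_2d, w)          # stand-in for the 4-bit dynamic-quant matmul
out = out_2d.reshape(x.shape[:-1] + (w.shape[0], ))
assert out.shape == (2, 5, 128)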

can_implement classmethod

can_implement(
    c: MPLinearLayerConfig,
) -> tuple[bool, Optional[str]]
Source code in vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py
@classmethod
def can_implement(cls,
                  c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
    if not current_platform.is_cpu():
        return False, "Only CPU is supported"
    if c.weight_type not in cls.SUPPORTED_QUANT_TYPES:
        return False, f"Unsupported quant type {c.weight_type}"
    if current_platform.get_cpu_architecture(
    ) == CpuArchEnum.ARM and c.act_type not in [
            torch.float32,
    ]:
        return False, "Dynamic4bitLinearKernel on Arm requires"\
            " Float32 activations"
    if c.full_weight_shape[0] % c.group_size != 0:
        return False, f"Group size ({c.group_size}) does not evenly divide"\
            " the number of input features "\
            f"({c.full_weight_shape[0]})"
    if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
        try:
            # Attempt to retrieve the operation
            _ = torch.ops.aten._dyn_quant_matmul_4bit
        except AttributeError:
            return False, f"PyTorch {torch.__version__} does not support"\
                " _dyn_quant_matmul_4bit. Install a newer version"
    return True, None
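
A minimal calling sketch (assumes vLLM is installed; the SimpleNamespace below is a duck-typed stand-in carrying only the attributes the check reads, and the values shown are hypothetical; real callers should construct an MPLinearLayerConfig):

from types import SimpleNamespace

import torch
from vllm.model_executor.layers.quantization.kernels.mixed_precision.dynamic_4bit import (
    Dynamic4bitLinearKernel)

cfg = SimpleNamespace(
    weight_type=Dynamic4bitLinearKernel.SUPPORTED_QUANT_TYPES[0],  # int4
    act_type=torch.float32,          # required on Arm
    group_size=32,                   # must evenly divide the input features
    full_weight_shape=(4096, 4096),  # (input features, output features)
)

ok, reason = Dynamic4bitLinearKernel.can_implement(cfg)
print(ok, reason)  # (False, "Only CPU is supported") unless running on a CPU platform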

get_min_capability classmethod

get_min_capability() -> int
Source code in vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py
@classmethod
def get_min_capability(cls) -> int:
    return 1

process_weights_after_loading

process_weights_after_loading(layer: Module)
Source code in vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py
def process_weights_after_loading(self, layer: torch.nn.Module):
    c = self.config
    packed_weight = getattr(layer, self.w_q_name)
    packed_weight = packed_weight.add(8)
    uint8_packed = (packed_weight[::, 1::2] << 4
                    | packed_weight[::, ::2]).to(torch.uint8)

    scales = getattr(layer, self.w_s_name)
    block_size = c.group_size

    # Handle scaling factors for partitioned weights
    if block_size == c.partition_weight_shape[0]:
        scales = scales.to(
            torch.float32
        )  # Float32 & Bfloat16 variants require float32 scales
        scales = scales.view(-1, 1)  # Channel-wise scales
        if layer.bias is not None:
            layer.bias = layer.bias.to(
                torch.float32
            )  # Float32 & Bfloat16 variants require float32 bias
    else:
        # KleidiAI kernel requires bfloat16 scales with groupwise scheme
        scales = scales.to(torch.bfloat16)

    # Repack weights as per kernel requirement
    w = torch.ops.aten._dyn_quant_pack_4bit_weight(
        uint8_packed, scales, layer.bias, block_size,
        c.partition_weight_shape[0], c.partition_weight_shape[1])
    replace_parameter(layer, self.w_q_name,
                      torch.nn.Parameter(w, requires_grad=False))
    setattr(layer, self.w_s_name, None)
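
The nibble-packing step above can be illustrated in isolation (illustrative values, not vLLM code): signed int4 weights in [-8, 7] are offset by +8 into [0, 15], then each pair of adjacent columns is packed into one uint8 with the odd column in the high nibble, halving the last dimension:

import torch

int4_vals = torch.tensor([[-8, 7, 0, -1]])   # one row, four int4 weights
shifted = int4_vals.add(8)                   # [[0, 15, 8, 7]]
packed = (shifted[:, 1::2] << 4 | shifted[:, ::2]).to(torch.uint8)
print(packed)                                # tensor([[240, 120]], dtype=torch.uint8)
assert packed.shape[-1] == int4_vals.shape[-1] // 2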