vllm.model_executor.layers.quantization.utils.marlin_utils
_check_marlin_supported ¶
_check_marlin_supported(
    quant_type: ScalarType,
    group_size: Optional[int],
    has_zp: bool,
    device_capability: Optional[int] = None,
) -> tuple[bool, Optional[str]]
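Internal worker behind check_marlin_supported and verify_marlin_supported; it reports both a verdict and a human-readable reason. A minimal sketch, assuming the scalar_types constants from vllm.scalar_type and the integer capability encoding major * 10 + minor:

    from vllm.scalar_type import scalar_types
    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        _check_marlin_supported,
    )

    # uint4b8: 4-bit unsigned with an implicit bias of 8 (GPTQ-style symmetric)
    ok, reason = _check_marlin_supported(
        quant_type=scalar_types.uint4b8,
        group_size=128,
        has_zp=False,          # GPTQ checkpoints carry no explicit zero points
        device_capability=80,  # sm_80; assumed encoding major * 10 + minor
    )
    if not ok:
        print(f"Marlin kernel unavailable: {reason}")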
apply_awq_marlin_linear ¶
apply_awq_marlin_linear(
    input: Tensor,
    weight: Tensor,
    weight_scale: Tensor,
    weight_zp: Tensor,
    g_idx: Tensor,
    g_idx_sort_indices: Tensor,
    workspace: Tensor,
    quant_type: ScalarType,
    output_size_per_partition: int,
    input_size_per_partition: int,
    bias: Optional[Tensor] = None,
    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
) -> Tensor
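A sketch of how this call might sit in a quantized linear method's forward path. The layer attribute names are illustrative placeholders for tensors produced at weight-loading time by the repacking helpers in this module, not a fixed API:

    import torch
    from vllm.scalar_type import scalar_types
    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        apply_awq_marlin_linear,
    )

    def awq_marlin_forward(layer, x: torch.Tensor) -> torch.Tensor:
        # Attribute names are hypothetical; the tensors come from the
        # AWQ -> Marlin repack (awq_to_marlin_zero_points,
        # marlin_permute_scales, marlin_make_workspace, ...).
        return apply_awq_marlin_linear(
            input=x,
            weight=layer.marlin_qweight,
            weight_scale=layer.marlin_scales,
            weight_zp=layer.marlin_zp,
            g_idx=layer.g_idx,                  # empty: AWQ has no act_order
            g_idx_sort_indices=layer.g_idx_sort_indices,
            workspace=layer.workspace,
            quant_type=scalar_types.uint4,      # AWQ: 4-bit with zero points
            output_size_per_partition=layer.output_size_per_partition,
            input_size_per_partition=layer.input_size_per_partition,
            bias=layer.bias,
        )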
apply_gptq_marlin_linear ¶
apply_gptq_marlin_linear(
    input: Tensor,
    weight: Tensor,
    weight_scale: Tensor,
    weight_zp: Tensor,
    g_idx: Tensor,
    g_idx_sort_indices: Tensor,
    workspace: Tensor,
    wtype: ScalarType,
    output_size_per_partition: int,
    input_size_per_partition: int,
    is_k_full: bool,
    bias: Optional[Tensor] = None,
    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
) -> Tensor
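Same pattern as the AWQ variant, except the quantization type is passed as wtype and is_k_full tells the kernel whether it sees the full K dimension (relevant with act_order under tensor parallelism; see marlin_is_k_full). An illustrative sketch with hypothetical attribute names:

    import torch
    from vllm.scalar_type import scalar_types
    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        apply_gptq_marlin_linear,
    )

    def gptq_marlin_forward(layer, x: torch.Tensor) -> torch.Tensor:
        # Attribute names are hypothetical; the tensors come from the
        # GPTQ -> Marlin repack done at weight-loading time.
        return apply_gptq_marlin_linear(
            input=x,
            weight=layer.marlin_qweight,
            weight_scale=layer.marlin_scales,
            weight_zp=layer.marlin_zp,          # empty tensor for GPTQ
            g_idx=layer.g_idx,
            g_idx_sort_indices=layer.g_idx_sort_indices,
            workspace=layer.workspace,
            wtype=scalar_types.uint4b8,         # GPTQ 4-bit, bias 8
            output_size_per_partition=layer.output_size_per_partition,
            input_size_per_partition=layer.input_size_per_partition,
            is_k_full=layer.is_k_full,
            bias=layer.bias,
        )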
awq_to_marlin_zero_points ¶
awq_to_marlin_zero_points(
    q_zp_packed: Tensor,
    size_k: int,
    size_n: int,
    num_bits: int,
) -> Tensor
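Unpacks AWQ's int32-packed zero points and re-lays them out for the Marlin kernel. In the sketch below, the shape conventions (one packed row per quantization group, so size_k counts groups rather than raw input features) are inferred from typical AWQ checkpoints:

    import torch
    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        awq_to_marlin_zero_points,
    )

    num_bits = 4
    pack_factor = 32 // num_bits        # 8 nibbles per int32
    in_features, out_features, group_size = 4096, 4096, 128
    num_groups = in_features // group_size

    # AWQ packs zero points along the output dimension, one row per group.
    q_zp_packed = torch.randint(
        0, 2**31 - 1,
        (num_groups, out_features // pack_factor),
        dtype=torch.int32,
    )
    marlin_zp = awq_to_marlin_zero_points(
        q_zp_packed, size_k=num_groups, size_n=out_features, num_bits=num_bits
    )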
check_marlin_supported ¶
check_marlin_supported(
    quant_type: ScalarType,
    group_size: int,
    has_zp: bool = False,
    device_capability: Optional[int] = None,
) -> bool
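Boolean convenience wrapper over _check_marlin_supported that discards the reason string. A minimal check, assuming the scalar_types constants from vllm.scalar_type:

    from vllm.scalar_type import scalar_types
    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        check_marlin_supported,
    )

    # AWQ-style 4-bit with zero points, group size 128:
    if check_marlin_supported(scalar_types.uint4, group_size=128, has_zp=True):
        print("Marlin kernel can be used")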
check_marlin_supports_layer ¶
check_marlin_supports_layer(
    layer: LinearBase, group_size: int
) -> bool
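Checks a concrete LinearBase layer, rather than raw shapes, against the Marlin constraints. A sketch of the per-layer fallback pattern this enables; the backend names are illustrative:

    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        check_marlin_supports_layer,
    )

    def pick_linear_backend(layer, group_size: int) -> str:
        # Use the Marlin kernel when this layer's partition sizes fit the
        # Marlin tiles; otherwise fall back to a reference implementation.
        if check_marlin_supports_layer(layer, group_size):
            return "marlin"
        return "reference"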
check_marlin_supports_shape ¶
check_marlin_supports_shape(
    output_size_per_partition: int,
    input_size_per_partition: int,
    input_size: int,
    group_size: int,
) -> tuple[bool, Optional[str]]
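The shape-level check behind the layer-level helpers, returning a verdict plus a reason. For example, a 4096 -> 11008 projection with its output dimension sharded over two ranks:

    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        check_marlin_supports_shape,
    )

    ok, reason = check_marlin_supports_shape(
        output_size_per_partition=11008 // 2,
        input_size_per_partition=4096,
        input_size=4096,    # full (unsharded) K, for group-alignment checks
        group_size=128,
    )
    if not ok:
        print(reason)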
check_moe_marlin_supports_layer ¶
check_moe_marlin_supports_layer(
    layer: LinearBase, group_size: int
) -> bool
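Fused-MoE counterpart of check_marlin_supports_layer, validating per-expert weight shapes. A sketch mirroring the fallback pattern above; backend names are again illustrative:

    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        check_moe_marlin_supports_layer,
    )

    def pick_moe_backend(layer, group_size: int) -> str:
        if check_moe_marlin_supports_layer(layer, group_size):
            return "marlin_moe"
        return "fused_moe_fallback"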
get_scale_perms ¶
marlin_is_k_full ¶
marlin_make_empty_g_idx ¶
marlin_make_empty_zp ¶
marlin_make_workspace ¶
marlin_make_workspace_new ¶
marlin_moe_permute_scales ¶
marlin_permute_bias ¶
marlin_permute_scales ¶
marlin_repeat_scales_on_all_ranks ¶
marlin_repeat_scales_on_all_ranks(
    act_order: bool, group_size: int, is_row_parallel: bool
) -> bool
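Decides whether quantization scales must be loaded in full on every rank instead of being sharded. Assuming the usual replication rule (replicate when act_order is set, or when channelwise scales meet a row-parallel K shard), the calls below cover the three cases:

    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        marlin_repeat_scales_on_all_ranks,
    )

    # act_order permutes the full K dimension, so scales cannot be sharded:
    print(marlin_repeat_scales_on_all_ranks(True, 128, is_row_parallel=True))
    # channelwise scales (group_size == -1) have a single row along K, which
    # cannot be split when a row-parallel layer shards K:
    print(marlin_repeat_scales_on_all_ranks(False, -1, is_row_parallel=True))
    # grouped scales without act_order shard along K together with the weights:
    print(marlin_repeat_scales_on_all_ranks(False, 128, is_row_parallel=True))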
marlin_sort_g_idx ¶
marlin_zero_points ¶
maybe_warn_marlin_atomic_add ¶
maybe_warn_marlin_atomic_add_env ¶
moe_awq_to_marlin_zero_points ¶
query_marlin_supported_quant_types ¶
query_marlin_supported_quant_types(
    has_zp: Optional[bool] = None,
    include_fp_type: bool = True,
    device_capability: Optional[int] = None,
)
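Enumerates the quantization types the Marlin kernel can handle on a given device. From the defaults, has_zp=None appears to place no constraint on zero-point support, and include_fp_type controls whether floating-point quantization types are included. Sketch:

    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        query_marlin_supported_quant_types,
    )

    # Everything supported on an sm_90 device:
    print(query_marlin_supported_quant_types(device_capability=90))

    # Only integer types that use zero points (AWQ-style):
    print(query_marlin_supported_quant_types(has_zp=True, include_fp_type=False))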
should_use_atomic_add_reduce ¶
verify_marlin_supported ¶
verify_marlin_supported(
    quant_type: ScalarType,
    group_size: int,
    has_zp: bool = False,
) -> None
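Raising counterpart to check_marlin_supported: returns None on success and raises with the reason on failure, which suits config validation at load time; the device check presumably runs against the current GPU's capability. For example:

    from vllm.scalar_type import scalar_types
    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        verify_marlin_supported,
    )

    # Raises with a descriptive message if this combination cannot run:
    verify_marlin_supported(scalar_types.uint4b8, group_size=128, has_zp=False)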
verify_marlin_supports_shape ¶
verify_marlin_supports_shape(
    output_size_per_partition: int,
    input_size_per_partition: int,
    input_size: int,
    group_size: int,
) -> None
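Raising counterpart to check_marlin_supports_shape, for call sites where an unsupported shape should abort loading:

    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
        verify_marlin_supports_shape,
    )

    verify_marlin_supports_shape(
        output_size_per_partition=5504,
        input_size_per_partition=4096,
        input_size=4096,
        group_size=128,
    )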