Skip to content

vllm.model_executor.layers.quantization.utils.nvfp4_moe_support

__all__ module-attribute

__all__ = ['detect_nvfp4_moe_support', 'NvFp4Support']

_logger module-attribute

_logger = init_logger(__name__)

NvFp4Support dataclass

Result container for NV-FP4 capability probing.

Source code in vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
@dataclass(frozen=True)
class NvFp4Support:
    """Result container for NV-FP4 capability probing."""

    cutlass_supported: bool
    allow_flashinfer: bool
    use_marlin: bool

allow_flashinfer instance-attribute

allow_flashinfer: bool

cutlass_supported instance-attribute

cutlass_supported: bool

use_marlin instance-attribute

use_marlin: bool

__init__

__init__(
    cutlass_supported: bool,
    allow_flashinfer: bool,
    use_marlin: bool,
) -> None

detect_nvfp4_moe_support

detect_nvfp4_moe_support(
    class_name: str = "",
) -> NvFp4Support

Detect platform support for NV-FP4 fused-MoE path

Source code in vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
    """Detect platform support for NV-FP4 fused-MoE path"""
    cutlass_supported = cutlass_fp4_supported()

    allow_flashinfer = (cutlass_supported
                        and is_flashinfer_fp4_cutlass_moe_available())

    if allow_flashinfer:
        _logger.info_once("Using FlashInfer kernels for %s.", class_name
                          or "NVFP4 path")
    else:
        if envs.VLLM_USE_FLASHINFER_MOE_FP4:
            _logger.warning_once(
                "FlashInfer kernels unavailable for %s on current platform.",
                class_name or "NVFP4 path",
            )

    use_marlin = False
    if not cutlass_supported:
        if is_fp4_marlin_supported():
            use_marlin = True
            _logger.info_once("Falling back to Marlin FP4 MoE kernel.")
        else:
            raise ValueError(
                "Current platform does not support NVFP4 quantization. "
                "Please use Blackwell GPUs or enable FlashInfer.")

    return NvFp4Support(
        cutlass_supported=cutlass_supported,
        allow_flashinfer=allow_flashinfer,
        use_marlin=use_marlin,
    )