vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe

Utility helpers for the NVFP4 + FlashInfer fused-MoE path.

__all__ module-attribute

__all__ = [
    "is_flashinfer_fp4_cutlass_moe_available",
    "reorder_w1w3_to_w3w1",
    "build_flashinfer_fp4_cutlass_moe_prepare_finalize",
]

build_flashinfer_fp4_cutlass_moe_prepare_finalize

build_flashinfer_fp4_cutlass_moe_prepare_finalize(
    moe: FusedMoEConfig, a1_gscale: Tensor
) -> FusedMoEPrepareAndFinalize

Create a FlashInfer CUTLASS fused-MoE prepare/finalize kernel.

Source code in vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
def build_flashinfer_fp4_cutlass_moe_prepare_finalize(
    moe: FusedMoEConfig,
    a1_gscale: torch.Tensor,
) -> mk.FusedMoEPrepareAndFinalize:
    """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel"""
    use_dp = moe.moe_parallel_config.dp_size > 1
    return FlashInferCutlassMoEPrepareAndFinalize(use_dp, a1_gscale=a1_gscale)
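
A minimal construction sketch, assuming `moe` is an already-populated FusedMoEConfig and `a1_gscale` is the global activation scale supplied by the calling quantization method (both names are placeholders here):

prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(
    moe, a1_gscale=a1_gscale)
# With dp_size > 1 in moe.moe_parallel_config, the returned object takes the
# data-parallel path; otherwise it runs single-rank.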

is_flashinfer_fp4_cutlass_moe_available

is_flashinfer_fp4_cutlass_moe_available() -> bool

Return True when FlashInfer CUTLASS NV-FP4 kernels can be used.

Source code in vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
def is_flashinfer_fp4_cutlass_moe_available() -> bool:
    """Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used."""
    return (envs.VLLM_USE_FLASHINFER_MOE_FP4
            and has_flashinfer_cutlass_fused_moe()
            and current_platform.is_cuda()
            # Device capability 100 == compute capability 10.0 (Blackwell).
            and current_platform.is_device_capability(100))
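
Since the check reads VLLM_USE_FLASHINFER_MOE_FP4, the opt-in must be set in the environment before it runs; a minimal guard sketch:

import os

# Opt in before the check runs (vLLM reads its env vars lazily). The other
# conditions (flashinfer installed, CUDA Blackwell GPU) come from the runtime.
os.environ["VLLM_USE_FLASHINFER_MOE_FP4"] = "1"

if not is_flashinfer_fp4_cutlass_moe_available():
    raise RuntimeError(
        "FlashInfer CUTLASS NV-FP4 fused-MoE kernels are unavailable")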

reorder_w1w3_to_w3w1

reorder_w1w3_to_w3w1(
    weight: Tensor, scale: Tensor, dim: int = -2
) -> tuple[Tensor, Tensor]

Re-order the concatenated [w1, w3] tensors to [w3, w1].

Source code in vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
def reorder_w1w3_to_w3w1(weight: torch.Tensor,
                         scale: torch.Tensor,
                         dim: int = -2) -> tuple[torch.Tensor, torch.Tensor]:
    """Re-order the concatenated `[w1, w3]` tensors to `[w3, w1]`"""
    size = weight.size(dim)
    assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
    half = size // 2

    w1, w3 = weight.split(half, dim=dim)
    s1, s3 = scale.split(half, dim=dim)

    return (torch.cat([w3, w1], dim=dim).contiguous(),
            torch.cat([s3, s1], dim=dim).contiguous())
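
A small worked example (shapes are illustrative only): the two halves along `dim` simply swap places.

import torch

# Illustrative shapes: 2 experts, w1/w3 stacked along dim -2 (4 rows each),
# hidden size 8; the scales are shaped like the weights for simplicity.
weight = torch.arange(2 * 8 * 8, dtype=torch.float32).reshape(2, 8, 8)
scale = torch.ones(2, 8, 8)

w, s = reorder_w1w3_to_w3w1(weight, scale, dim=-2)

# The former w3 half now leads, followed by the former w1 half.
assert torch.equal(w[:, :4], weight[:, 4:])
assert torch.equal(w[:, 4:], weight[:, :4])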

select_nvfp4_gemm_impl

select_nvfp4_gemm_impl(
    moe: FusedMoEConfig,
    g1_alphas: Tensor,
    g2_alphas: Tensor,
    a1_gscale: Tensor,
    a2_gscale: Tensor,
    allow_flashinfer: bool,
) -> FusedMoEPermuteExpertsUnpermute

Return a GEMM experts implementation for NV-FP4 fused-MoE layers.

Source code in vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
def select_nvfp4_gemm_impl(
    moe: FusedMoEConfig,
    g1_alphas: torch.Tensor,
    g2_alphas: torch.Tensor,
    a1_gscale: torch.Tensor,
    a2_gscale: torch.Tensor,
    allow_flashinfer: bool,
) -> mk.FusedMoEPermuteExpertsUnpermute:
    """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers"""

    if allow_flashinfer:
        return FlashInferExperts(
            g1_alphas=g1_alphas,
            g2_alphas=g2_alphas,
            a1_gscale=a1_gscale,
            a2_gscale=a2_gscale,
            out_dtype=moe.in_dtype,
            quant_dtype="nvfp4",
            ep_rank=moe.moe_parallel_config.ep_rank,
            ep_size=moe.moe_parallel_config.ep_size,
            tp_rank=moe.moe_parallel_config.tp_rank,
            tp_size=moe.moe_parallel_config.tp_size,
        )

    # The native CutlassExpertsFp4 path doesn't support DP, and the TP-only
    # case never reaches this helper.
    raise ValueError(
        "CutlassExpertsFp4 doesn't support DP. Use flashinfer CUTLASS "
        "Fused MoE backend instead (set VLLM_USE_FLASHINFER_MOE_FP4=1)")