vllm.utils.flashinfer
Compatibility wrapper for FlashInfer API changes.
Users of vLLM should always import only these wrappers.
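A minimal sketch of the intended import pattern (the availability guard is an assumption about typical call sites, not code from this module):

from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe, has_flashinfer

# Import the vLLM wrappers, never flashinfer itself: the wrappers resolve the
# real FlashInfer symbols lazily and degrade gracefully when the package is absent.
if has_flashinfer():
    moe_fn = flashinfer_cutlass_fused_moe  # resolves flashinfer.fused_moe.cutlass_fused_moe on first call
else:
    moe_fn = None  # fall back to a non-FlashInfer code path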
FLASHINFER_CUBINS_REPOSITORY module-attribute
FLASHINFER_CUBINS_REPOSITORY = get(
    "FLASHINFER_CUBINS_REPOSITORY",
    "https://edge.urm.nvidia.com/artifactory/sw-kernelinferencelibrary-public-generic-local/",
)
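The repository can be redirected to a mirror through the same environment variable. Because the default above is read at module level, the variable must be set before vllm.utils.flashinfer is first imported; the mirror URL below is a hypothetical placeholder.

import os

# Hypothetical mirror; set before the first import of vllm.utils.flashinfer.
os.environ["FLASHINFER_CUBINS_REPOSITORY"] = (
    "https://artifacts.example.com/flashinfer-cubins/"
)

from vllm.utils.flashinfer import FLASHINFER_CUBINS_REPOSITORY

print(FLASHINFER_CUBINS_REPOSITORY)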
__all__ module-attribute
__all__ = [
    "has_flashinfer",
    "flashinfer_trtllm_fp8_block_scale_moe",
    "flashinfer_cutlass_fused_moe",
    "fp4_quantize",
    "nvfp4_block_scale_interleave",
    "trtllm_fp4_block_scale_moe",
    "autotune",
    "has_flashinfer_moe",
    "has_flashinfer_cutlass_fused_moe",
    "has_nvidia_artifactory",
    "supports_trtllm_attention",
    "use_trtllm_attention",
    "flashinfer_scaled_fp4_mm",
    "flashinfer_scaled_fp8_mm",
]
autotune module-attribute
autotune = _lazy_import_wrapper(
    "flashinfer.autotuner",
    "autotune",
    fallback_fn=lambda *args, **kwargs: nullcontext(),
)
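Because the fallback returns nullcontext(), call sites can enter the context unconditionally; whether FlashInfer is installed only decides whether tuning actually happens. A hedged sketch follows; which arguments flashinfer.autotuner.autotune accepts is not documented here, so the no-argument call is an assumption.

from vllm.utils.flashinfer import autotune


def run_workload() -> None:
    """Hypothetical FlashInfer-backed computation."""


# Arguments (if any) are forwarded to flashinfer.autotuner.autotune when it is
# importable; otherwise the wrapper returns a no-op nullcontext().
with autotune():
    run_workload()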
flashinfer_cutlass_fused_moe module-attribute
flashinfer_cutlass_fused_moe = _lazy_import_wrapper(
    "flashinfer.fused_moe", "cutlass_fused_moe"
)
flashinfer_trtllm_fp8_block_scale_moe module-attribute
flashinfer_trtllm_fp8_block_scale_moe = (
    _lazy_import_wrapper(
        "flashinfer.fused_moe", "trtllm_fp8_block_scale_moe"
    )
)
flashinfer_trtllm_fp8_per_tensor_scale_moe module-attribute
flashinfer_trtllm_fp8_per_tensor_scale_moe = (
    _lazy_import_wrapper(
        "flashinfer.fused_moe",
        "trtllm_fp8_per_tensor_scale_moe",
    )
)
nvfp4_block_scale_interleave module-attribute
nvfp4_block_scale_interleave = _lazy_import_wrapper(
    "flashinfer", "nvfp4_block_scale_interleave"
)
trtllm_fp4_block_scale_moe module-attribute
trtllm_fp4_block_scale_moe = _lazy_import_wrapper(
    "flashinfer", "trtllm_fp4_block_scale_moe"
)
_get_submodule
Safely import a submodule and return it, or None if not available.
_lazy_import_wrapper
_lazy_import_wrapper(
    module_name: str,
    attr_name: str,
    fallback_fn: Callable[..., Any] = _missing,
)
Create a lazy import wrapper for a specific function.
_missing
Placeholder for unavailable FlashInfer backend.
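The three private helpers above are the machinery behind every wrapper in this module: _get_submodule imports a module defensively, _lazy_import_wrapper defers attribute resolution until the first call, and _missing is the default fallback. A rough sketch of the pattern (names suffixed _sketch are illustrative; the real implementation in vllm/utils/flashinfer.py may differ, e.g. in caching):

import importlib
from typing import Any, Callable


def _missing_sketch(*_args: Any, **_kwargs: Any) -> Any:
    """Fallback used when FlashInfer (or one of its submodules) is unavailable."""
    raise RuntimeError(
        "FlashInfer backend is not available; install flashinfer to use this function."
    )


def _lazy_import_wrapper_sketch(
    module_name: str,
    attr_name: str,
    fallback_fn: Callable[..., Any] = _missing_sketch,
) -> Callable[..., Any]:
    """Return a callable that resolves module_name.attr_name on first use."""

    def wrapper(*args: Any, **kwargs: Any) -> Any:
        try:
            module = importlib.import_module(module_name)
            fn = getattr(module, attr_name)
        except (ImportError, AttributeError):
            # Import or lookup failed: defer to the fallback instead of raising here.
            return fallback_fn(*args, **kwargs)
        return fn(*args, **kwargs)

    return wrapper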
bmm_fp8
bmm_fp8(
    A: Tensor,
    B: Tensor,
    A_scale: Tensor,
    B_scale: Tensor,
    dtype: dtype,
    backend: str,
) -> Tensor
bmm_fp8_fake
bmm_fp8_fake(
    A: Tensor,
    B: Tensor,
    A_scale: Tensor,
    B_scale: Tensor,
    dtype: dtype,
    backend: str,
) -> Tensor
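bmm_fp8_fake mirrors the real op's signature; in vLLM, such _fake variants are typically registered as the fake (meta) implementation of the corresponding custom op so torch.compile and fake-tensor tracing can propagate shapes and dtypes without launching kernels. A sketch of what a fake implementation of this shape usually computes; the (b, m, k) @ (b, k, n) batched layout is an assumption.

import torch


def bmm_fp8_fake_sketch(
    A: torch.Tensor,
    B: torch.Tensor,
    A_scale: torch.Tensor,
    B_scale: torch.Tensor,
    dtype: torch.dtype,
    backend: str,
) -> torch.Tensor:
    # Fake/meta impls only describe the output: no FP8 matmul is executed.
    # Assumed layout: A is (b, m, k), B is (b, k, n) -> output (b, m, n).
    return torch.empty(
        (A.shape[0], A.shape[1], B.shape[2]), dtype=dtype, device=A.device
    )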
flashinfer_mm_fp4
flashinfer_mm_fp4(
    A: Tensor,
    B: Tensor,
    A_scale: Tensor,
    B_scale: Tensor,
    g_scale: Tensor,
    dtype: dtype,
    backend: str,
) -> Tensor
flashinfer_mm_fp4_fake
flashinfer_mm_fp4_fake(
    A: Tensor,
    B: Tensor,
    A_scale: Tensor,
    B_scale: Tensor,
    g_scale: Tensor,
    dtype: dtype,
    backend: str,
) -> Tensor
flashinfer_scaled_fp4_mm
flashinfer_scaled_fp4_mm(
    a: Tensor,
    b: Tensor,
    block_scale_a: Tensor,
    block_scale_b: Tensor,
    alpha: Tensor,
    out_dtype: dtype,
    backend: str,
) -> Tensor
flashinfer_scaled_fp8_mm
flashinfer_scaled_fp8_mm(
    a: Tensor,
    b: Tensor,
    scale_a: Tensor,
    scale_b: Tensor,
    out_dtype: dtype,
    bias: Optional[Tensor] = None,
) -> Tensor
has_flashinfer cached
has_flashinfer() -> bool
Return True if FlashInfer is available.
has_flashinfer_cutlass_fused_moe cached
has_flashinfer_cutlass_fused_moe() -> bool
Return True if FlashInfer CUTLASS fused MoE is available.
has_flashinfer_moe cached
has_flashinfer_moe() -> bool
Return True if FlashInfer MoE module is available.
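The availability helpers compose into a simple capability check before choosing a MoE path. The guard below is illustrative only, not vLLM's actual dispatch code:

from vllm.utils.flashinfer import (
    has_flashinfer,
    has_flashinfer_cutlass_fused_moe,
    has_flashinfer_moe,
)


def can_use_flashinfer_cutlass_moe() -> bool:
    # Check the package, then the fused_moe submodule, then the specific
    # CUTLASS entry point, from coarsest to finest.
    return (
        has_flashinfer()
        and has_flashinfer_moe()
        and has_flashinfer_cutlass_fused_moe()
    )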
has_nvidia_artifactory cached
has_nvidia_artifactory() -> bool
Return True if NVIDIA's artifactory is accessible.
This checks connectivity to the kernel inference library artifactory, which is required for downloading certain cubin kernels like TRTLLM FMHA.
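For illustration, a reachability probe of this kind can be written against FLASHINFER_CUBINS_REPOSITORY as sketched below. This is not the module's actual check; the timeout and the success criterion (HTTP errors still count as reachable, only network-level failures do not) are assumptions.

import urllib.error
import urllib.request

from vllm.utils.flashinfer import FLASHINFER_CUBINS_REPOSITORY


def artifactory_reachable_sketch(timeout_s: float = 2.0) -> bool:
    """Best-effort probe: can the cubin repository host be reached at all?"""
    try:
        urllib.request.urlopen(FLASHINFER_CUBINS_REPOSITORY, timeout=timeout_s)
        return True
    except urllib.error.HTTPError:
        # The server answered (e.g. 403/404 for the bare path): host is reachable.
        return True
    except Exception:
        # DNS failure, timeout, connection refused, etc.
        return False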
supports_trtllm_attention cached
Cache the result, which depends only on the environment.
use_trtllm_attention
use_trtllm_attention(
    num_qo_heads: int,
    num_kv_heads: int,
    num_tokens: int,
    max_seq_len: int,
    kv_cache_dtype: str,
    q_dtype: dtype,
    is_prefill: bool,
    has_sinks: bool = False,
) -> bool
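A hedged example of querying this heuristic; every numeric value and dtype choice below is a placeholder standing in for the runtime batch and model properties a caller would supply, and "auto" is one of vLLM's kv_cache_dtype strings.

import torch

from vllm.utils.flashinfer import use_trtllm_attention

# Per-batch decision: returns True only when the environment and the given
# shapes/dtypes make the TRTLLM attention path usable.
use_trtllm = use_trtllm_attention(
    num_qo_heads=32,
    num_kv_heads=8,
    num_tokens=256,
    max_seq_len=4096,
    kv_cache_dtype="auto",
    q_dtype=torch.bfloat16,
    is_prefill=False,
    has_sinks=False,
)
print("Use TRTLLM attention:", use_trtllm)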