vllm.model_executor.layers.fused_moe.deep_gemm_utils
Taken from https://github.com/ModelTC/LightLLM/blob/8ed97c74c18f11505b048b1ba00ba5c0cef8bff6/lightllm/common/fused_moe/deepep_scatter_gather.py and updated to fit vLLM's needs and terminology.
_fwd_kernel_ep_gather ¶
_fwd_kernel_ep_gather(
total_token_num,
input_tensor,
input_tensor_stride0,
input_tensor_stride1,
recv_topk_ids,
recv_topk_ids_stride0,
recv_topk_ids_stride1,
recv_topk_weight,
recv_topk_weight_stride0,
recv_topk_weight_stride1,
input_index,
input_index_stride0,
input_index_stride1,
output_tensor,
output_tensor_stride0,
output_tensor_stride1,
topk_num: constexpr,
expert_map,
HAS_EXPERT_MAP: constexpr,
BLOCK_D: constexpr,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
_fwd_kernel_ep_scatter_1 ¶
_fwd_kernel_ep_scatter_1(
num_recv_tokens_per_expert,
expert_start_loc,
m_indices,
num_experts: constexpr,
BLOCK_E: constexpr,
BLOCK_EXPERT_NUM: constexpr,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
_fwd_kernel_ep_scatter_2 ¶
_fwd_kernel_ep_scatter_2(
total_token_num,
expert_start_loc,
recv_x,
recv_x_stride0,
recv_x_stride1,
recv_x_scale,
recv_x_scale_stride0,
recv_x_scale_stride1,
recv_topk,
recv_topk_stride0,
recv_topk_stride1,
output_tensor,
output_tensor_stride0,
output_tensor_stride1,
output_tensor_scale,
output_tensor_scale_stride0,
output_tensor_scale_stride1,
output_index,
output_index_stride0,
output_index_stride1,
topk_num: constexpr,
expert_map,
HAS_EXPERT_MAP: constexpr,
HIDDEN_SIZE: constexpr,
HIDDEN_SIZE_PAD: constexpr,
SCALE_HIDDEN_SIZE: constexpr,
SCALE_HIDDEN_SIZE_PAD: constexpr,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
apply_expert_map ¶
compute_aligned_M ¶
compute_aligned_M(
M: int,
num_topk: int,
local_num_experts: int,
alignment: int,
expert_tokens_meta: Optional[ExpertTokensMetadata],
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
deep_gemm_block_shape (cached) ¶
deepgemm_moe_permute ¶
deepgemm_moe_permute(
aq: Tensor,
aq_scale: Tensor,
topk_ids: Tensor,
local_num_experts: int,
expert_map: Optional[Tensor],
expert_tokens_meta: Optional[ExpertTokensMetadata],
aq_out: Optional[Tensor] = None,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
deepgemm_unpermute_and_reduce ¶
deepgemm_unpermute_and_reduce(
a: Tensor,
topk_ids: Tensor,
topk_weights: Tensor,
inv_perm: Tensor,
expert_map: Optional[Tensor],
output: Tensor,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
ep_gather ¶
ep_gather(
input_tensor: Tensor,
recv_topk_ids: Tensor,
recv_topk_weight: Tensor,
input_index: Tensor,
expert_map: Optional[Tensor],
output_tensor: Tensor,
)
Source code in vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
ep_scatter ¶
ep_scatter(
recv_x: Tensor,
recv_x_scale: Tensor,
recv_topk: Tensor,
num_recv_tokens_per_expert: Tensor,
expert_map: Optional[Tensor],
expert_start_loc: Tensor,
output_tensor: Tensor,
output_tensor_scale: Tensor,
m_indices: Tensor,
output_index: Tensor,
)