vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe
flashinfer_fused_moe_bf16

flashinfer_fused_moe_bf16(
    routing_logits: Tensor,
    routing_bias: Tensor | None,
    hidden_states: Tensor,
    gemm1_weights: Tensor,
    gemm2_weights: Tensor,
    num_experts: int,
    top_k: int,
    n_group: int | None,
    topk_group: int | None,
    intermediate_size: int,
    local_expert_offset: int,
    local_num_experts: int,
    routing_method_type: int = RoutingMethodType.Renormalize,
    tune_max_num_tokens: int = 8192,
) -> Tensor
Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
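A minimal single-GPU usage sketch. The sizes, the float32 routing-logits dtype, and the assumption that gemm1_weights fuses the gate and up projections (hence the leading 2 * intermediate_size dimension) are illustrative, not taken from the source; the op itself requires FlashInfer and a supported NVIDIA GPU.

```python
import torch

from vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe import (
    flashinfer_fused_moe_bf16,
)

# Illustrative sizes (assumptions, not taken from the source).
num_tokens, hidden_size, intermediate_size = 4, 1024, 2048
num_experts, top_k = 8, 2
device = "cuda"

hidden_states = torch.randn(
    num_tokens, hidden_size, dtype=torch.bfloat16, device=device
)
# Router scores per (token, expert); float32 is an assumed dtype.
routing_logits = torch.randn(
    num_tokens, num_experts, dtype=torch.float32, device=device
)
# gemm1 is assumed to fuse the gate and up projections, hence the
# leading 2 * intermediate_size dimension.
gemm1_weights = torch.randn(
    num_experts, 2 * intermediate_size, hidden_size,
    dtype=torch.bfloat16, device=device,
)
gemm2_weights = torch.randn(
    num_experts, hidden_size, intermediate_size,
    dtype=torch.bfloat16, device=device,
)

out = flashinfer_fused_moe_bf16(
    routing_logits=routing_logits,
    routing_bias=None,
    hidden_states=hidden_states,
    gemm1_weights=gemm1_weights,
    gemm2_weights=gemm2_weights,
    num_experts=num_experts,
    top_k=top_k,
    n_group=None,  # no grouped routing for plain renormalize
    topk_group=None,
    intermediate_size=intermediate_size,
    local_expert_offset=0,          # no expert parallelism:
    local_num_experts=num_experts,  # every expert lives on this rank
)
assert out.shape == hidden_states.shape
```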
flashinfer_fused_moe_bf16_fake

flashinfer_fused_moe_bf16_fake(
    routing_logits: Tensor,
    routing_bias: Tensor | None,
    hidden_states: Tensor,
    gemm1_weights: Tensor,
    gemm2_weights: Tensor,
    num_experts: int,
    top_k: int,
    n_group: int | None,
    topk_group: int | None,
    intermediate_size: int,
    local_expert_offset: int,
    local_num_experts: int,
    routing_method_type: int = RoutingMethodType.Renormalize,
    tune_max_num_tokens: int = 8192,
) -> Tensor
Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
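The `_fake` variants are shape-only stand-ins registered so that torch.compile and FakeTensor tracing can capture the custom op without launching the kernel. A plausible sketch of the contract, assuming the real op returns a tensor matching `hidden_states`; the same convention applies to the other `_fake` ops on this page.

```python
import torch

# Shape-only stand-in, mirroring what a fake op typically computes.
# Assumption: the real kernel's output matches hidden_states in
# shape, dtype, and device.
def fused_moe_bf16_fake_sketch(
    routing_logits: torch.Tensor,
    routing_bias: torch.Tensor | None,
    hidden_states: torch.Tensor,
    *args,
    **kwargs,
) -> torch.Tensor:
    return torch.empty_like(hidden_states)
```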
flashinfer_fused_moe_blockscale_fp8

flashinfer_fused_moe_blockscale_fp8(
    routing_logits: Tensor,
    routing_bias: Tensor,
    x: Tensor,
    w13_weight: Tensor,
    w13_weight_scale_inv: Tensor,
    w2_weight: Tensor,
    w2_weight_scale_inv: Tensor,
    global_num_experts: int,
    top_k: int,
    num_expert_group: int | None,
    topk_group: int | None,
    intermediate_size: int,
    expert_offset: int,
    local_num_experts: int,
    block_shape: list[int],
    routing_method_type: int = RoutingMethodType.DeepSeekV3,
    routed_scaling: float | None = 1.0,
) -> Tensor
Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
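A sketch of calling the block-scale FP8 op with DeepSeek-V3-style grouped routing (the default routing_method_type). The 128x128 block shape, the scale-tensor layouts, and all sizes are assumptions for illustration; real weights and scales come from checkpoint quantization.

```python
import torch

from vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe import (
    flashinfer_fused_moe_blockscale_fp8,
)

# Illustrative sizes (assumptions).
num_tokens, hidden_size, intermediate_size = 4, 1024, 2048
num_experts, top_k = 16, 4
num_expert_group, topk_group = 2, 1
block = 128  # assumed DeepSeek-V3-style 128x128 weight blocks
device = "cuda"

x = torch.randn(num_tokens, hidden_size, dtype=torch.bfloat16, device=device)
routing_logits = torch.randn(
    num_tokens, num_experts, dtype=torch.float32, device=device
)
# DeepSeek-V3-style routing uses a per-expert score-correction bias.
routing_bias = torch.zeros(num_experts, dtype=torch.bfloat16, device=device)

# FP8 weights with one float32 scale per 128x128 block; the all-ones
# scales are placeholders.
w13_weight = torch.randn(
    num_experts, 2 * intermediate_size, hidden_size, device=device
).to(torch.float8_e4m3fn)
w13_weight_scale_inv = torch.ones(
    num_experts, 2 * intermediate_size // block, hidden_size // block,
    dtype=torch.float32, device=device,
)
w2_weight = torch.randn(
    num_experts, hidden_size, intermediate_size, device=device
).to(torch.float8_e4m3fn)
w2_weight_scale_inv = torch.ones(
    num_experts, hidden_size // block, intermediate_size // block,
    dtype=torch.float32, device=device,
)

out = flashinfer_fused_moe_blockscale_fp8(
    routing_logits=routing_logits,
    routing_bias=routing_bias,
    x=x,
    w13_weight=w13_weight,
    w13_weight_scale_inv=w13_weight_scale_inv,
    w2_weight=w2_weight,
    w2_weight_scale_inv=w2_weight_scale_inv,
    global_num_experts=num_experts,
    top_k=top_k,
    num_expert_group=num_expert_group,
    topk_group=topk_group,
    intermediate_size=intermediate_size,
    expert_offset=0,                # no expert parallelism in this sketch
    local_num_experts=num_experts,
    block_shape=[block, block],
)
```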
flashinfer_fused_moe_blockscale_fp8_fake

flashinfer_fused_moe_blockscale_fp8_fake(
    routing_logits: Tensor,
    routing_bias: Tensor,
    x: Tensor,
    w13_weight: Tensor,
    w13_weight_scale_inv: Tensor,
    w2_weight: Tensor,
    w2_weight_scale_inv: Tensor,
    global_num_experts: int,
    top_k: int,
    num_expert_group: int,
    topk_group: int,
    intermediate_size: int,
    expert_offset: int,
    local_num_experts: int,
    block_shape: list[int],
    routing_method_type: int,
    routed_scaling: float = 1.0,
) -> Tensor
Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
flashinfer_fused_moe_per_tensor_scale_fp8

flashinfer_fused_moe_per_tensor_scale_fp8(
    routing_logits: Tensor,
    routing_bias: Tensor | None,
    hidden_states: Tensor,
    input_scale: Tensor,
    gemm1_weights: Tensor,
    gemm2_weights: Tensor,
    output1_scales_scalar: Tensor,
    output1_scales_gate_scalar: Tensor,
    output2_scales_scalar: Tensor,
    num_experts: int,
    top_k: int,
    num_expert_group: int | None,
    topk_group: int | None,
    intermediate_size: int,
    local_expert_offset: int,
    local_num_experts: int,
    use_routing_scales_on_input: bool,
    routing_method_type: int,
    routed_scaling_factor: float = 1.0,
) -> Tensor
Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
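A sketch of the per-tensor FP8 path, configured in the spirit of a Llama4-style model (top_k=1 with use_routing_scales_on_input=True). The scale shapes, the bf16 activation dtype, and the RoutingMethodType import path are assumptions.

```python
import torch

# Import path for the routing enum is an assumption.
from flashinfer import RoutingMethodType
from vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe import (
    flashinfer_fused_moe_per_tensor_scale_fp8,
)

# Illustrative sizes (assumptions).
num_tokens, hidden_size, intermediate_size = 4, 1024, 2048
num_experts, top_k = 8, 1
device = "cuda"

hidden_states = torch.randn(
    num_tokens, hidden_size, dtype=torch.bfloat16, device=device
)
routing_logits = torch.randn(
    num_tokens, num_experts, dtype=torch.float32, device=device
)

# Per-tensor quantization: one scalar activation scale plus per-expert
# output scales. The all-ones values are placeholders; real scales come
# from checkpoint quantization.
input_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
output1_scales_scalar = torch.ones(num_experts, dtype=torch.float32, device=device)
output1_scales_gate_scalar = torch.ones(num_experts, dtype=torch.float32, device=device)
output2_scales_scalar = torch.ones(num_experts, dtype=torch.float32, device=device)

gemm1_weights = torch.randn(
    num_experts, 2 * intermediate_size, hidden_size, device=device
).to(torch.float8_e4m3fn)
gemm2_weights = torch.randn(
    num_experts, hidden_size, intermediate_size, device=device
).to(torch.float8_e4m3fn)

out = flashinfer_fused_moe_per_tensor_scale_fp8(
    routing_logits=routing_logits,
    routing_bias=None,
    hidden_states=hidden_states,
    input_scale=input_scale,
    gemm1_weights=gemm1_weights,
    gemm2_weights=gemm2_weights,
    output1_scales_scalar=output1_scales_scalar,
    output1_scales_gate_scalar=output1_scales_gate_scalar,
    output2_scales_scalar=output2_scales_scalar,
    num_experts=num_experts,
    top_k=top_k,
    num_expert_group=None,
    topk_group=None,
    intermediate_size=intermediate_size,
    local_expert_offset=0,
    local_num_experts=num_experts,
    use_routing_scales_on_input=True,  # Llama4-style: scale inputs by router weight
    routing_method_type=RoutingMethodType.Llama4.value,
)
```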
flashinfer_fused_moe_per_tensor_scale_fp8_fake

flashinfer_fused_moe_per_tensor_scale_fp8_fake(
    routing_logits: Tensor,
    routing_bias: Tensor | None,
    hidden_states: Tensor,
    input_scale: Tensor,
    gemm1_weights: Tensor,
    gemm2_weights: Tensor,
    output1_scales_scalar: Tensor,
    output1_scales_gate_scalar: Tensor,
    output2_scales_scalar: Tensor,
    num_experts: int,
    top_k: int,
    num_expert_group: int | None,
    topk_group: int | None,
    intermediate_size: int,
    local_expert_offset: int,
    local_num_experts: int,
    use_routing_scales_on_input: bool,
    routing_method_type: int,
    routed_scaling_factor: float = 1.0,
) -> Tensor
Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py