# Community-curated GPU kernel index
| File | Algorithm | Language | Source | Hardware |
|---|---|---|---|---|
| 10-block-scaled-matmul.py | gemm_mxfp4 | Python | triton | |
| 72a_blackwell_nvfp4_bf16_gemm.cu | gemm_nvfp4 | CUDA | cutlass | |
| aiter_mla_hk_mi3xx_v32_fwd_decode_h128_fp8_fp8.cuh | attention_mla_decode | HIP | aiter | MI300X |
| asm_mla_decode_fwd.cpp | attention_mla_decode | HIP | aiter | |
| fp4_utils.py | quantization_mxfp4 | Python | aiter | |
| fused_moe.py | moe_fused | Python | vllm | |
| fused_moe.py | moe_fused | Python | sglang | |
| fused_moe.py | moe_fused | Python | aiter | |
| fused_mxfp4_quant.py | quantization_mxfp4 | Python | aiter | |
| gemm_afp4wfp4.py | gemm_mxfp4 | Python | aiter | |
| gemm_op_a4w4.py | gemm_mxfp4 | Python | aiter | |
| hk_decode_fwd.cu | attention_mla_decode | CUDA | aiter | |
| kernel.cuh | attention_mla_decode | CUDA | FlashMLA | |
| mfma_preshuffle_pipeline.py | gemm_mxfp4 | Python | aiter | |
| mla.cuh | attention_mla_decode | CUDA | flashinfer | |
| mla_decode_rope.py | attention_mla_decode | Python | aiter | |
| mla_sm120.cu | attention_mla_decode | CUDA | flashinfer | |
| mmq.cuh | gemm_quantized | CUDA | llama.cpp | |
| moe_op_gemm_a4w4.py | moe_gemm | Python | aiter | |
| moe_op_mxfp4.py | moe_fused | Python | aiter | |
| moe_op_mxfp4_silu_fused.py | moe_fused | Python | aiter | |
| moe_op.py | moe_fused | Python | aiter | |
| moe.py | moe_distributed | Python | megablocks | |
| mx_ops.py | quantization_mx | Python | microxcaling | |
| pa_decode.py | attention_paged | Python | aiter | |
| splitkv_mla.cuh | attention_mla_decode | CUDA | FlashMLA | |
| unified_attention_sparse_mla.py | attention_mla_sparse | Python | aiter | |
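
Several entries above fall under the `quantization_mxfp4` category (e.g. `fp4_utils.py`, `fused_mxfp4_quant.py`). As a rough orientation for readers, here is a minimal pure-Python sketch of MXFP4 block-scaled quantization per the OCP Microscaling format: blocks of up to 32 values share one power-of-two scale, and each element is rounded to an FP4 E2M1 value. The helper names are illustrative only and do not come from any listed file; the listed kernels are heavily optimized and handle packing, rounding modes, and layouts this sketch omits.

```python
import math

# FP4 E2M1 representable magnitudes (OCP Microscaling spec); max is 6.0
FP4_E2M1 = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]

def quantize_mxfp4_block(block):
    """Quantize one block of <=32 floats to MXFP4:
    a shared power-of-two scale plus FP4 E2M1 elements."""
    amax = max(abs(x) for x in block)
    if amax == 0.0:
        return 1.0, [0.0] * len(block)
    # smallest power-of-two scale so every |x| / scale fits in [0, 6]
    scale = 2.0 ** math.ceil(math.log2(amax / 6.0))
    quantized = []
    for x in block:
        mag = min(abs(x) / scale, 6.0)
        # round to the nearest representable E2M1 magnitude
        nearest = min(FP4_E2M1, key=lambda v: abs(v - mag))
        quantized.append(math.copysign(nearest, x))
    return scale, quantized

def dequantize_mxfp4_block(scale, quantized):
    """Recover approximate values from scale + FP4 codes."""
    return [scale * v for v in quantized]
```

The real kernels additionally pack two FP4 values per byte and store the scale as an 8-bit exponent (E8M0); this sketch only shows the numerics of the shared-scale rounding step.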