Skip to content

Commit c0bfc57

Browse files
authored
CUDA: mul_mat_id for mmf for bs <= 64 for f16 and bs <= 32 for f32 (#16277)
* CUDA: mul_mat_id for mmf for bs <= 64 for f16 and bs <= 32 for f32 This commit adds mul_mat_id support for ncols_dst >= 16. It does this by packing ncols_dst tiles into the blockDim.y. My tests on a RTX 3090 show that this is faster than the cuBLAS fallback for f16 till bs=64, and for f32 till bs=32 * Review: refactor if statement
1 parent 75a3a6c commit c0bfc57

File tree

4 files changed

+92
-47
lines changed

4 files changed

+92
-47
lines changed

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2031,15 +2031,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
20312031
const int cc = ggml_cuda_info().devices[id].cc;
20322032
const int warp_size = ggml_cuda_info().devices[id].warp_size;
20332033
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
2034-
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
2034+
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
20352035
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
20362036
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
20372037
}
20382038
} else {
20392039
const int cc = ggml_cuda_info().devices[ctx.device].cc;
20402040
const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
20412041
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
2042-
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
2042+
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
20432043
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
20442044
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
20452045
}
@@ -2111,7 +2111,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
21112111
return;
21122112
}
21132113

2114-
if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2])) {
2114+
if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2], /*mul_mat_id=*/true)) {
21152115
ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
21162116
return;
21172117
}

ggml/src/ggml-cuda/mmf.cu

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
8484
}
8585
}
8686

87-
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols) {
87+
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols, bool mul_mat_id) {
8888

8989
if (ggml_is_quantized(type)) {
9090
return false;
@@ -96,8 +96,18 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
9696
if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
9797
return false;
9898
}
99-
if (src1_ncols > 16) {
100-
return false;
99+
100+
if (mul_mat_id) {
101+
if (type == GGML_TYPE_F32 && src1_ncols > 32) {
102+
return false;
103+
}
104+
if ((type == GGML_TYPE_F16 || type == GGML_TYPE_BF16) && src1_ncols > 64) {
105+
return false;
106+
}
107+
} else {
108+
if (src1_ncols > 16) {
109+
return false;
110+
}
101111
}
102112

103113
switch (type) {

0 commit comments

Comments
 (0)