@@ -1285,7 +1285,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
1285
1285
code << tab << " // s_G_[in,out]_i: Gradient matrix, shared memory\n " ;
1286
1286
code << tab << " // -----------------------------------------------------------------------------\n " ;
1287
1287
code << tab << " extern \" C\" __global__ void " << operator_name
1288
- << " (CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
1288
+ << " (CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda "
1289
1289
" points) {\n " ;
1290
1290
tab.push ();
1291
1291
@@ -1295,11 +1295,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
1295
1295
1296
1296
CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_input_fields[i], &eval_mode));
1297
1297
if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
1298
- code << tab << " const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << " ];\n " ;
1298
+ code << tab << " const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << " ];\n " ;
1299
1299
}
1300
1300
}
1301
1301
for (CeedInt i = 0 ; i < num_output_fields; i++) {
1302
- code << tab << " CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << " ];\n " ;
1302
+ code << tab << " CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << " ];\n " ;
1303
1303
}
1304
1304
1305
1305
code << tab << " const CeedInt max_dim = " << max_dim << " ;\n " ;
@@ -1574,9 +1574,18 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
1574
1574
{
1575
1575
bool is_compile_good = false ;
1576
1576
const CeedInt T_1d = CeedIntMax (is_all_tensor ? Q_1d : Q, data->max_P_1d );
1577
+ bool use_mixed_precision;
1578
+
1579
+ // Check for mixed precision
1580
+ CeedCallBackend (CeedOperatorGetMixedPrecision (op, &use_mixed_precision));
1577
1581
1578
1582
data->thread_1d = T_1d;
1579
- CeedCallBackend (CeedTryCompile_Cuda (ceed, code.str ().c_str (), &is_compile_good, &data->module , 1 , " OP_T_1D" , T_1d));
1583
+ if (use_mixed_precision) {
1584
+ CeedCallBackend (
1585
+ CeedTryCompile_Cuda (ceed, code.str ().c_str (), &is_compile_good, &data->module , 2 , " OP_T_1D" , T_1d, " CEED_JIT_MIXED_PRECISION" , 1 ));
1586
+ } else {
1587
+ CeedCallBackend (CeedTryCompile_Cuda (ceed, code.str ().c_str (), &is_compile_good, &data->module , 1 , " OP_T_1D" , T_1d));
1588
+ }
1580
1589
if (is_compile_good) {
1581
1590
*is_good_build = true ;
1582
1591
CeedCallBackend (CeedGetKernel_Cuda (ceed, data->module , operator_name.c_str (), &data->op ));
@@ -1689,8 +1698,8 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
1689
1698
code << tab << " // s_G_[in,out]_i: Gradient matrix, shared memory\n " ;
1690
1699
code << tab << " // -----------------------------------------------------------------------------\n " ;
1691
1700
code << tab << " extern \" C\" __global__ void " << operator_name
1692
- << " (CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
1693
- " points, CeedScalar *__restrict__ values_array) {\n " ;
1701
+ << " (CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda "
1702
+ " points, CeedScalarCPU *__restrict__ values_array) {\n " ;
1694
1703
tab.push ();
1695
1704
1696
1705
// Scratch buffers
@@ -1699,11 +1708,11 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
1699
1708
1700
1709
CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_input_fields[i], &eval_mode));
1701
1710
if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
1702
- code << tab << " const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << " ];\n " ;
1711
+ code << tab << " const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << " ];\n " ;
1703
1712
}
1704
1713
}
1705
1714
for (CeedInt i = 0 ; i < num_output_fields; i++) {
1706
- code << tab << " CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << " ];\n " ;
1715
+ code << tab << " CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << " ];\n " ;
1707
1716
}
1708
1717
1709
1718
code << tab << " const CeedInt max_dim = " << max_dim << " ;\n " ;
@@ -2045,10 +2054,20 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
2045
2054
{
2046
2055
bool is_compile_good = false ;
2047
2056
const CeedInt T_1d = CeedIntMax (is_all_tensor ? Q_1d : Q, data->max_P_1d );
2057
+ bool use_mixed_precision;
2058
+
2059
+ // Check for mixed precision
2060
+ CeedCallBackend (CeedOperatorGetMixedPrecision (op, &use_mixed_precision));
2048
2061
2049
2062
data->thread_1d = T_1d;
2050
- CeedCallBackend (CeedTryCompile_Cuda (ceed, code.str ().c_str (), &is_compile_good,
2051
- is_full ? &data->module_assemble_full : &data->module_assemble_diagonal , 1 , " OP_T_1D" , T_1d));
2063
+ if (use_mixed_precision) {
2064
+ CeedCallBackend (CeedTryCompile_Cuda (ceed, code.str ().c_str (), &is_compile_good,
2065
+ is_full ? &data->module_assemble_full : &data->module_assemble_diagonal , 2 , " OP_T_1D" , T_1d,
2066
+ " CEED_JIT_MIXED_PRECISION" , 1 ));
2067
+ } else {
2068
+ CeedCallBackend (CeedTryCompile_Cuda (ceed, code.str ().c_str (), &is_compile_good,
2069
+ is_full ? &data->module_assemble_full : &data->module_assemble_diagonal , 1 , " OP_T_1D" , T_1d));
2070
+ }
2052
2071
if (is_compile_good) {
2053
2072
*is_good_build = true ;
2054
2073
CeedCallBackend (CeedGetKernel_Cuda (ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal , operator_name.c_str (),
@@ -2221,8 +2240,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
2221
2240
code << tab << " // s_G_[in,out]_i: Gradient matrix, shared memory\n " ;
2222
2241
code << tab << " // -----------------------------------------------------------------------------\n " ;
2223
2242
code << tab << " extern \" C\" __global__ void " << operator_name
2224
- << " (CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
2225
- " points, CeedScalar *__restrict__ values_array) {\n " ;
2243
+ << " (CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda "
2244
+ " points, CeedScalarCPU *__restrict__ values_array) {\n " ;
2226
2245
tab.push ();
2227
2246
2228
2247
// Scratch buffers
@@ -2231,11 +2250,11 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
2231
2250
2232
2251
CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_input_fields[i], &eval_mode));
2233
2252
if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
2234
- code << tab << " const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << " ];\n " ;
2253
+ code << tab << " const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << " ];\n " ;
2235
2254
}
2236
2255
}
2237
2256
for (CeedInt i = 0 ; i < num_output_fields; i++) {
2238
- code << tab << " CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << " ];\n " ;
2257
+ code << tab << " CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << " ];\n " ;
2239
2258
}
2240
2259
2241
2260
code << tab << " const CeedInt max_dim = " << max_dim << " ;\n " ;
@@ -2485,8 +2504,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
2485
2504
CeedCallBackend (CeedQFunctionFieldGetSize (qf_input_fields[f], &field_size));
2486
2505
CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_input_fields[f], &eval_mode));
2487
2506
if (eval_mode == CEED_EVAL_GRAD) {
2488
- code << tab << " CeedScalar r_q_in_" << f << " [num_comp_in_" << f << " *" << " dim_in_ " << f << " * "
2489
- << (is_all_tensor && (max_dim >= 3 ) ? " Q_1d" : " 1" ) << " ] = {0.};\n " ;
2507
+ code << tab << " CeedScalar r_q_in_" << f << " [num_comp_in_" << f << " *"
2508
+ << " dim_in_ " << f << " * " << (is_all_tensor && (max_dim >= 3 ) ? " Q_1d" : " 1" ) << " ] = {0.};\n " ;
2490
2509
} else {
2491
2510
code << tab << " CeedScalar r_q_in_" << f << " [num_comp_in_" << f << " *" << (is_all_tensor && (max_dim >= 3 ) ? " Q_1d" : " 1" ) << " ] = {0.};\n " ;
2492
2511
}
@@ -2625,9 +2644,18 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
2625
2644
{
2626
2645
bool is_compile_good = false ;
2627
2646
const CeedInt T_1d = CeedIntMax (is_all_tensor ? Q_1d : Q, data->max_P_1d );
2647
+ bool use_mixed_precision;
2648
+
2649
+ // Check for mixed precision
2650
+ CeedCallBackend (CeedOperatorGetMixedPrecision (op, &use_mixed_precision));
2628
2651
2629
2652
data->thread_1d = T_1d;
2630
- CeedCallBackend (CeedTryCompile_Cuda (ceed, code.str ().c_str (), &is_compile_good, &data->module_assemble_qfunction , 1 , " OP_T_1D" , T_1d));
2653
+ if (use_mixed_precision) {
2654
+ CeedCallBackend (CeedTryCompile_Cuda (ceed, code.str ().c_str (), &is_compile_good, &data->module_assemble_qfunction , 2 , " OP_T_1D" , T_1d,
2655
+ " CEED_JIT_MIXED_PRECISION" , 1 ));
2656
+ } else {
2657
+ CeedCallBackend (CeedTryCompile_Cuda (ceed, code.str ().c_str (), &is_compile_good, &data->module_assemble_qfunction , 1 , " OP_T_1D" , T_1d));
2658
+ }
2631
2659
if (is_compile_good) {
2632
2660
*is_good_build = true ;
2633
2661
CeedCallBackend (CeedGetKernel_Cuda (ceed, data->module_assemble_qfunction , operator_name.c_str (), &data->assemble_qfunction ));
0 commit comments