Skip to content

gen - add support for mixed precision operators #1853

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 51 additions & 23 deletions backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1285,7 +1285,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
code << tab << "// -----------------------------------------------------------------------------\n";
code << tab << "extern \"C\" __global__ void " << operator_name
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda "
"points) {\n";
tab.push();

Expand All @@ -1295,11 +1295,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b

CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
code << tab << "const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
}
}
for (CeedInt i = 0; i < num_output_fields; i++) {
code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
code << tab << "CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
}

code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
Expand Down Expand Up @@ -1572,11 +1572,20 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b

// Compile
{
bool is_compile_good = false;
const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
bool is_compile_good = false;
const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
CeedScalarType precision;

// Check for mixed precision
CeedCallBackend(CeedOperatorGetPrecision(op, &precision));

data->thread_1d = T_1d;
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d));
if (precision) {
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", T_1d, "CEED_JIT_PRECISION",
(CeedInt)precision));
} else {
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d));
}
if (is_compile_good) {
*is_good_build = true;
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
Expand Down Expand Up @@ -1689,8 +1698,8 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
code << tab << "// -----------------------------------------------------------------------------\n";
code << tab << "extern \"C\" __global__ void " << operator_name
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
"points, CeedScalar *__restrict__ values_array) {\n";
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda "
"points, CeedScalarCPU *__restrict__ values_array) {\n";
tab.push();

// Scratch buffers
Expand All @@ -1699,11 +1708,11 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo

CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
code << tab << "const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
}
}
for (CeedInt i = 0; i < num_output_fields; i++) {
code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
code << tab << "CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
}

code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
Expand Down Expand Up @@ -2043,12 +2052,22 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo

// Compile
{
bool is_compile_good = false;
const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
bool is_compile_good = false;
const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
CeedScalarType precision;

// Check for mixed precision
CeedCallBackend(CeedOperatorGetPrecision(op, &precision));

data->thread_1d = T_1d;
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good,
is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d));
if (precision) {
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good,
is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", T_1d,
"CEED_JIT_PRECISION", (CeedInt)precision));
} else {
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good,
is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d));
}
if (is_compile_good) {
*is_good_build = true;
CeedCallBackend(CeedGetKernel_Cuda(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
Expand Down Expand Up @@ -2221,8 +2240,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
code << tab << "// -----------------------------------------------------------------------------\n";
code << tab << "extern \"C\" __global__ void " << operator_name
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
"points, CeedScalar *__restrict__ values_array) {\n";
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda "
"points, CeedScalarCPU *__restrict__ values_array) {\n";
tab.push();

// Scratch buffers
Expand All @@ -2231,11 +2250,11 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera

CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
code << tab << "const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
}
}
for (CeedInt i = 0; i < num_output_fields; i++) {
code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
code << tab << "CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
}

code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
Expand Down Expand Up @@ -2485,8 +2504,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
if (eval_mode == CEED_EVAL_GRAD) {
code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
<< (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*"
<< "dim_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
} else {
code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
}
Expand Down Expand Up @@ -2623,11 +2642,20 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera

// Compile
{
bool is_compile_good = false;
const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
bool is_compile_good = false;
const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
CeedScalarType precision;

// Check for mixed precision
CeedCallBackend(CeedOperatorGetPrecision(op, &precision));

data->thread_1d = T_1d;
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d));
if (precision) {
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", T_1d,
"CEED_JIT_PRECISION", (CeedInt)precision));
} else {
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d));
}
if (is_compile_good) {
*is_good_build = true;
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
Expand Down
1 change: 1 addition & 0 deletions backends/cuda-gen/ceed-cuda-gen.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
CeedCallBackend(CeedCalloc(1, &data));
CeedCallBackend(CeedSetData(ceed, data));
CeedCallBackend(CeedInit_Cuda(ceed, resource));
CeedCallBackend(CeedSetSupportsMixedPrecision(ceed, true));

CeedCallBackend(CeedInit("/gpu/cuda/shared", &ceed_shared));
CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
Expand Down
4 changes: 2 additions & 2 deletions backends/hip-gen/ceed-hip-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2483,8 +2483,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat
CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
if (eval_mode == CEED_EVAL_GRAD) {
code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
<< (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*"
<< "dim_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
} else {
code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
}
Expand Down
2 changes: 2 additions & 0 deletions include/ceed-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ struct Ceed_private {
bool is_debug;
bool has_valid_op_fallback_resource;
bool is_deterministic;
bool supports_mixed_precision;
char err_msg[CEED_MAX_RESOURCE_LEN];
FOffset *f_offsets;
CeedWorkVectors work_vectors;
Expand Down Expand Up @@ -380,6 +381,7 @@ struct CeedOperator_private {
bool is_composite;
bool is_at_points;
bool has_restriction;
CeedScalarType precision;
CeedQFunctionAssemblyData qf_assembled;
CeedOperatorAssemblyData op_assembled;
CeedOperator *sub_operators;
Expand Down
1 change: 1 addition & 0 deletions include/ceed/backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ CEED_EXTERN int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource
CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed);
CEED_EXTERN int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource);
CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic);
CEED_INTERN int CeedSetSupportsMixedPrecision(Ceed ceed, bool supports_mixed_precision);
CEED_EXTERN int CeedSetBackendFunctionImpl(Ceed ceed, const char *type, void *object, const char *func_name, void (*f)(void));
CEED_EXTERN int CeedGetData(Ceed ceed, void *data);
CEED_EXTERN int CeedSetData(Ceed ceed, void *data);
Expand Down
19 changes: 17 additions & 2 deletions include/ceed/ceed-f32.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,26 @@
/// Include this header in ceed.h to use float instead of double.
#pragma once

#ifndef CEED_RUNNING_JIT_PASS
#include <float.h>
#endif

#define CEED_SCALAR_IS_FP32

/// Set base scalar type to FP32. (See CeedScalarType enum in ceed.h for all options.)
#define CEED_SCALAR_TYPE CEED_SCALAR_FP32
typedef float CeedScalar;
#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE)
#if CEED_JIT_PRECISION == CEED_SCALAR_FP64
typedef double CeedScalar;
typedef float CeedScalarCPU;

/// Machine epsilon
static const CeedScalar CEED_EPSILON = DBL_EPSILON;
#endif // CEED_JIT_PRECISION
#else
typedef float CeedScalar;
typedef CeedScalar CeedScalarCPU;

/// Machine epsilon
#define CEED_EPSILON 6e-08
static const CeedScalar CEED_EPSILON = FLT_EPSILON;
#endif
19 changes: 17 additions & 2 deletions include/ceed/ceed-f64.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,26 @@
/// This is the default header included in ceed.h.
#pragma once

#ifndef CEED_RUNNING_JIT_PASS
#include <float.h>
#endif

#define CEED_SCALAR_IS_FP64

/// Set base scalar type to FP64. (See CeedScalarType enum in ceed.h for all options.)
#define CEED_SCALAR_TYPE CEED_SCALAR_FP64
typedef double CeedScalar;
#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE)
#if CEED_JIT_PRECISION == CEED_SCALAR_FP32
typedef float CeedScalar;
typedef double CeedScalarCPU;

/// Machine epsilon
static const CeedScalar CEED_EPSILON = FLT_EPSILON;
#endif // CEED_JIT_PRECISION
#else
typedef double CeedScalar;
typedef CeedScalar CeedScalarCPU;

/// Machine epsilon
#define CEED_EPSILON 1e-16
static const CeedScalar CEED_EPSILON = DBL_EPSILON;
#endif // CEED_RUNNING_JIT_PASS && CEED_JIT_PRECISION
3 changes: 3 additions & 0 deletions include/ceed/ceed.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ CEED_EXTERN int CeedSetStream(Ceed ceed, void *handle);
CEED_EXTERN int CeedReferenceCopy(Ceed ceed, Ceed *ceed_copy);
CEED_EXTERN int CeedGetResource(Ceed ceed, const char **resource);
CEED_EXTERN int CeedIsDeterministic(Ceed ceed, bool *is_deterministic);
CEED_EXTERN int CeedGetSupportsMixedPrecision(Ceed ceed, bool *supports_mixed_precision);
CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root);
CEED_EXTERN int CeedAddJitDefine(Ceed ceed, const char *jit_define);
CEED_EXTERN int CeedView(Ceed ceed, FILE *stream);
Expand Down Expand Up @@ -426,6 +427,8 @@ CEED_EXTERN int CeedOperatorCheckReady(CeedOperator op);
CEED_EXTERN int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size);
CEED_EXTERN int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data);
CEED_EXTERN int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs_data_update);
CEED_EXTERN int CeedOperatorSetPrecision(CeedOperator op, CeedScalarType precision);
CEED_EXTERN int CeedOperatorGetPrecision(CeedOperator op, CeedScalarType *precision);
CEED_EXTERN int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request);
CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
CeedRequest *request);
Expand Down
Loading