From 4b254d5a5e68784e43d89bda8df363baa8a18280 Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Thu, 10 Jul 2025 17:08:05 -0600 Subject: [PATCH 1/5] gen - add support for mixed precision CUDA operators --- .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 62 ++-- backends/cuda-gen/ceed-cuda-gen.c | 1 + include/ceed-impl.h | 2 + include/ceed/backend.h | 1 + include/ceed/ceed-f32.h | 5 +- include/ceed/ceed-f64.h | 13 +- include/ceed/ceed.h | 3 + .../ceed/jit-source/cuda/cuda-gen-templates.h | 143 +++++---- .../cuda-shared-basis-nontensor-templates.h | 33 +- .../cuda/cuda-shared-basis-nontensor.h | 24 +- .../cuda-shared-basis-read-write-templates.h | 50 ++-- ...-shared-basis-tensor-at-points-templates.h | 72 ++--- .../cuda/cuda-shared-basis-tensor-at-points.h | 34 ++- ...-shared-basis-tensor-flattened-templates.h | 162 +++++----- .../cuda/cuda-shared-basis-tensor-templates.h | 152 +++++----- .../cuda/cuda-shared-basis-tensor.h | 25 +- include/ceed/jit-source/cuda/cuda-types.h | 12 +- interface/ceed-operator.c | 45 ++- interface/ceed.c | 30 ++ tests/t502-operator-mixed.c | 125 ++++++++ tests/t503-operator-mixed.c | 114 +++++++ tests/t505-operator-mixed.c | 128 ++++++++ tests/t506-operator-mixed.c | 173 +++++++++++ tests/t510-operator-mixed.c | 133 +++++++++ tests/t520-operator-mixed.c | 234 +++++++++++++++ tests/t522-operator-mixed.c | 221 ++++++++++++++ tests/t591-operator-mixed.c | 197 ++++++++++++ tests/t592-operator-mixed.c | 249 ++++++++++++++++ tests/t593-operator-mixed.c | 155 ++++++++++ tests/t594-operator-mixed.c | 182 +++++++++++ tests/t596-operator-mixed.c | 205 +++++++++++++ tests/t597-operator-mixed.c | 206 +++++++++++++ tests/t598-operator-mixed.c | 282 ++++++++++++++++++ 33 files changed, 3094 insertions(+), 379 deletions(-) create mode 100644 tests/t502-operator-mixed.c create mode 100644 tests/t503-operator-mixed.c create mode 100644 tests/t505-operator-mixed.c create mode 100644 tests/t506-operator-mixed.c create mode 100644 tests/t510-operator-mixed.c create mode 100644 tests/t520-operator-mixed.c create mode 100644 tests/t522-operator-mixed.c create mode 100644 tests/t591-operator-mixed.c create mode 100644 tests/t592-operator-mixed.c create mode 100644 tests/t593-operator-mixed.c create mode 100644 tests/t594-operator-mixed.c create mode 100644 tests/t596-operator-mixed.c create mode 100644 tests/t597-operator-mixed.c create mode 100644 tests/t598-operator-mixed.c diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 481c358466..59fcf03d42 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -1285,7 +1285,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; code << tab << "// -----------------------------------------------------------------------------\n"; code << tab << "extern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda " "points) {\n"; tab.push(); @@ -1295,11 +1295,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) 
{ // Skip CEED_EVAL_WEIGHT - code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + code << tab << "const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; } } for (CeedInt i = 0; i < num_output_fields; i++) { - code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + code << tab << "CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; } code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; @@ -1574,9 +1574,18 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b { bool is_compile_good = false; const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + bool use_mixed_precision; + + // Check for mixed precision + CeedCallBackend(CeedOperatorGetMixedPrecision(op, &use_mixed_precision)); data->thread_1d = T_1d; - CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d)); + if (use_mixed_precision) { + CeedCallBackend( + CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", T_1d, "CEED_JIT_MIXED_PRECISION", 1)); + } else { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d)); + } if (is_compile_good) { *is_good_build = true; CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); @@ -1689,8 +1698,8 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; code << tab << "// -----------------------------------------------------------------------------\n"; code << tab << "extern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " - "points, CeedScalar *__restrict__ values_array) {\n"; + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda " + "points, CeedScalarCPU *__restrict__ values_array) {\n"; tab.push(); // Scratch buffers @@ -1699,11 +1708,11 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT - code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + code << tab << "const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; } } for (CeedInt i = 0; i < num_output_fields; i++) { - code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + code << tab << "CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; } code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; @@ -2045,10 +2054,20 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo { bool is_compile_good = false; const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + bool use_mixed_precision; + + // Check for mixed precision + CeedCallBackend(CeedOperatorGetMixedPrecision(op, &use_mixed_precision)); data->thread_1d = T_1d; - CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, - is_full ? 
&data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d)); + if (use_mixed_precision) { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, + is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", T_1d, + "CEED_JIT_MIXED_PRECISION", 1)); + } else { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, + is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d)); + } if (is_compile_good) { *is_good_build = true; CeedCallBackend(CeedGetKernel_Cuda(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(), @@ -2221,8 +2240,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; code << tab << "// -----------------------------------------------------------------------------\n"; code << tab << "extern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " - "points, CeedScalar *__restrict__ values_array) {\n"; + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda " + "points, CeedScalarCPU *__restrict__ values_array) {\n"; tab.push(); // Scratch buffers @@ -2231,11 +2250,11 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT - code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + code << tab << "const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; } } for (CeedInt i = 0; i < num_output_fields; i++) { - code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + code << tab << "CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; } code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; @@ -2485,8 +2504,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { - code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" - << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; + code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" + << "dim_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } else { code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } @@ -2625,9 +2644,18 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera { bool is_compile_good = false; const CeedInt T_1d = CeedIntMax(is_all_tensor ? 
Q_1d : Q, data->max_P_1d); + bool use_mixed_precision; + + // Check for mixed precision + CeedCallBackend(CeedOperatorGetMixedPrecision(op, &use_mixed_precision)); data->thread_1d = T_1d; - CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d)); + if (use_mixed_precision) { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", T_1d, + "CEED_JIT_MIXED_PRECISION", 1)); + } else { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d)); + } if (is_compile_good) { *is_good_build = true; CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction)); diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c index f38b700225..ee5068e72c 100644 --- a/backends/cuda-gen/ceed-cuda-gen.c +++ b/backends/cuda-gen/ceed-cuda-gen.c @@ -29,6 +29,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) { CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Cuda(ceed, resource)); + CeedCallBackend(CeedSetSupportsMixedPrecision(ceed, true)); CeedCallBackend(CeedInit("/gpu/cuda/shared", &ceed_shared)); CeedCallBackend(CeedSetDelegate(ceed, ceed_shared)); diff --git a/include/ceed-impl.h b/include/ceed-impl.h index 95c920604d..e8f6976736 100644 --- a/include/ceed-impl.h +++ b/include/ceed-impl.h @@ -128,6 +128,7 @@ struct Ceed_private { bool is_debug; bool has_valid_op_fallback_resource; bool is_deterministic; + bool supports_mixed_precision; char err_msg[CEED_MAX_RESOURCE_LEN]; FOffset *f_offsets; CeedWorkVectors work_vectors; @@ -380,6 +381,7 @@ struct CeedOperator_private { bool is_composite; bool is_at_points; bool has_restriction; + bool use_mixed_precision; CeedQFunctionAssemblyData qf_assembled; CeedOperatorAssemblyData op_assembled; CeedOperator *sub_operators; diff --git a/include/ceed/backend.h b/include/ceed/backend.h index e6b608e571..d4bfa22f35 100644 --- a/include/ceed/backend.h +++ b/include/ceed/backend.h @@ -250,6 +250,7 @@ CEED_EXTERN int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed); CEED_EXTERN int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource); CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic); +CEED_INTERN int CeedSetSupportsMixedPrecision(Ceed ceed, bool supports_mixed_precision); CEED_EXTERN int CeedSetBackendFunctionImpl(Ceed ceed, const char *type, void *object, const char *func_name, void (*f)(void)); CEED_EXTERN int CeedGetData(Ceed ceed, void *data); CEED_EXTERN int CeedSetData(Ceed ceed, void *data); diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h index 0bce734257..39d2fb1187 100644 --- a/include/ceed/ceed-f32.h +++ b/include/ceed/ceed-f32.h @@ -14,7 +14,8 @@ /// Set base scalar type to FP32. (See CeedScalarType enum in ceed.h for all options.) 
#define CEED_SCALAR_TYPE CEED_SCALAR_FP32

-typedef float CeedScalar;
+typedef float      CeedScalar;
+typedef CeedScalar CeedScalarCPU;

 /// Machine epsilon
-#define CEED_EPSILON 6e-08
+#define CEED_EPSILON 0x1p-23
diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h
index b74d867c18..1e3a7fd7bf 100644
--- a/include/ceed/ceed-f64.h
+++ b/include/ceed/ceed-f64.h
@@ -14,7 +14,16 @@
 /// Set base scalar type to FP64. (See CeedScalarType enum in ceed.h for all options.)
 #define CEED_SCALAR_TYPE CEED_SCALAR_FP64

-typedef double CeedScalar;
+#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_MIXED_PRECISION)
+typedef float  CeedScalar;
+typedef double CeedScalarCPU;

 /// Machine epsilon
-#define CEED_EPSILON 1e-16
+#define CEED_EPSILON 0x1p-23
+#else
+typedef double     CeedScalar;
+typedef CeedScalar CeedScalarCPU;
+
+/// Machine epsilon
+#define CEED_EPSILON 0x1p-52
+#endif // CEED_RUNNING_JIT_PASS && CEED_JIT_MIXED_PRECISION
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index af510065eb..46d22b5dab 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -106,6 +106,7 @@ CEED_EXTERN int CeedSetStream(Ceed ceed, void *handle);
 CEED_EXTERN int CeedReferenceCopy(Ceed ceed, Ceed *ceed_copy);
 CEED_EXTERN int CeedGetResource(Ceed ceed, const char **resource);
 CEED_EXTERN int CeedIsDeterministic(Ceed ceed, bool *is_deterministic);
+CEED_EXTERN int CeedGetSupportsMixedPrecision(Ceed ceed, bool *supports_mixed_precision);
 CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root);
 CEED_EXTERN int CeedAddJitDefine(Ceed ceed, const char *jit_define);
 CEED_EXTERN int CeedView(Ceed ceed, FILE *stream);
@@ -426,6 +427,8 @@ CEED_EXTERN int CeedOperatorCheckReady(CeedOperator op);
 CEED_EXTERN int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size);
 CEED_EXTERN int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data);
 CEED_EXTERN int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs_data_update);
+CEED_EXTERN int CeedOperatorSetMixedPrecision(CeedOperator op);
+CEED_EXTERN int CeedOperatorGetMixedPrecision(CeedOperator op, bool *use_mixed_precision);
 CEED_EXTERN int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request);
 CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
                                                                  CeedRequest *request);
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index f4dccf54ea..d78e83eeab 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -12,8 +12,8 @@
 //------------------------------------------------------------------------------
 // Load matrices for basis actions
 //------------------------------------------------------------------------------
-template <int P, int Q>
-inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+template <int P, int Q, class ScalarIn, class ScalarOut>
+inline __device__ void LoadMatrix(SharedData_Cuda &data, const ScalarIn *__restrict__ d_B, ScalarOut *B) {
   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }

@@ -24,9 +24,9 @@
 //------------------------------------------------------------------------------
 // L-vector -> single point
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int COMP_STRIDE>
+template <int NUM_COMP, int NUM_PTS, int COMP_STRIDE, class ScalarIn, class ScalarOut>
 inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
-                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+                                 const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) {
   const CeedInt ind = indices[p + elem * NUM_PTS];

   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -37,9 +37,9 @@ inline __device__ void ReadPoint(SharedData_Cuda &data, cons
 //------------------------------------------------------------------------------
 // Single point -> L-vector
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int COMP_STRIDE>
+template <int NUM_COMP, int NUM_PTS, int COMP_STRIDE, class ScalarIn, class ScalarOut>
 inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
-                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
+                                  const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_u, ScalarOut *d_u) {
   if (p < points_in_elem) {
     const CeedInt ind = indices[p + elem * NUM_PTS];

@@ -56,8 +56,8 @@ inline __device__ void WritePoint(SharedData_Cuda &data, con
 //------------------------------------------------------------------------------
 // Set E-vector value
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
-inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
+inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const ScalarIn value, ScalarOut *__restrict__ r_v) {
   const CeedInt target_comp = n / P_1D;
   const CeedInt target_node = n % P_1D;

@@ -69,9 +69,9 @@ inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const Cee
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+                                          const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = indices[node + elem * P_1D];
@@ -83,9 +83,8 @@ inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt n
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
-                                         CeedScalar *__restrict__ r_u) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -97,9 +97,9 @@ inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt el
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                           const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = indices[node + elem * P_1D];
@@ -108,10 +107,10 @@
   }
 }

-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
-                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
-                                                  CeedScalar *__restrict__ d_v) {
+                                                  const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_v,
+                                                  ScalarOut *__restrict__ d_v) {
   const CeedInt target_comp = n / P_1D;
   const CeedInt target_node = n % P_1D;

@@ -125,9 +124,9 @@ inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const C
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, full assembly
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
-                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                                    const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   const CeedInt in_comp = in / P_1D;
   const CeedInt in_node = in % P_1D;
   const CeedInt e_vec_size = P_1D * NUM_COMP;
@@ -144,9 +143,9 @@ inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, Qfunction assembly
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D>
+template <int NUM_COMP, int Q_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
-                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                                      const CeedInt output_offset, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < Q_1D) {
     const CeedInt ind = data.t_id_x + elem * Q_1D;

@@ -159,9 +158,8 @@ inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, con
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
-                                          CeedScalar *__restrict__ d_v) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -177,8 +175,8 @@ inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt e
 //------------------------------------------------------------------------------
 // Set E-vector value
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
-inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
+inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const ScalarIn value, ScalarOut *__restrict__ r_v) {
   const CeedInt target_comp = n / (P_1D * P_1D);
   const CeedInt target_node_x = n % P_1D;
   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
@@ -191,9 +189,9 @@ inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const Cee
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+                                          const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = indices[node + elem * P_1D * P_1D];
@@ -205,9 +203,8 @@ inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt n
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
-                                         CeedScalar *__restrict__ r_u) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -219,9 +216,9 @@ inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt el
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                           const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = indices[node + elem * P_1D * P_1D];
@@ -230,10 +227,10 @@
   }
 }

-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
-                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
-                                                  CeedScalar *__restrict__ d_v) {
+                                                  const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_v,
+                                                  ScalarOut *__restrict__ d_v) {
   const CeedInt target_comp = n / (P_1D * P_1D);
   const CeedInt target_node_x = n % P_1D;
   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
@@ -249,9 +246,9 @@
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, full assembly
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
-                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                                    const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   const CeedInt elem_size = P_1D * P_1D;
   const CeedInt in_comp = in / elem_size;
   const CeedInt in_node_x = in % P_1D;
@@ -273,9 +270,9 @@ inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, Qfunction assembly
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D>
+template <int NUM_COMP, int Q_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
-                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                                      const CeedInt output_offset, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;

@@ -288,9 +285,8 @@ inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, con
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
-                                          CeedScalar *__restrict__ d_v) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -306,8 +302,8 @@ inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt e
 //------------------------------------------------------------------------------
 // Set E-vector value
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
-inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
+inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const ScalarIn value, ScalarOut *__restrict__ r_v) {
   const CeedInt target_comp = n / (P_1D * P_1D * P_1D);
   const CeedInt target_node_x = n % P_1D;
   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
@@ -321,9 +317,9 @@ inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const Cee
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+                                          const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -337,9 +333,8 @@ inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt n
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
-                                         CeedScalar *__restrict__ r_u) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -353,10 +348,9 @@ inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt el
 //------------------------------------------------------------------------------
 // E-vector -> Q-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
+template <int NUM_COMP, int COMP_STRIDE, int Q_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
-                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
-                                               CeedScalar *__restrict__ r_u) {
+                                               const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
     const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D];
@@ -368,9 +362,9 @@ inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // E-vector -> Q-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
-                                              CeedScalar *__restrict__ r_u) {
+template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const ScalarIn *__restrict__ d_u,
+                                              ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -382,9 +376,9 @@ inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedI
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                           const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -395,10 +389,10 @@
   }
 }

-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
-                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
-                                                  CeedScalar *__restrict__ d_v) {
+                                                  const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_v,
+                                                  ScalarOut *__restrict__ d_v) {
   const CeedInt target_comp = n / (P_1D * P_1D * P_1D);
   const CeedInt target_node_x = n % P_1D;
   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
@@ -415,9 +409,9 @@ inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const C
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, full assembly
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
-                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                                    const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   const CeedInt elem_size = P_1D * P_1D * P_1D;
   const CeedInt in_comp = in / elem_size;
   const CeedInt in_node_x = in % P_1D;
@@ -442,9 +436,9 @@ inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, Qfunction assembly
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D>
+template <int NUM_COMP, int Q_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
-                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                                      const CeedInt output_offset, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt z = 0; z < Q_1D; z++) {
       const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;
@@ -459,9 +453,8 @@ inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, con
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
-                                          CeedScalar *__restrict__ d_v) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -475,9 +468,9 @@ inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt e
 //------------------------------------------------------------------------------
 // 3D collocated derivatives computation
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D>
-inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                        CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G,
+                                        ScalarOut *__restrict__ r_V) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       __syncthreads();
@@ -505,9 +498,9 @@ inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q,
 //------------------------------------------------------------------------------
 // 3D collocated derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D>
-inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                                 CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G,
+                                                 ScalarOut *__restrict__ r_V) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       __syncthreads();
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
index e5b31970ff..04a7718b90 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
@@ -12,8 +12,8 @@
 //------------------------------------------------------------------------------
 // 1D tensor contraction
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
-inline __device__ void Contract1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+template <int NUM_COMP, int P_1D, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void Contract1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) {
   data.slice[data.t_id_x] = *U;
   __syncthreads();
   *V = 0.0;
@@ -28,8 +28,8 @@ inline __device__ void Contract1d(SharedData_Cuda &data, const CeedScalar *U, co
 //------------------------------------------------------------------------------
 // 1D transpose tensor contraction
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
-inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+template <int NUM_COMP, int P_1D, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) {
   data.slice[data.t_id_x] = *U;
   __syncthreads();
   if (data.t_id_x < P_1D) {
@@ -43,9 +43,8 @@
 //------------------------------------------------------------------------------
 // Interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P, int Q>
-inline __device__ void InterpNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                       CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int P, int Q, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void InterpNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
   }
@@ -54,9 +53,9 @@
 //------------------------------------------------------------------------------
 // Interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P, int Q>
-inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                                CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int P, int Q, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B,
+                                                ScalarOut *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     r_V[comp] = 0.0;
     ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
@@ -66,8 +65,8 @@
 //------------------------------------------------------------------------------
 // Derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int DIM, int P, int Q>
-inline __device__ void GradNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int DIM, int P, int Q, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void GradNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G, ScalarOut *__restrict__ r_V) {
   for (CeedInt dim = 0; dim < DIM; dim++) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]);
@@ -78,9 +77,9 @@
 //------------------------------------------------------------------------------
 // Derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int DIM, int P, int Q>
-inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                              CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int DIM, int P, int Q, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G,
+                                              ScalarOut *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0;
   for (CeedInt dim = 0; dim < DIM; dim++) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -92,7 +91,7 @@
 //------------------------------------------------------------------------------
 // Quadrature weights
 //------------------------------------------------------------------------------
-template <int Q>
-inline __device__ void WeightNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
+template <int Q, class ScalarIn, class ScalarOut>
+inline __device__ void WeightNonTensor(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight, ScalarOut *w) {
   *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
 }
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
index ec7102ea2c..5a4cdff8cf 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
@@ -15,7 +15,8 @@
 //------------------------------------------------------------------------------
 // Interp kernels
 //------------------------------------------------------------------------------
-extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *__restrict__ d_U,
+                                  CeedScalarCPU *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
@@ -41,8 +42,8 @@
   }
 }

-extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
-                                           CeedScalar *__restrict__ d_V) {
+extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *__restrict__ d_U,
+                                           CeedScalarCPU *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
@@ -68,8 +69,8 @@
   }
 }

-extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
-                                              CeedScalar *__restrict__ d_V) {
+extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *__restrict__ d_U,
+                                              CeedScalarCPU *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
@@ -98,7 +99,8 @@
 //------------------------------------------------------------------------------
 // Grad kernels
 //------------------------------------------------------------------------------
-extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalarCPU *c_G, const CeedScalarCPU *__restrict__ d_U,
+                                CeedScalarCPU *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
@@ -124,8 +126,8 @@
   }
 }

-extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
-                                         CeedScalar *__restrict__ d_V) {
+extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalarCPU *c_G, const CeedScalarCPU *__restrict__ d_U,
+                                         CeedScalarCPU *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
@@ -151,8 +153,8 @@
   }
 }

-extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
-                                            CeedScalar *__restrict__ d_V) {
+extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalarCPU *c_G, const CeedScalarCPU *__restrict__ d_U,
+                                            CeedScalarCPU *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
@@ -181,7 +183,7 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
 //------------------------------------------------------------------------------
 // Weight kernel
 //------------------------------------------------------------------------------
-extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
+extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalarCPU *__restrict__ q_weight, CeedScalarCPU *__restrict__ d_W) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
index cb62c4f80b..74cbeb6809 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
@@ -12,8 +12,8 @@
 //------------------------------------------------------------------------------
 // Load matrices for basis actions
 //------------------------------------------------------------------------------
-template <int P, int Q>
-inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+template <int P, int Q, class ScalarIn, class ScalarOut>
+inline __device__ void LoadMatrix(SharedData_Cuda &data, const ScalarIn *__restrict__ d_B, ScalarOut *B) {
   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }

@@ -24,9 +24,9 @@
 //------------------------------------------------------------------------------
 // E-vector -> single element
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                            const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+                                            const CeedInt strides_elem, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = node * strides_node + elem * strides_elem;
@@ -40,9 +40,9 @@ inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // Single element -> E-vector
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                             const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+                                             const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = node * strides_node + elem * strides_elem;
@@ -53,9 +53,9 @@
 }

-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+                                           const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = node * strides_node + elem * strides_elem;
@@ -73,9 +73,9 @@ inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // E-vector -> single element
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                            const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+                                            const CeedInt strides_elem, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = node * strides_node + elem * strides_elem;
@@ -89,9 +89,9 @@ inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // Single element -> E-vector
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                             const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+                                             const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = node * strides_node + elem * strides_elem;
@@ -102,9 +102,9 @@
 }

-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+                                           const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = node * strides_node + elem * strides_elem;
@@ -122,9 +122,9 @@ inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // E-vector -> single element
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                            const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+                                            const CeedInt strides_elem, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -140,9 +140,9 @@ inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // Single element -> E-vector
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                             const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+                                             const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -155,9 +155,9 @@ inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedIn
 }

-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+                                           const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -177,10 +177,10 @@ inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // E-vector -> single point
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS>
+template <int NUM_COMP, int NUM_PTS, class ScalarIn, class ScalarOut>
 inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem,
-                                 const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+                                 const ScalarIn *__restrict__ d_u, ScalarOut *r_u) {
   const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;

   if (p < points_in_elem) {
@@ -197,10 +197,10 @@ inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, cons
 //------------------------------------------------------------------------------
 // Single point -> E-vector
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS>
+template <int NUM_COMP, int NUM_PTS, class ScalarIn, class ScalarOut>
 inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
-                                  const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v,
-                                  CeedScalar *d_v) {
+                                  const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const ScalarIn *r_v,
+                                  ScalarOut *d_v) {
   if (p < points_in_elem) {
     const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index a9cd1209ef..a9e0258dec 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -40,9 +40,9 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 //------------------------------------------------------------------------------
 // 1D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
-                                        CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_PTS, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X,
+                                        ScalarOut *__restrict__ r_V) {
   CeedScalar chebyshev_x[Q_1D];

   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
@@ -61,9 +61,9 @@
 //------------------------------------------------------------------------------
 // 1D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
-                                                 CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_PTS, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X,
+                                                 ScalarOut *__restrict__ r_C) {
   CeedScalar chebyshev_x[Q_1D];

   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -86,9 +86,9 @@
 //------------------------------------------------------------------------------
 // 1D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
-                                      CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_PTS, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X,
+                                      ScalarOut *__restrict__ r_V) {
   CeedScalar chebyshev_x[Q_1D];

   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -108,9 +108,9 @@
 //------------------------------------------------------------------------------
 // 1D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
-                                               CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_PTS, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X,
+                                               ScalarOut *__restrict__ r_C) {
   CeedScalar chebyshev_x[Q_1D];

   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -137,9 +137,9 @@
 //------------------------------------------------------------------------------
 // 2D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
-                                        CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_PTS, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X,
+                                        ScalarOut *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
@@ -168,9 +168,9 @@
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
-                                                 CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_PTS, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X,
+                                                 ScalarOut *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
@@ -206,9 +206,9 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda
&data, const Ce //------------------------------------------------------------------------------ // 2D derivatives at points //------------------------------------------------------------------------------ -template -inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { CeedScalar buffer[Q_1D]; @@ -241,9 +241,9 @@ inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, co //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; @@ -287,9 +287,9 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D interpolate to points //------------------------------------------------------------------------------ -template -inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; @@ -324,9 +324,9 @@ inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; @@ -368,9 +368,9 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce //------------------------------------------------------------------------------ // 3D derivatives at points //------------------------------------------------------------------------------ -template -inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradAtPoints3d(SharedData_Cuda &data, 
const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; @@ -415,9 +415,9 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h index dcb1763e38..0e39830c4d 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h @@ -20,8 +20,9 @@ //------------------------------------------------------------------------------ // Interp //------------------------------------------------------------------------------ -extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, - const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalarCPU *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalarCPU *__restrict__ d_X, const CeedScalarCPU *__restrict__ d_U, + CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -75,9 +76,9 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal } } -extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalarCPU *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarCPU *__restrict__ d_X, + const CeedScalarCPU *__restrict__ d_U, CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -145,9 +146,9 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const } } -extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalarCPU *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarCPU *__restrict__ d_X, + const CeedScalarCPU *__restrict__ d_U, CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -207,8 +208,9 @@ extern "C" __global__ void 
InterpTransposeAddAtPoints(const CeedInt num_elem, co //------------------------------------------------------------------------------ // Grad //------------------------------------------------------------------------------ -extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, - const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalarCPU *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalarCPU *__restrict__ d_X, const CeedScalarCPU *__restrict__ d_U, + CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -262,9 +264,9 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar } } -extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalarCPU *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarCPU *__restrict__ d_X, + const CeedScalarCPU *__restrict__ d_U, CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -333,9 +335,9 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C } } -extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalarCPU *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarCPU *__restrict__ d_X, + const CeedScalarCPU *__restrict__ d_U, CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h index 4f76825d50..3429fcc76f 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h @@ -16,9 +16,9 @@ //------------------------------------------------------------------------------ // 2D tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, - CeedScalar *V) { +template +inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, const ScalarIn2 *B, + ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -33,9 +33,9 @@ inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 2D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void 
ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, - CeedScalar *V) { +template +inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, const ScalarIn2 *B, + ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -50,9 +50,9 @@ inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 2D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -67,9 +67,9 @@ inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 2D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -84,9 +84,9 @@ inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 2D transpose tensor contract and add x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -100,8 +100,8 @@ inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 2D pack/unpack quadrature values //------------------------------------------------------------------------------ -template -inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { +template +inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, ScalarOut *U) { const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -112,8 +112,8 @@ inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const in } } -template -inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { +template +inline __device__ void 
QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, ScalarOut *U) { const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -127,9 +127,9 @@ inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const //------------------------------------------------------------------------------ // 2D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -146,9 +146,9 @@ inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar //------------------------------------------------------------------------------ // 2D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -164,9 +164,9 @@ inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, C //------------------------------------------------------------------------------ // 2D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -185,9 +185,9 @@ inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar * //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -205,8 +205,8 @@ inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, Cee //------------------------------------------------------------------------------ // 2D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void 
WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; @@ -219,9 +219,9 @@ inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -236,9 +236,9 @@ inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 3D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -253,9 +253,9 @@ inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -270,9 +270,9 @@ inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -287,9 +287,9 @@ inline __device__ void 
ContractTransposeZ3dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 3D transpose tensor contract z //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, - const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -303,9 +303,9 @@ inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -320,9 +320,9 @@ inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, - const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -336,9 +336,9 @@ inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 3D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -353,9 +353,9 @@ inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 3D transpose tensor contract add x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, - const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); 
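For context, every hunk in this file applies one pattern: the device-side contraction helpers gain independent ScalarIn*/ScalarOut template type parameters so E-vector data can enter and leave in reduced precision, while the intermediate registers (CeedScalar buffer[Q_1D], CeedScalar r_t[1], ...) stay in full CeedScalar precision. A minimal standalone sketch of that pattern, assuming CeedScalar is double (the FP64 build); the names ContractSketch1d, kP, and kQ are illustrative only, not libCEED identifiers:

// Sketch only -- not part of this patch. Assumes the FP64 configuration,
// where CeedScalar is double; kP/kQ/ContractSketch1d are hypothetical names.
typedef double CeedScalar;

template <int kP, int kQ, typename ScalarIn1, typename ScalarIn2, typename ScalarOut>
inline __device__ void ContractSketch1d(const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) {
  for (int q = 0; q < kQ; q++) {
    CeedScalar sum = 0.0;  // accumulate in full precision regardless of the I/O types

    for (int p = 0; p < kP; p++) sum += (CeedScalar)B[p + q * kP] * (CeedScalar)U[p];
    V[q] = (ScalarOut)sum;  // narrow (e.g. to float) only on the final store
  }
}

// Example instantiation mixing single-precision inputs with a double output:
//   ContractSketch1d<3, 4, float, float, double>(r_u, s_B, r_v);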
@@ -369,8 +369,8 @@ inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 3D pack/unpack quadrature values //------------------------------------------------------------------------------ -template -inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { +template +inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, ScalarOut *U) { const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -381,8 +381,8 @@ inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const in } } -template -inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { +template +inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, ScalarOut *U) { const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D); for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -396,9 +396,9 @@ inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const //------------------------------------------------------------------------------ // 3D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -416,9 +416,9 @@ inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -435,9 +435,9 @@ inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, C //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ 
r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -461,9 +461,9 @@ inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar * //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -486,9 +486,9 @@ inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, Cee //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -509,9 +509,9 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, Ce //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + const ScalarIn3 *c_G, ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -531,8 +531,8 @@ inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda //------------------------------------------------------------------------------ // 3D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? 
q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h index f4f701505a..b453d66596 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h @@ -16,8 +16,8 @@ //------------------------------------------------------------------------------ // 1D tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x] = *U; __syncthreads(); @@ -32,8 +32,8 @@ inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 1D transpose tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x] = *U; __syncthreads(); @@ -48,8 +48,8 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 1D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { +template +inline __device__ void Interp1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX1d(data, &r_U[comp], c_B, &r_V[comp]); } @@ -58,9 +58,9 @@ inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restr //------------------------------------------------------------------------------ // 1D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeX1d(data, &r_U[comp], c_B, &r_V[comp]); } @@ -69,9 +69,9 @@ inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar //------------------------------------------------------------------------------ // 1D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void Grad1d(SharedData_Cuda &data, const 
ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX1d(data, &r_U[comp], c_G, &r_V[comp]); } @@ -80,9 +80,9 @@ inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restric //------------------------------------------------------------------------------ // 1D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTranspose1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeX1d(data, &r_U[comp], c_G, &r_V[comp]); } @@ -91,8 +91,8 @@ inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar * //------------------------------------------------------------------------------ // 1D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void Weight1d(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; } @@ -103,8 +103,8 @@ inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restr //------------------------------------------------------------------------------ // 2D tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -119,8 +119,8 @@ inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 2D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractY2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -135,8 +135,8 @@ inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 2D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -151,8 +151,8 @@ inline __device__ void 
ContractTransposeY2d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 2D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -167,8 +167,8 @@ inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 2D transpose tensor contract and add x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -182,9 +182,8 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 2D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX2d(data, &r_U[comp], c_B, r_t); @@ -195,9 +194,9 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 2D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeY2d(data, &r_U[comp], c_B, r_t); @@ -208,9 +207,9 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 2D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX2d(data, &r_U[comp], c_G, r_t); @@ 
-223,9 +222,9 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeY2d(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); @@ -238,8 +237,8 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc //------------------------------------------------------------------------------ // 2D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor2d(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; } @@ -250,8 +249,8 @@ inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 3D tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { r_B[i] = B[i + data.t_id_x * P_1D]; @@ -273,8 +272,8 @@ inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 3D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractY3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { r_B[i] = B[i + data.t_id_y * P_1D]; @@ -296,8 +295,8 @@ inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractZ3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { for (CeedInt k = 0; k < Q_1D; k++) { V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { @@ -311,8 +310,8 @@ inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, c 
//------------------------------------------------------------------------------ // 3D transpose tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { for (CeedInt k = 0; k < P_1D; k++) { V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { @@ -326,8 +325,8 @@ inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_y + i * P_1D]; @@ -349,8 +348,8 @@ inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_y + i * P_1D]; @@ -371,8 +370,8 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_x + i * P_1D]; @@ -394,8 +393,8 @@ inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract add x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_x + i * P_1D]; @@ -416,9 +415,8 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline 
__device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -431,9 +429,9 @@ inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -446,9 +444,9 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -467,9 +465,9 @@ inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__r //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -488,9 +486,9 @@ inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedSc //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -506,9 +504,9 @@ inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const 
CeedS //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + const ScalarIn3 *c_G, ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -524,8 +522,8 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, co //------------------------------------------------------------------------------ // 3D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor3d(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; for (CeedInt q = 0; q < Q_1D; q++) { diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h index 1252c8197d..d21fec19f5 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h @@ -15,7 +15,8 @@ //------------------------------------------------------------------------------ // Interp kernel by dim //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *__restrict__ d_U, + CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -53,8 +54,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, } } -extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *__restrict__ d_U, + CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -92,8 +93,8 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca } } -extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *__restrict__ d_U, + CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -134,8 +135,8 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed //------------------------------------------------------------------------------ // Grad kernel by dim 
//------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *c_G, const CeedScalarCPU *__restrict__ d_U, + CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -177,8 +178,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, c } } -extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *c_G, + const CeedScalarCPU *__restrict__ d_U, CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -220,8 +221,8 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala } } -extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *c_G, + const CeedScalarCPU *__restrict__ d_U, CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -266,7 +267,7 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc //------------------------------------------------------------------------------ // Weight kernels by dim //------------------------------------------------------------------------------ -extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) { +extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalarCPU *__restrict__ q_weight_1d, CeedScalarCPU *__restrict__ d_W) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; diff --git a/include/ceed/jit-source/cuda/cuda-types.h b/include/ceed/jit-source/cuda/cuda-types.h index 9acb0064a3..969cbb1d90 100644 --- a/include/ceed/jit-source/cuda/cuda-types.h +++ b/include/ceed/jit-source/cuda/cuda-types.h @@ -14,8 +14,8 @@ #define CEED_CUDA_NUMBER_FIELDS 16 typedef struct { - const CeedScalar *inputs[CEED_CUDA_NUMBER_FIELDS]; - CeedScalar *outputs[CEED_CUDA_NUMBER_FIELDS]; + const CeedScalarCPU *inputs[CEED_CUDA_NUMBER_FIELDS]; + CeedScalarCPU *outputs[CEED_CUDA_NUMBER_FIELDS]; } Fields_Cuda; typedef struct { @@ -24,10 +24,10 @@ typedef struct { } FieldsInt_Cuda; typedef struct { - CeedInt num_elem; - const CeedInt *num_per_elem; - const CeedInt *indices; - const CeedScalar *coords; + CeedInt num_elem; + const CeedInt *num_per_elem; + const CeedInt *indices; + const CeedScalarCPU *coords; } Points_Cuda; typedef struct { diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c index d0545ba7e6..6c2a0cc5cd 100644 --- a/interface/ceed-operator.c +++ b/interface/ceed-operator.c @@ -634,6 +634,47 @@ int CeedOperatorIsSetupDone(CeedOperator op, bool *is_setup_done) { return CEED_ERROR_SUCCESS; } +/** + @brief Set a `CeedOperator` to use reduced precision for operator application + + @param[in] op `CeedOperator` + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ 
+int CeedOperatorSetMixedPrecision(CeedOperator op) { + bool is_immutable, is_composite, supports_mixed_precision; + Ceed ceed; + + CeedCall(CeedOperatorGetCeed(op, &ceed)); + CeedCall(CeedOperatorIsImmutable(op, &is_immutable)); + CeedCheck(!is_immutable, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetMixedPrecision must be called before the operator is finalized"); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + CeedCheck(!is_composite, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetMixedPrecision must be called on non-composite operators"); + CeedCall(CeedGetSupportsMixedPrecision(ceed, &supports_mixed_precision)); + CeedCheck(supports_mixed_precision, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement mixed precision operators"); + + op->use_mixed_precision = true; + CeedCall(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get whether a `CeedOperator` is set to use reduced precision for operator application + + @param[in] op `CeedOperator` + @param[out] use_mixed_precision Variable to store mixed precision status + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedOperatorGetMixedPrecision(CeedOperator op, bool *use_mixed_precision) { + *use_mixed_precision = op->use_mixed_precision; + return CEED_ERROR_SUCCESS; +} + /** @brief Get the `CeedQFunction` associated with a `CeedOperator` @@ -1076,10 +1117,10 @@ int CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_poin /** @brief Get a boolean value indicating if the `CeedOperator` was created with `CeedOperatorCreateAtPoints` - + @param[in] op `CeedOperator` @param[out] is_at_points Variable to store at points status - + @return An error code: 0 - success, otherwise - failure @ref User
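Together, the setter and getter above define the user-facing opt-in: request reduced precision after the operator's fields are set but before the first apply finalizes it, then query the flag if needed. A minimal usage sketch, assuming `op_mass` is a non-composite operator on a backend that advertises support and `u`, `v` are compatible vectors:

  bool use_mixed = false;

  CeedOperatorSetMixedPrecision(op_mass);  // must precede the first CeedOperatorApply()
  CeedOperatorGetMixedPrecision(op_mass, &use_mixed);
  // use_mixed is now true; the operator is applied as usual
  CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE);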
diff --git a/interface/ceed.c b/interface/ceed.c index 39a8d3a911..5b8bdc994c 100644 --- a/interface/ceed.c +++ b/interface/ceed.c @@ -733,6 +733,21 @@ int CeedSetDeterministic(Ceed ceed, bool is_deterministic) { return CEED_ERROR_SUCCESS; } +/** + @brief Flag `Ceed` context as being able to create mixed precision operators + + @param[in] ceed `Ceed` to flag as supporting mixed precision + @param[in] supports_mixed_precision Mixed precision support status to set + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedSetSupportsMixedPrecision(Ceed ceed, bool supports_mixed_precision) { + ceed->supports_mixed_precision = supports_mixed_precision; + return CEED_ERROR_SUCCESS; +} + /** @brief Set a backend function. @@ -1446,6 +1461,21 @@ int CeedIsDeterministic(Ceed ceed, bool *is_deterministic) { return CEED_ERROR_SUCCESS; } +/** + @brief Get mixed precision support status of `Ceed` context + + @param[in] ceed `Ceed` context + @param[out] supports_mixed_precision Variable to store mixed precision support status + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedGetSupportsMixedPrecision(Ceed ceed, bool *supports_mixed_precision) { + *supports_mixed_precision = ceed->supports_mixed_precision; + return CEED_ERROR_SUCCESS; +} + /** @brief Set additional JiT source root for `Ceed` context
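A backend advertises this capability once, during initialization, and user code can make the opt-in conditional on the query. A minimal sketch, assuming a hypothetical backend init function CeedInit_MyBackend (the name and surrounding setup are illustrative, not part of this patch):

  static int CeedInit_MyBackend(const char *resource, Ceed ceed) {
    // Advertise mixed precision support before any operators are created
    CeedCallBackend(CeedSetSupportsMixedPrecision(ceed, true));
    // ... remaining backend setup ...
    return CEED_ERROR_SUCCESS;
  }

On the user side, CeedGetSupportsMixedPrecision(ceed, &supports) performs the same check that CeedOperatorSetMixedPrecision applies internally before accepting the request.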
diff --git a/tests/t502-operator-mixed.c b/tests/t502-operator-mixed.c new file mode 100644 index 0000000000..4218b82b1d --- /dev/null +++ b/tests/t502-operator-mixed.c @@ -0,0 +1,125 @@ +/// @file +/// Test creation, action, and destruction for mass matrix operator with multiple components with mixed precision +/// \test Test creation, action, and destruction for mass matrix operator with multiple components with mixed precision +#include "t502-operator.h" + +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, x, u, v; + CeedInt num_elem = 15, p = 5, q = 8; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, num_nodes_x, &x); + { + CeedScalar x_array[num_nodes_x]; + + for (CeedInt i = 0; i < num_nodes_x; i++) x_array[i] = (CeedScalar)i / (num_nodes_x - 1); + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, 2 * num_nodes_u, &u); + CeedVectorCreate(ceed, 2 * num_nodes_u, &v); + CeedVectorCreate(ceed, num_elem * q, &q_data); + + // Restrictions + for (CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = 2 * (i * (p - 1) + j); + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, 2, 1, 2 * num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + CeedInt strides_q_data[3] = {1, q, q}; + CeedElemRestrictionCreateStrided(ceed, num_elem, q, 1, q * num_elem, strides_q_data, &elem_restriction_q_data); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 2, p, q, CEED_GAUSS, &basis_u); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "dx", 1 * 1, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", 2, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", 2, CEED_EVAL_INTERP); + + // Operators + CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + + CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + + CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); + + { + CeedScalar *u_array; + + CeedVectorGetArrayWrite(u, CEED_MEM_HOST, &u_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + u_array[2 * i] = 1.0; + u_array[2 * i + 1] = 2.0; + } + CeedVectorRestoreArray(u, &u_array); + } + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum_1 = 0., sum_2 = 0.; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + sum_1 += v_array[2 * i]; + sum_2 += v_array[2 * i + 1]; + } + CeedVectorRestoreArrayRead(v, &v_array); + if (fabs(sum_1 - 1.) > 1000. * FLT_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. * FLT_EPSILON) printf("Computed Area: %f != True Area: 2.0\n", sum_2); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t503-operator-mixed.c b/tests/t503-operator-mixed.c new file mode 100644 index 0000000000..7042c7846c --- /dev/null +++ b/tests/t503-operator-mixed.c @@ -0,0 +1,114 @@ +/// @file +/// Test creation, action, and destruction for mass matrix operator with passive inputs and outputs with mixed precision +/// \test Test creation, action, and destruction for mass matrix operator with passive inputs and outputs with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t500-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, x, u, v; + CeedInt num_elem = 15, p = 5, q = 8; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + + CeedInit(argv[1], &ceed); + + // Vectors + CeedVectorCreate(ceed, num_nodes_x, &x); + { + CeedScalar x_array[num_nodes_x]; + + for (CeedInt i = 0; i < num_nodes_x; i++) x_array[i] = (CeedScalar)i / (num_nodes_x - 1); + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_nodes_u, &u); + CeedVectorCreate(ceed, num_nodes_u, &v); + CeedVectorCreate(ceed, num_elem * q, &q_data); + + // Restrictions + for (CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, 
num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = i * (p - 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + CeedInt strides_q_data[3] = {1, q, q}; + CeedElemRestrictionCreateStrided(ceed, num_elem, q, 1, q * num_elem, strides_q_data, &elem_restriction_q_data); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, p, q, CEED_GAUSS, &basis_u); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "dx", 1, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + // Operators + CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, x); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetMixedPrecision(op_setup); + + CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, u); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, v); + CeedOperatorSetMixedPrecision(op_mass); + + // Note - It is atypical to use only passive fields; this test is intended + // as a test for all passive input modes rather than as an example. + CeedOperatorApply(op_setup, CEED_VECTOR_NONE, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); + CeedVectorSetValue(u, 1.0); + CeedOperatorApply(op_mass, CEED_VECTOR_NONE, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) sum += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + if (fabs(sum - 1.) > 1000. 
* FLT_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t505-operator-mixed.c b/tests/t505-operator-mixed.c new file mode 100644 index 0000000000..2efe5bd05d --- /dev/null +++ b/tests/t505-operator-mixed.c @@ -0,0 +1,128 @@ +/// @file +/// Test CeedOperatorApplyAdd with mixed precision +/// \test Test CeedOperatorApplyAdd with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t500-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, x, u, v; + CeedInt num_elem = 15, p = 5, q = 8; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, num_nodes_x, &x); + { + CeedScalar x_array[num_nodes_x]; + for (CeedInt i = 0; i < num_nodes_x; i++) x_array[i] = (CeedScalar)i / (num_nodes_x - 1); + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_nodes_u, &u); + CeedVectorCreate(ceed, num_nodes_u, &v); + CeedVectorCreate(ceed, num_elem * q, &q_data); + + // Restrictions + for (CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = i * (p - 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + CeedInt strides_q_data[3] = {1, q, q}; + CeedElemRestrictionCreateStrided(ceed, num_elem, q, 1, q * num_elem, strides_q_data, &elem_restriction_q_data); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, p, q, CEED_GAUSS, &basis_u); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "dx", 1, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + // Operators + CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + 
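// Mixed precision must be requested before the operator is finalized by its first apply + 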
CeedOperatorSetMixedPrecision(op_setup); + + CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + + CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); + + // Apply with V = 0 + CeedVectorSetValue(u, 1.0); + CeedVectorSetValue(v, 0.0); + CeedOperatorApplyAdd(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) sum += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + if (fabs(sum - 1.) > 1000. * FLT_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); + } + + // Apply with V = 1 + CeedVectorSetValue(v, 1.0); + CeedOperatorApplyAdd(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = -num_nodes_u; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) sum += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + if (fabs(sum - 1.) > 1000. * FLT_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t506-operator-mixed.c b/tests/t506-operator-mixed.c new file mode 100644 index 0000000000..0069f451ff --- /dev/null +++ b/tests/t506-operator-mixed.c @@ -0,0 +1,173 @@ +/// @file +/// Test creation and reuse of the same QFunction for multiple operators with mixed precision +/// \test Test creation and reuse of the same QFunction for multiple operators with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t502-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x, elem_restriction_u, elem_restriction_q_data_small, elem_restriction_q_data_large; + CeedBasis basis_x_small, basis_x_large, basis_u_small, basis_u_large; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup_small, op_mass_small, op_setup_large, op_mass_large; + CeedVector q_data_small, q_data_large, x, u, v; + CeedInt num_elem = 15, p = 5, q = 8, scale = 3, num_comp = 2; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, num_nodes_x, &x); + { + CeedScalar x_array[num_nodes_x]; + + for (CeedInt i = 0; i < num_nodes_x; i++) x_array[i] = (CeedScalar)i / (num_nodes_x - 1); + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_comp * num_nodes_u, &u); + CeedVectorCreate(ceed, num_comp * num_nodes_u, &v); + CeedVectorCreate(ceed, num_elem * q, &q_data_small); + CeedVectorCreate(ceed, num_elem * q * scale, &q_data_large); + + // Restrictions + for 
(CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, num_comp, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = num_comp * (i * (p - 1) + j); + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, num_comp, 1, num_comp * num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + CeedInt strides_q_data_small[3] = {1, q, q}; + CeedElemRestrictionCreateStrided(ceed, num_elem, q, 1, q * num_elem, strides_q_data_small, &elem_restriction_q_data_small); + + CeedInt strides_q_data_large[3] = {1, q * scale, q * scale}; + CeedElemRestrictionCreateStrided(ceed, num_elem, q * scale, 1, q * num_elem * scale, strides_q_data_large, &elem_restriction_q_data_large); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, q, CEED_GAUSS, &basis_x_small); + CeedBasisCreateTensorH1Lagrange(ceed, 1, num_comp, p, q, CEED_GAUSS, &basis_u_small); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, q * scale, CEED_GAUSS, &basis_x_large); + CeedBasisCreateTensorH1Lagrange(ceed, 1, num_comp, p, q * scale, CEED_GAUSS, &basis_u_large); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "x", 1, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", num_comp, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", num_comp, CEED_EVAL_INTERP); + + // 'Small' Operators + CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_small); + CeedOperatorSetField(op_setup_small, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_small, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_small, "x", elem_restriction_x, basis_x_small, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_small, "rho", elem_restriction_q_data_small, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup_small); + + CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_small); + CeedOperatorSetField(op_mass_small, "rho", elem_restriction_q_data_small, CEED_BASIS_NONE, q_data_small); + CeedOperatorSetField(op_mass_small, "u", elem_restriction_u, basis_u_small, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass_small, "v", elem_restriction_u, basis_u_small, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass_small); + + // 'Large' operators + CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_large); + CeedOperatorSetField(op_setup_large, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_large, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_large, "x", elem_restriction_x, basis_x_large, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_large, "rho", elem_restriction_q_data_large, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup_large); + + CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_large); + CeedOperatorSetField(op_mass_large, "rho", elem_restriction_q_data_large, CEED_BASIS_NONE, q_data_large); + CeedOperatorSetField(op_mass_large, "u", elem_restriction_u, basis_u_large, CEED_VECTOR_ACTIVE); + 
CeedOperatorSetField(op_mass_large, "v", elem_restriction_u, basis_u_large, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass_large); + + // Setup + CeedOperatorApply(op_setup_small, x, q_data_small, CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_setup_large, x, q_data_large, CEED_REQUEST_IMMEDIATE); + + { + CeedScalar *u_array; + + CeedVectorGetArrayWrite(u, CEED_MEM_HOST, &u_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + u_array[num_comp * i] = 1.0; + u_array[num_comp * i + 1] = 2.0; + } + CeedVectorRestoreArray(u, &u_array); + } + + // 'Small' operator + CeedOperatorApply(op_mass_small, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum_1 = 0., sum_2 = 0.; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + sum_1 += v_array[num_comp * i]; + sum_2 += v_array[num_comp * i + 1]; + } + CeedVectorRestoreArrayRead(v, &v_array); + if (fabs(sum_1 - 1.) > 1000. * FLT_EPSILON) printf("Small Problem, Component 1: Computed Area %f != True Area 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. * FLT_EPSILON) printf("Small Problem, Component 2: Computed Area %f != True Area 2.0\n", sum_2); + } + + // 'Large' operator + CeedOperatorApply(op_mass_large, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum_1 = 0., sum_2 = 0.; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + sum_1 += v_array[num_comp * i]; + sum_2 += v_array[num_comp * i + 1]; + } + CeedVectorRestoreArrayRead(v, &v_array); + + if (fabs(sum_1 - 1.) > 1000. * FLT_EPSILON) printf("Large Problem, Component 1: Computed Area %f != True Area 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. * FLT_EPSILON) printf("Large Problem, Component 2: Computed Area %f != True Area 2.0\n", sum_2); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data_small); + CeedVectorDestroy(&q_data_large); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_q_data_small); + CeedElemRestrictionDestroy(&elem_restriction_q_data_large); + CeedBasisDestroy(&basis_u_small); + CeedBasisDestroy(&basis_x_small); + CeedBasisDestroy(&basis_u_large); + CeedBasisDestroy(&basis_x_large); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup_small); + CeedOperatorDestroy(&op_mass_small); + CeedOperatorDestroy(&op_setup_large); + CeedOperatorDestroy(&op_mass_large); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t510-operator-mixed.c b/tests/t510-operator-mixed.c new file mode 100644 index 0000000000..42853f7165 --- /dev/null +++ b/tests/t510-operator-mixed.c @@ -0,0 +1,133 @@ +/// @file +/// Test creation, action, and destruction for mass matrix operator with mixed precision +/// \test Test creation, action, and destruction for mass matrix operator with mixed precision +#include "t510-operator.h" + +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t320-basis.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, x, u, v; + CeedInt num_elem = 12, dim = 2, p = 6, q = 4; + CeedInt nx = 3, ny = 2; + CeedInt row, col, offset; + CeedInt num_dofs = (nx * 2 + 1) 
* (ny * 2 + 1), num_qpts = num_elem * q; + CeedInt ind_x[num_elem * p]; + CeedScalar q_ref[dim * q], q_weight[q]; + CeedScalar interp[p * q], grad[dim * p * q]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, dim * num_dofs, &x); + { + CeedScalar x_array[dim * num_dofs]; + + for (CeedInt i = 0; i < num_dofs; i++) { + x_array[i] = (1. / (nx * 2)) * (CeedScalar)(i % (nx * 2 + 1)); + x_array[i + num_dofs] = (1. / (ny * 2)) * (CeedScalar)(i / (nx * 2 + 1)); + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_qpts, &q_data); + CeedVectorCreate(ceed, num_dofs, &u); + CeedVectorCreate(ceed, num_dofs, &v); + + // Restrictions + for (CeedInt i = 0; i < num_elem / 2; i++) { + col = i % nx; + row = i / nx; + offset = col * 2 + row * (nx * 2 + 1) * 2; + + ind_x[i * 2 * p + 0] = 2 + offset; + ind_x[i * 2 * p + 1] = 9 + offset; + ind_x[i * 2 * p + 2] = 16 + offset; + ind_x[i * 2 * p + 3] = 1 + offset; + ind_x[i * 2 * p + 4] = 8 + offset; + ind_x[i * 2 * p + 5] = 0 + offset; + + ind_x[i * 2 * p + 6] = 14 + offset; + ind_x[i * 2 * p + 7] = 7 + offset; + ind_x[i * 2 * p + 8] = 0 + offset; + ind_x[i * 2 * p + 9] = 15 + offset; + ind_x[i * 2 * p + 10] = 8 + offset; + ind_x[i * 2 * p + 11] = 16 + offset; + } + CeedElemRestrictionCreate(ceed, num_elem, p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_u); + + CeedInt strides_q_data[3] = {1, q, q}; + CeedElemRestrictionCreateStrided(ceed, num_elem, q, 1, num_qpts, strides_q_data, &elem_restriction_q_data); + + // Bases + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, dim, p, q, interp, grad, q_ref, q_weight, &basis_x); + + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis_u); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + // Operators + CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + + CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + + CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); + + CeedVectorSetValue(u, 0.0); + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output 
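+ // u was set to zero above, so every entry of v must vanish to within FLT_EPSILON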
+ { + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_dofs; i++) { + if (fabs(v_array[i]) > FLT_EPSILON) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); + } + CeedVectorRestoreArrayRead(v, &v_array); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t520-operator-mixed.c b/tests/t520-operator-mixed.c new file mode 100644 index 0000000000..ea641fb604 --- /dev/null +++ b/tests/t520-operator-mixed.c @@ -0,0 +1,234 @@ +/// @file +/// Test creation, action, and destruction for composite mass matrix operator with mixed precision +/// \test Test creation, action, and destruction for composite mass matrix operator with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t320-basis.h" +#include "t510-operator.h" + +/* The mesh consists of two rows of 3 quadrilaterals followed by one row + of 6 triangles: + _ _ _ + |_|_|_| + |_|_|_| + |/|/|/| + +*/ + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x_tet, elem_restriction_u_tet, elem_restriction_q_data_tet, elem_restriction_x_hex, elem_restriction_u_hex, + elem_restriction_q_data_hex; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_mass_tet, qf_setup_hex, qf_mass_hex; + CeedOperator op_setup_tet, op_mass_tet, op_setup_hex, op_mass_hex, op_setup, op_mass; + CeedVector q_data_tet, q_data_hex, x, u, v; + CeedInt num_elem_tet = 6, p_tet = 6, q_tet = 4, num_elem_hex = 6, p_hex = 3, q_hex = 4, dim = 2; + CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * q_tet, num_qpts_hex = num_elem_hex * q_hex * q_hex; + CeedInt ind_x_tet[num_elem_tet * p_tet], ind_x_hex[num_elem_hex * p_hex * p_hex]; + CeedScalar q_ref[dim * q_tet], q_weight[q_tet]; + CeedScalar interp[p_tet * q_tet], grad[dim * p_tet * q_tet]; + + CeedInit(argv[1], &ceed); + + // Vectors + CeedVectorCreate(ceed, dim * num_dofs, &x); + { + CeedScalar x_array[dim * num_dofs]; + + for (CeedInt i = 0; i < n_y * 2 + 1; i++) { + for (CeedInt j = 0; j < n_x * 2 + 1; j++) { + x_array[i + j * (n_y * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_y); + x_array[i + j * (n_y * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_x); + } + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_dofs, &u); + CeedVectorCreate(ceed, num_dofs, &v); + CeedVectorCreate(ceed, num_qpts_tet, &q_data_tet); + CeedVectorCreate(ceed, num_qpts_hex, &q_data_hex); + + // Set up Tet Elements + // -- Restrictions + for (CeedInt i = 0; i < num_elem_tet / 2; i++) { + col = i % n_x_tet; + row = i / n_x_tet; + offset = col * 2 + row * (n_x_tet * 2 + 1) * 2; + + ind_x_tet[i * 2 * p_tet + 0] = 2 + offset; + ind_x_tet[i * 2 * p_tet + 1] = 9 + offset; + ind_x_tet[i * 2 * p_tet + 2] = 16 + offset; + ind_x_tet[i * 2 * p_tet + 3] = 1 + offset; + ind_x_tet[i * 2 * p_tet + 4] = 8 + offset; + ind_x_tet[i * 2 * p_tet + 5] = 0 + 
offset; + + ind_x_tet[i * 2 * p_tet + 6] = 14 + offset; + ind_x_tet[i * 2 * p_tet + 7] = 7 + offset; + ind_x_tet[i * 2 * p_tet + 8] = 0 + offset; + ind_x_tet[i * 2 * p_tet + 9] = 15 + offset; + ind_x_tet[i * 2 * p_tet + 10] = 8 + offset; + ind_x_tet[i * 2 * p_tet + 11] = 16 + offset; + } + CeedElemRestrictionCreate(ceed, num_elem_tet, p_tet, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_tet, + &elem_restriction_x_tet); + CeedElemRestrictionCreate(ceed, num_elem_tet, p_tet, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_tet, &elem_restriction_u_tet); + + CeedInt strides_q_data_tet[3] = {1, q_tet, q_tet}; + CeedElemRestrictionCreateStrided(ceed, num_elem_tet, q_tet, 1, num_qpts_tet, strides_q_data_tet, &elem_restriction_q_data_tet); + + // -- Bases + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, dim, p_tet, q_tet, interp, grad, q_ref, q_weight, &basis_x_tet); + + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p_tet, q_tet, interp, grad, q_ref, q_weight, &basis_u_tet); + + // -- QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup_tet); + CeedQFunctionAddInput(qf_setup_tet, "_weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup_tet, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup_tet, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass_tet); + CeedQFunctionAddInput(qf_mass_tet, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass_tet, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass_tet, "v", 1, CEED_EVAL_INTERP); + + // -- Operators + // ---- Setup Tet + CeedOperatorCreate(ceed, qf_setup_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_tet); + CeedOperatorSetField(op_setup_tet, "_weight", CEED_ELEMRESTRICTION_NONE, basis_x_tet, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_tet, "dx", elem_restriction_x_tet, basis_x_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); + CeedOperatorSetMixedPrecision(op_setup_tet); + // ---- Mass Tet + CeedOperatorCreate(ceed, qf_mass_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_tet); + CeedOperatorSetField(op_mass_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); + CeedOperatorSetField(op_mass_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetName(op_mass_tet, "mass tet"); + CeedOperatorSetMixedPrecision(op_mass_tet); + + // Set up Hex Elements + // -- Restrictions + for (CeedInt i = 0; i < num_elem_hex; i++) { + col = i % n_x_hex; + row = i / n_x_hex; + offset = (n_x_tet * 2 + 1) * (n_y_tet * 2) * (1 + row) + col * 2; + for (CeedInt j = 0; j < p_hex; j++) { + for (CeedInt k = 0; k < p_hex; k++) ind_x_hex[p_hex * (p_hex * i + k) + j] = offset + k * (n_x_hex * 2 + 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem_hex, p_hex * p_hex, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_hex, + &elem_restriction_x_hex); + CeedElemRestrictionCreate(ceed, num_elem_hex, p_hex * p_hex, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_hex, &elem_restriction_u_hex); + + CeedInt strides_q_data_hex[3] = {1, q_hex * q_hex, q_hex * q_hex}; + CeedElemRestrictionCreateStrided(ceed, num_elem_hex, q_hex * q_hex, 1, num_qpts_hex, strides_q_data_hex, 
&elem_restriction_q_data_hex); + + // -- Bases + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p_hex, q_hex, CEED_GAUSS, &basis_x_hex); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_hex, q_hex, CEED_GAUSS, &basis_u_hex); + + // -- QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup_hex); + CeedQFunctionAddInput(qf_setup_hex, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup_hex, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup_hex, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass_hex); + CeedQFunctionAddInput(qf_mass_hex, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass_hex, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass_hex, "v", 1, CEED_EVAL_INTERP); + + // -- Operators + CeedOperatorCreate(ceed, qf_setup_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_hex); + CeedOperatorSetField(op_setup_hex, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_hex, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_hex, "dx", elem_restriction_x_hex, basis_x_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); + CeedOperatorSetMixedPrecision(op_setup_hex); + + CeedOperatorCreate(ceed, qf_mass_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_hex); + CeedOperatorSetField(op_mass_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); + CeedOperatorSetField(op_mass_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetName(op_mass_hex, "mass hex"); + CeedOperatorSetMixedPrecision(op_mass_hex); + + // Set up Composite Operators + // -- Create + CeedCompositeOperatorCreate(ceed, &op_setup); + // -- Add SubOperators + CeedCompositeOperatorAddSub(op_setup, op_setup_tet); + CeedCompositeOperatorAddSub(op_setup, op_setup_hex); + + // -- Create + CeedCompositeOperatorCreate(ceed, &op_mass); + // -- Add SubOperators + CeedCompositeOperatorAddSub(op_mass, op_mass_tet); + CeedCompositeOperatorAddSub(op_mass, op_mass_hex); + + { // Test CeedCompositeOperatorGetSubByName + CeedOperator op_byname; + + CeedCompositeOperatorGetSubByName(op_mass, "mass hex", &op_byname); + if (op_byname != op_mass_hex) printf("CeedCompositeOperatorGetSubByName returned incorrect Sub Operator\n"); + + CeedCompositeOperatorGetSubByName(op_mass, "asdf", &op_byname); + if (op_byname != NULL) printf("CeedCompositeOperatorGetSubByName returned non-NULL for non-existent Sub Operator\n"); + } + + // Apply Setup Operator + CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); + + // Apply Mass Operator + CeedVectorSetValue(u, 0.0); + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_dofs; i++) { + if (fabs(v_array[i]) > FLT_EPSILON) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); + } + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Cleanup + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data_tet); + CeedVectorDestroy(&q_data_hex); + CeedElemRestrictionDestroy(&elem_restriction_u_tet); + CeedElemRestrictionDestroy(&elem_restriction_x_tet); + CeedElemRestrictionDestroy(&elem_restriction_q_data_tet); + CeedElemRestrictionDestroy(&elem_restriction_u_hex); + CeedElemRestrictionDestroy(&elem_restriction_x_hex); + CeedElemRestrictionDestroy(&elem_restriction_q_data_hex); + CeedBasisDestroy(&basis_u_tet); + CeedBasisDestroy(&basis_x_tet); + CeedBasisDestroy(&basis_u_hex); + CeedBasisDestroy(&basis_x_hex); + CeedQFunctionDestroy(&qf_setup_tet); + CeedQFunctionDestroy(&qf_mass_tet); + CeedOperatorDestroy(&op_setup_tet); + CeedOperatorDestroy(&op_mass_tet); + CeedQFunctionDestroy(&qf_setup_hex); + CeedQFunctionDestroy(&qf_mass_hex); + CeedOperatorDestroy(&op_setup_hex); + CeedOperatorDestroy(&op_mass_hex); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +}
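Because CeedOperatorSetMixedPrecision rejects composite operators, mixed precision is requested on each sub-operator before it is added, the pattern both t520 above and t522 below follow. Schematically:

  CeedOperatorSetMixedPrecision(op_mass_tet);
  CeedOperatorSetMixedPrecision(op_mass_hex);
  CeedCompositeOperatorCreate(ceed, &op_mass);
  CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
  CeedCompositeOperatorAddSub(op_mass, op_mass_hex);

Keeping the flag per sub-operator means a composite operator can, in principle, mix reduced- and full-precision sub-operators.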
diff --git a/tests/t522-operator-mixed.c b/tests/t522-operator-mixed.c new file mode 100644 index 0000000000..071c979790 --- /dev/null +++ b/tests/t522-operator-mixed.c @@ -0,0 +1,221 @@ +/// @file +/// Test creation, action, and destruction for diffusion matrix operator with mixed precision +/// \test Test creation, action, and destruction for diffusion matrix operator with mixed precision +#include "t522-operator.h" + +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t320-basis.h" + +/* The mesh consists of two rows of 3 quadrilaterals followed by one row + of 6 triangles: + _ _ _ + |_|_|_| + |_|_|_| + |/|/|/| + +*/ + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x_tet, elem_restriction_u_tet, elem_restriction_q_data_tet, elem_restriction_x_hex, elem_restriction_u_hex, + elem_restriction_q_data_hex; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_diff_tet, qf_setup_hex, qf_diff_hex; + CeedOperator op_setup_tet, op_diff_tet, op_setup_hex, op_diff_hex, op_setup, op_diff; + CeedVector q_data_tet, q_data_hex, x, u, v; + CeedInt num_elem_tet = 6, p_tet = 6, q_tet = 4, num_elem_hex = 6, p_hex = 3, q_hex = 4, dim = 2; + CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * q_tet, num_qpts_hex = num_elem_hex * q_hex * q_hex; + CeedInt ind_x_tet[num_elem_tet * p_tet], ind_x_hex[num_elem_hex * p_hex * p_hex]; + CeedScalar q_ref[dim * q_tet], q_weight[q_tet]; + CeedScalar interp[p_tet * q_tet], grad[dim * p_tet * q_tet]; + + CeedInit(argv[1], &ceed); + + // Vectors + CeedVectorCreate(ceed, dim * num_dofs, &x); + { + CeedScalar x_array[dim * num_dofs]; + + for (CeedInt i = 0; i < n_y * 2 + 1; i++) { + for (CeedInt j = 0; j < n_x * 2 + 1; j++) { + x_array[i + j * (n_y * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_y); + x_array[i + j * (n_y * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_x); + } + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_dofs, &u); + CeedVectorCreate(ceed, num_dofs, &v); + CeedVectorCreate(ceed, num_qpts_tet * dim * (dim + 1) / 2, &q_data_tet); + CeedVectorCreate(ceed, num_qpts_hex * dim * (dim + 1) / 2, &q_data_hex); + + // Tet Elements + // -- Restrictions + for (CeedInt i = 0; i < num_elem_tet / 2; i++) { + col = i % n_x_tet; + row = i / n_x_tet; + offset = col * 2 + row * (n_x_tet * 2 + 1) * 2; + + ind_x_tet[i * 2 * p_tet + 0] = 2 + offset; + ind_x_tet[i * 2 * p_tet + 1] = 9 + offset; + ind_x_tet[i * 2 * p_tet + 2] = 16 + offset; + ind_x_tet[i * 2 * p_tet + 3] = 1 + offset; + ind_x_tet[i * 2 * p_tet + 4] = 8 + offset; + ind_x_tet[i * 2 * p_tet + 5] = 0 + offset; + + ind_x_tet[i * 2 * p_tet + 6] = 14 + offset; + ind_x_tet[i * 2 * p_tet + 7] = 7 + 
offset; + ind_x_tet[i * 2 * p_tet + 8] = 0 + offset; + ind_x_tet[i * 2 * p_tet + 9] = 15 + offset; + ind_x_tet[i * 2 * p_tet + 10] = 8 + offset; + ind_x_tet[i * 2 * p_tet + 11] = 16 + offset; + } + CeedElemRestrictionCreate(ceed, num_elem_tet, p_tet, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_tet, + &elem_restriction_x_tet); + CeedElemRestrictionCreate(ceed, num_elem_tet, p_tet, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_tet, &elem_restriction_u_tet); + + CeedInt strides_q_data_tet[3] = {1, q_tet, q_tet * dim * (dim + 1) / 2}; + CeedElemRestrictionCreateStrided(ceed, num_elem_tet, q_tet, dim * (dim + 1) / 2, dim * (dim + 1) / 2 * num_qpts_tet, strides_q_data_tet, + &elem_restriction_q_data_tet); + + // -- Bases + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, dim, p_tet, q_tet, interp, grad, q_ref, q_weight, &basis_x_tet); + + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p_tet, q_tet, interp, grad, q_ref, q_weight, &basis_u_tet); + + // -- QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup_tet); + CeedQFunctionAddInput(qf_setup_tet, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup_tet, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup_tet, "rho", dim * (dim + 1) / 2, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, diff, diff_loc, &qf_diff_tet); + CeedQFunctionAddInput(qf_diff_tet, "rho", dim * (dim + 1) / 2, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_diff_tet, "u", dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_diff_tet, "v", dim, CEED_EVAL_GRAD); + + // -- Operators + // ---- Setup Tet + CeedOperatorCreate(ceed, qf_setup_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_tet); + CeedOperatorSetField(op_setup_tet, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_tet, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_tet, "dx", elem_restriction_x_tet, basis_x_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); + CeedOperatorSetMixedPrecision(op_setup_tet); + // ---- Diff Tet + CeedOperatorCreate(ceed, qf_diff_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_tet); + CeedOperatorSetField(op_diff_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); + CeedOperatorSetField(op_diff_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_diff_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_diff_tet); + + // Hex Elements + // -- Restrictions + for (CeedInt i = 0; i < num_elem_hex; i++) { + col = i % n_x_hex; + row = i / n_x_hex; + offset = (n_x_tet * 2 + 1) * (n_y_tet * 2) * (1 + row) + col * 2; + for (CeedInt j = 0; j < p_hex; j++) { + for (CeedInt k = 0; k < p_hex; k++) ind_x_hex[p_hex * (p_hex * i + k) + j] = offset + k * (n_x_hex * 2 + 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem_hex, p_hex * p_hex, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_hex, + &elem_restriction_x_hex); + CeedElemRestrictionCreate(ceed, num_elem_hex, p_hex * p_hex, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_hex, &elem_restriction_u_hex); + + CeedInt strides_q_data_hex[3] = {1, q_hex * q_hex, q_hex * q_hex * dim * (dim + 1) / 2}; + CeedElemRestrictionCreateStrided(ceed, num_elem_hex, q_hex * q_hex, dim * (dim + 1) / 2, dim * (dim + 1) / 2 * num_qpts_hex, 
strides_q_data_hex, + &elem_restriction_q_data_hex); + + // -- Bases + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p_hex, q_hex, CEED_GAUSS, &basis_x_hex); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_hex, q_hex, CEED_GAUSS, &basis_u_hex); + + // -- QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup_hex); + CeedQFunctionAddInput(qf_setup_hex, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup_hex, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup_hex, "rho", dim * (dim + 1) / 2, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, diff, diff_loc, &qf_diff_hex); + CeedQFunctionAddInput(qf_diff_hex, "rho", dim * (dim + 1) / 2, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_diff_hex, "u", dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_diff_hex, "v", dim, CEED_EVAL_GRAD); + + // -- Operators + CeedOperatorCreate(ceed, qf_setup_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_hex); + CeedOperatorSetField(op_setup_hex, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_hex, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_hex, "dx", elem_restriction_x_hex, basis_x_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); + CeedOperatorSetMixedPrecision(op_setup_hex); + + CeedOperatorCreate(ceed, qf_diff_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_hex); + CeedOperatorSetField(op_diff_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); + CeedOperatorSetField(op_diff_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_diff_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_diff_hex); + + // Composite Operators + CeedCompositeOperatorCreate(ceed, &op_setup); + CeedCompositeOperatorAddSub(op_setup, op_setup_tet); + CeedCompositeOperatorAddSub(op_setup, op_setup_hex); + + CeedCompositeOperatorCreate(ceed, &op_diff); + CeedCompositeOperatorAddSub(op_diff, op_diff_tet); + CeedCompositeOperatorAddSub(op_diff, op_diff_hex); + + // Apply Setup Operator + CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); + + // Apply diff Operator + CeedVectorSetValue(u, 1.0); + CeedOperatorApply(op_diff, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_dofs; i++) { + if (fabs(v_array[i]) > FLT_EPSILON) printf("Computed: %f != True: 0.0\n", v_array[i]); + } + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Cleanup + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data_tet); + CeedVectorDestroy(&q_data_hex); + CeedElemRestrictionDestroy(&elem_restriction_u_tet); + CeedElemRestrictionDestroy(&elem_restriction_x_tet); + CeedElemRestrictionDestroy(&elem_restriction_q_data_tet); + CeedElemRestrictionDestroy(&elem_restriction_u_hex); + CeedElemRestrictionDestroy(&elem_restriction_x_hex); + CeedElemRestrictionDestroy(&elem_restriction_q_data_hex); + CeedBasisDestroy(&basis_u_tet); + CeedBasisDestroy(&basis_x_tet); + CeedBasisDestroy(&basis_u_hex); + CeedBasisDestroy(&basis_x_hex); + CeedQFunctionDestroy(&qf_setup_tet); + CeedQFunctionDestroy(&qf_diff_tet); + CeedOperatorDestroy(&op_setup_tet); + CeedOperatorDestroy(&op_diff_tet); + CeedQFunctionDestroy(&qf_setup_hex); + CeedQFunctionDestroy(&qf_diff_hex); + CeedOperatorDestroy(&op_setup_hex); + 
CeedOperatorDestroy(&op_diff_hex); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_diff); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t591-operator-mixed.c b/tests/t591-operator-mixed.c new file mode 100644 index 0000000000..d0bc270977 --- /dev/null +++ b/tests/t591-operator-mixed.c @@ -0,0 +1,197 @@ +/// @file +/// Test creation, action, and destruction for mass matrix operator at points with mixed precision +/// \test Test creation, action, and destruction for mass matrix operator at points with mixed precision +#include "t591-operator.h" + +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p = 3, q = 5; + CeedInt num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedVector x_points, x_elem, q_data, u, v; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + bool is_at_points; + + CeedInit(argv[1], &ceed); + + // Point reference coordinates + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + + // Cell coordinates + { + CeedInt p = 2, num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1); + CeedInt ind_x[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_x + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = p * g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, 1, dim * num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_x); + CeedVectorCreate(ceed, dim * num_nodes, &x_elem); + { + CeedScalar x_array[dim * num_nodes]; + + for (CeedInt i = 0; i <= num_elem_1d; i++) { + for (CeedInt j = 0; j <= num_elem_1d; j++) { + x_array[(i * 
(num_elem_1d + 1) + j) * dim + 0] = j; + x_array[(i * (num_elem_1d + 1) + j) * dim + 1] = i; + } + } + CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + } + + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + + // Cell solution + { + CeedInt ind_u[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_u + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, &elem_restriction_u); + } + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + + // Setup geometric scaling + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + CeedOperatorIsAtPoints(op_mass, &is_at_points); + if (!is_at_points) printf("Error: Operator should be at points\n"); + + CeedVectorCreate(ceed, num_nodes, &u); + CeedVectorSetValue(u, 1.0); + CeedVectorCreate(ceed, num_nodes, &v); + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + { + CeedScalar sum = 0.0; + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes; i++) sum += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + // Summing 9 reference elements + if (fabs(sum - 1.0 * num_elem) > 5000. 
* FLT_EPSILON) printf("Incorrect area computed, %f != %f\n", sum, 1.0 * num_elem); + } + + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&x_elem); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t592-operator-mixed.c b/tests/t592-operator-mixed.c new file mode 100644 index 0000000000..897c16299b --- /dev/null +++ b/tests/t592-operator-mixed.c @@ -0,0 +1,249 @@ +/// @file +/// Test assembly of mass matrix operator QFunction at points with mixed precision +/// \test Test assembly of mass matrix operator QFunction at points with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t591-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p = 3, q = 5; + CeedInt num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedVector x_points, x_elem, q_data, u, v, qf_assembled; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u, elem_restriction_assembled; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + bool is_at_points; + + CeedInit(argv[1], &ceed); + + // Point reference coordinates + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ?
0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + + // Cell coordinates + { + CeedInt p = 2, num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1); + CeedInt ind_x[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_x + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = p * g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, 1, dim * num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_x); + CeedVectorCreate(ceed, dim * num_nodes, &x_elem); + { + CeedScalar x_array[dim * num_nodes]; + + for (CeedInt i = 0; i <= num_elem_1d; i++) { + for (CeedInt j = 0; j <= num_elem_1d; j++) { + x_array[(i * (num_elem_1d + 1) + j) * dim + 0] = j; + x_array[(i * (num_elem_1d + 1) + j) * dim + 1] = i; + } + } + CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + } + + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + + // Cell solution + { + CeedInt ind_u[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_u + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, &elem_restriction_u); + } + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + + // Setup geometric scaling + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "x", 
elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + CeedOperatorIsAtPoints(op_mass, &is_at_points); + if (!is_at_points) printf("Error: Operator should be at points\n"); + + CeedVectorCreate(ceed, num_nodes, &u); + CeedVectorSetValue(u, 1.0); + CeedVectorCreate(ceed, num_nodes, &v); + + // Assemble QFunction + CeedOperatorSetQFunctionAssemblyReuse(op_mass, true); + CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(op_mass, true); + CeedOperatorLinearAssembleQFunction(op_mass, &qf_assembled, &elem_restriction_assembled, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *assembled_array, *q_data_array; + + CeedVectorGetArrayRead(qf_assembled, CEED_MEM_HOST, &assembled_array); + CeedVectorGetArrayRead(q_data, CEED_MEM_HOST, &q_data_array); + for (CeedInt i = 0; i < num_points; i++) { + if (fabs(q_data_array[i] - assembled_array[i]) > FLT_EPSILON) { + // LCOV_EXCL_START + printf("Error: qf_assembled[%" CeedInt_FMT "] = %f != %f\n", i, assembled_array[i], q_data_array[i]); + // LCOV_EXCL_STOP + } + } + CeedVectorRestoreArrayRead(qf_assembled, &assembled_array); + CeedVectorRestoreArrayRead(q_data, &q_data_array); + } + + // Apply original Mass Operator + CeedVectorSetValue(u, 1.0); + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar area = 0.0; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes; i++) area += v_array[i]; + if (fabs(area - 1.0 * num_elem) > FLT_EPSILON) printf("Error: True operator computed area = %f != 1.0\n", area); + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Switch to new q_data + { + const CeedScalar *assembled_array; + + CeedVectorGetArrayRead(qf_assembled, CEED_MEM_HOST, &assembled_array); + CeedVectorSetArray(q_data, CEED_MEM_HOST, CEED_COPY_VALUES, (CeedScalar *)assembled_array); + CeedVectorRestoreArrayRead(qf_assembled, &assembled_array); + } + + // Apply new Mass Operator + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar area = 0.0; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes; i++) area += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + if (fabs(area - 1.0 * num_elem) > FLT_EPSILON) printf("Error: Linearized operator computed area = %f != 1.0\n", area); + } + + // 
Cleanup + CeedVectorDestroy(&qf_assembled); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&x_elem); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_assembled); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t593-operator-mixed.c b/tests/t593-operator-mixed.c new file mode 100644 index 0000000000..dfd778fe18 --- /dev/null +++ b/tests/t593-operator-mixed.c @@ -0,0 +1,155 @@ +/// @file +/// Test 1D mass matrix operator at points with heterogeneous points per element with mixed precision +/// \test Test 1D mass matrix operator at points with heterogeneous points per element with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t500-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem = 3, dim = 1, p = 3, q = 5; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1, num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p], ind_x_points[num_elem + 1 + num_points]; + CeedScalar x_array_mesh[num_nodes_x], x_array_points[num_points]; + CeedVector x_points = NULL, x_elem = NULL, q_data = NULL, u = NULL, v = NULL; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + bool is_at_points; + + CeedInit(argv[1], &ceed); + + // Mesh coordinates + for (CeedInt i = 0; i < num_nodes_x; i++) x_array_mesh[i] = (CeedScalar)i / (num_nodes_x - 1); + for (CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedVectorCreate(ceed, num_nodes_x, &x_elem); + CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_USE_POINTER, x_array_mesh); + + // U mesh + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = i * (p - 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + // Point reference coordinates + { + CeedScalar weight_tmp[num_points_per_elem + 1]; + CeedInt current_index = 0; + + // Use num_points_per_elem + 1 to test non-uniform quadrature + CeedGaussQuadrature(num_points_per_elem + 1, x_array_points, weight_tmp); + ind_x_points[0] = num_elem + 1; + for (CeedInt p = 0; p < num_points_per_elem + 1; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + // Use num_points_per_elem for middle elements + for (CeedInt e = 1; e < num_elem - 1; e++) { + CeedGaussQuadrature(num_points_per_elem, &x_array_points[current_index], weight_tmp); + ind_x_points[e] = num_elem + 1 + current_index; + for (CeedInt p = 0; p < num_points_per_elem; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + } + // Use num_points_per_elem - 1 to test
non-uniform quadrature + CeedGaussQuadrature(num_points_per_elem - 1, &x_array_points[current_index], weight_tmp); + ind_x_points[num_elem - 1] = num_elem + 1 + current_index; + for (CeedInt p = 0; p < num_points_per_elem - 1; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + ind_x_points[num_elem] = num_elem + 1 + current_index; + + CeedVectorCreate(ceed, num_elem * num_points_per_elem, &x_points); + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_USE_POINTER, x_array_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points, + &elem_restriction_q_data); + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + } + + // Basis creation + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + + // Setup geometric scaling + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + CeedOperatorSetMixedPrecision(op_setup); + + CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + CeedOperatorIsAtPoints(op_mass, &is_at_points); + if (!is_at_points) printf("Error: Operator should be at points\n"); + + CeedVectorCreate(ceed, num_nodes_u, &u); + CeedVectorSetValue(u, 0.0); + CeedVectorCreate(ceed, num_nodes_u, &v); + + // Assemble QFunction + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + if (fabs(v_array[i]) > FLT_EPSILON) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); + } + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Cleanup + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&x_elem); + 
CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t594-operator-mixed.c b/tests/t594-operator-mixed.c new file mode 100644 index 0000000000..fee1b94857 --- /dev/null +++ b/tests/t594-operator-mixed.c @@ -0,0 +1,182 @@ +/// @file +/// Test diagonal assembly of mass matrix operator at points with mixed precision +/// \test Test diagonal assembly of mass matrix operator at points with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t500-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem = 3, dim = 1, p = 3, q = 5; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1, num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p], ind_x_points[num_elem + 1 + num_points]; + CeedScalar x_array_mesh[num_nodes_x], x_array_points[num_points], assembled_true[num_nodes_u]; + CeedVector x_points = NULL, x_elem = NULL, q_data = NULL, u = NULL, v = NULL, assembled = NULL; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + bool is_at_points; + + CeedInit(argv[1], &ceed); + + // Mesh coordinates + for (CeedInt i = 0; i < num_nodes_x; i++) x_array_mesh[i] = (CeedScalar)i / (num_nodes_x - 1); + for (CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedVectorCreate(ceed, num_nodes_x, &x_elem); + CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_USE_POINTER, x_array_mesh); + + // U mesh + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = i * (p - 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + // Point reference coordinates + { + CeedScalar weight_tmp[num_points_per_elem + 1]; + CeedInt current_index = 0; + + // Use num_points_per_elem + 1 to test non-uniform quadrature + CeedGaussQuadrature(num_points_per_elem + 1, x_array_points, weight_tmp); + ind_x_points[0] = num_elem + 1; + for (CeedInt p = 0; p < num_points_per_elem + 1; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + // Use num_points_per_elem for middle elements + for (CeedInt e = 1; e < num_elem - 1; e++) { + CeedGaussQuadrature(num_points_per_elem, &x_array_points[current_index], weight_tmp); + ind_x_points[e] = num_elem + 1 + current_index; + for (CeedInt p = 0; p < num_points_per_elem; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + } + // Use num_points_per_elem - 1 to test non-uniform quadrature + CeedGaussQuadrature(num_points_per_elem - 1, &x_array_points[current_index], weight_tmp); + ind_x_points[num_elem - 1] = num_elem + 1 + current_index; + for (CeedInt p = 0; p < num_points_per_elem - 1; p++, current_index++) { +
ind_x_points[num_elem + 1 + current_index] = current_index; + } + ind_x_points[num_elem] = num_elem + 1 + current_index; + + CeedVectorCreate(ceed, num_elem * num_points_per_elem, &x_points); + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_USE_POINTER, x_array_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points, + &elem_restriction_q_data); + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + } + + // Basis creation + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + + // Setup geometric scaling + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + CeedOperatorIsAtPoints(op_mass, &is_at_points); + if (!is_at_points) printf("Error: Operator should be at points\n"); + + CeedVectorCreate(ceed, num_nodes_u, &u); + CeedVectorSetValue(u, 0.0); + CeedVectorCreate(ceed, num_nodes_u, &v); + + // Assemble diagonal + CeedVectorCreate(ceed, num_nodes_u, &assembled); + CeedOperatorLinearAssembleDiagonal(op_mass, assembled, CEED_REQUEST_IMMEDIATE); + + // Manually assemble diagonal + CeedVectorSetValue(u, 0.0); + for (CeedInt i = 0; i < num_nodes_u; i++) { + CeedScalar *u_array; + const CeedScalar *v_array; + + // Set input + CeedVectorGetArray(u, CEED_MEM_HOST, &u_array); + u_array[i] = 1.0; + if (i) u_array[i - 1] = 0.0; + CeedVectorRestoreArray(u, &u_array); + + // Compute diag entry for DoF i + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Retrieve entry + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + assembled_true[i] = v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Check output + { + const CeedScalar *assembled_array; + + CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, 
&assembled_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + if (fabs(assembled_array[i] - assembled_true[i]) > FLT_EPSILON) { + // LCOV_EXCL_START + printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, assembled_array[i], assembled_true[i]); + // LCOV_EXCL_STOP + } + } + CeedVectorRestoreArrayRead(assembled, &assembled_array); + } + + // Cleanup + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&x_elem); + CeedVectorDestroy(&assembled); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t596-operator-mixed.c b/tests/t596-operator-mixed.c new file mode 100644 index 0000000000..4341d16912 --- /dev/null +++ b/tests/t596-operator-mixed.c @@ -0,0 +1,205 @@ +/// @file +/// Test full assembly of mass matrix operator AtPoints with mixed precision +/// \test Test full assembly of mass matrix operator AtPoints with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t596-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) { + CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, x, x_points, u, v; + CeedInt p = 3, q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt ind_x[num_elem * p * p]; + CeedScalar assembled_values[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar assembled_true[num_comp * num_comp * num_dofs * num_dofs]; + + // Points + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ?
0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Vectors + CeedVectorCreate(ceed, dim * num_dofs, &x); + { + CeedScalar x_array[dim * num_dofs]; + + for (CeedInt i = 0; i < n_x * 2 + 1; i++) { + for (CeedInt j = 0; j < n_y * 2 + 1; j++) { + x_array[i + j * (n_x * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_x); + x_array[i + j * (n_x * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_y); + } + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_comp * num_dofs, &u); + CeedVectorCreate(ceed, num_comp * num_dofs, &v); + CeedVectorCreate(ceed, num_points, &q_data); + + // Restrictions + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt col, row, offset; + col = i % n_x; + row = i / n_x; + offset = col * (p - 1) + row * (n_x * 2 + 1) * (p - 1); + for (CeedInt j = 0; j < p; j++) { + for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (n_x * 2 + 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedElemRestrictionCreate(ceed, num_elem, p * p, num_comp, num_dofs, num_comp * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, + &elem_restriction_u); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis_u); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", num_comp, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", num_comp, CEED_EVAL_INTERP); + { + CeedQFunctionContext qf_context; + + CeedQFunctionContextCreate(ceed, &qf_context); + CeedQFunctionContextSetData(qf_context, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(CeedInt), &num_comp); + CeedQFunctionSetContext(qf_mass, qf_context); + CeedQFunctionContextDestroy(&qf_context); + } + + // Operators + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, 
"rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + // Apply Setup Operator + CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); + + // Fully assemble operator + CeedSize num_entries; + CeedInt *rows; + CeedInt *cols; + CeedVector assembled; + + for (CeedInt k = 0; k < num_comp * num_comp * num_dofs * num_dofs; ++k) { + assembled_values[k] = 0.0; + assembled_true[k] = 0.0; + } + CeedOperatorLinearAssembleSymbolic(op_mass, &num_entries, &rows, &cols); + CeedVectorCreate(ceed, num_entries, &assembled); + CeedOperatorLinearAssemble(op_mass, assembled); + { + const CeedScalar *assembled_array; + + CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array); + for (CeedInt k = 0; k < num_entries; k++) { + assembled_values[rows[k] * num_comp * num_dofs + cols[k]] += assembled_array[k]; + } + CeedVectorRestoreArrayRead(assembled, &assembled_array); + } + + // Manually assemble operator + CeedVectorSetValue(u, 0.0); + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + CeedScalar *u_array; + const CeedScalar *v_array; + + // Set input + CeedVectorGetArray(u, CEED_MEM_HOST, &u_array); + u_array[j] = 1.0; + if (j) u_array[j - 1] = 0.0; + CeedVectorRestoreArray(u, &u_array); + + // Compute entries for column j + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_comp * num_dofs; i++) assembled_true[i * num_comp * num_dofs + j] = v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Check output + for (CeedInt i = 0; i < num_comp * num_dofs; i++) { + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + if (fabs(assembled_values[i * num_dofs * num_comp + j] - assembled_true[i * num_dofs * num_comp + j]) > FLT_EPSILON) { + // LCOV_EXCL_START + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_dofs * num_comp + j], + assembled_true[i * num_dofs * num_comp + j]); + // LCOV_EXCL_STOP + } + } + } + + // Cleanup + free(rows); + free(cols); + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&assembled); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t597-operator-mixed.c b/tests/t597-operator-mixed.c new file mode 100644 index 0000000000..90d36eda69 --- /dev/null +++ b/tests/t597-operator-mixed.c @@ -0,0 +1,206 @@ +/// @file +/// Test full assembly of Poisson operator AtPoints with mixed precision +/// \test Test full assembly of Poisson operator AtPoints with mixed precision +#include +#include +#include +#include +#include + +#include "t597-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) { + 
CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_diff; + CeedOperator op_setup, op_diff; + CeedVector q_data, x, x_points, u, v; + CeedInt p = 3, q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt ind_x[num_elem * p * p]; + CeedScalar assembled_values[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar assembled_true[num_comp * num_comp * num_dofs * num_dofs]; + + // Points + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim * (dim + 1) / 2, num_points * dim * (dim + 1) / 2, CEED_MEM_HOST, + CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Vectors + CeedVectorCreate(ceed, dim * num_dofs, &x); + { + CeedScalar x_array[dim * num_dofs]; + + for (CeedInt i = 0; i < n_x * 2 + 1; i++) { + for (CeedInt j = 0; j < n_y * 2 + 1; j++) { + x_array[i + j * (n_x * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_x); + x_array[i + j * (n_x * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_y); + } + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_comp * num_dofs, &u); + CeedVectorCreate(ceed, num_comp * num_dofs, &v); + CeedVectorCreate(ceed, num_points * dim * (dim + 1) / 2, &q_data); + + // Restrictions + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt col, row, offset; + col = i % n_x; + row = i / n_x; + offset = col * (p - 1) + row * (n_x * 2 + 1) * (p - 1); + for (CeedInt j = 0; j < p; j++) { + for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (n_x * 2 + 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedElemRestrictionCreate(ceed, num_elem, p * p, num_comp, num_dofs, num_comp * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, + &elem_restriction_u); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis_u); + + // QFunction - setup + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_setup, "q data", dim * (dim + 1) / 2, CEED_EVAL_NONE); + + // Operator - setup + CeedOperatorCreateAtPoints(ceed, qf_setup, 
CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "q data", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + // Apply Setup Operator + CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); + + // QFunction - apply + CeedQFunctionCreateInterior(ceed, 1, diff, diff_loc, &qf_diff); + CeedQFunctionAddInput(qf_diff, "du", num_comp * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_diff, "q data", dim * (dim + 1) / 2, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_diff, "dv", num_comp * dim, CEED_EVAL_GRAD); + { + CeedQFunctionContext qf_context; + + CeedQFunctionContextCreate(ceed, &qf_context); + CeedQFunctionContextSetData(qf_context, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(CeedInt), &num_comp); + CeedQFunctionSetContext(qf_diff, qf_context); + CeedQFunctionContextDestroy(&qf_context); + } + + // Operator - apply + CeedOperatorCreateAtPoints(ceed, qf_diff, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff); + CeedOperatorSetField(op_diff, "du", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_diff, "q data", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_diff); + CeedOperatorAtPointsSetPoints(op_diff, elem_restriction_x_points, x_points); + + // Fully assemble operator + CeedSize num_entries; + CeedInt *rows; + CeedInt *cols; + CeedVector assembled; + + for (CeedInt k = 0; k < num_comp * num_comp * num_dofs * num_dofs; ++k) { + assembled_values[k] = 0.0; + assembled_true[k] = 0.0; + } + CeedOperatorLinearAssembleSymbolic(op_diff, &num_entries, &rows, &cols); + CeedVectorCreate(ceed, num_entries, &assembled); + CeedOperatorLinearAssemble(op_diff, assembled); + { + const CeedScalar *assembled_array; + + CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array); + for (CeedInt k = 0; k < num_entries; k++) assembled_values[rows[k] * num_comp * num_dofs + cols[k]] += assembled_array[k]; + CeedVectorRestoreArrayRead(assembled, &assembled_array); + } + + // Manually assemble operator + CeedVectorSetValue(u, 0.0); + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + CeedScalar *u_array; + const CeedScalar *v_array; + + // Set input + CeedVectorGetArray(u, CEED_MEM_HOST, &u_array); + u_array[j] = 1.0; + if (j) u_array[j - 1] = 0.0; + CeedVectorRestoreArray(u, &u_array); + + // Compute entries for column j + CeedOperatorApply(op_diff, u, v, CEED_REQUEST_IMMEDIATE); + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_comp * num_dofs; i++) assembled_true[i * num_comp * num_dofs + j] = v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Check output + for (CeedInt i = 0; i < num_comp * num_dofs; i++) { + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + if (fabs(assembled_values[i * num_comp * num_dofs + j] - assembled_true[i * num_comp * num_dofs + j]) > FLT_EPSILON) { + // LCOV_EXCL_START + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_comp * num_dofs + j], + assembled_true[i * num_comp * num_dofs + j]); + // LCOV_EXCL_STOP + } + } + } + + // Cleanup + 
free(rows); + free(cols); + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&assembled); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_diff); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_diff); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t598-operator-mixed.c b/tests/t598-operator-mixed.c new file mode 100644 index 0000000000..2d8bb2b8a5 --- /dev/null +++ b/tests/t598-operator-mixed.c @@ -0,0 +1,282 @@ +/// @file +/// Test creation, action, and destruction for mass matrix operator AtPoints with mixed precision +/// \test Test creation, action, and destruction for mass matrix operator AtPoints with mixed precision +#include "t591-operator.h" + +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p_coarse = 2, p_fine = 3, q = 5; + CeedInt num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt num_nodes_coarse = (num_elem_1d * (p_coarse - 1) + 1) * (num_elem_1d * (p_coarse - 1) + 1); + CeedInt num_nodes_fine = (num_elem_1d * (p_fine - 1) + 1) * (num_elem_1d * (p_fine - 1) + 1); + CeedVector x_points, x_elem, q_data, u_coarse, u_fine, v_coarse, v_fine, p_mult_fine; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u_coarse, elem_restriction_u_fine; + CeedBasis basis_x, basis_u_coarse, basis_u_fine; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass_coarse, op_mass_fine, op_prolong, op_restrict; + + CeedInit(argv[1], &ceed); + + // Point reference coordinates + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ?
0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + + // Cell coordinates + { + CeedInt p = 2, num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1); + CeedInt ind_x[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_x + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = p * g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, 1, dim * num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_x); + CeedVectorCreate(ceed, dim * num_nodes, &x_elem); + { + CeedScalar x_array[dim * num_nodes]; + + for (CeedInt i = 0; i <= num_elem_1d; i++) { + for (CeedInt j = 0; j <= num_elem_1d; j++) { + x_array[(i * (num_elem_1d + 1) + j) * dim + 0] = j; + x_array[(i * (num_elem_1d + 1) + j) * dim + 1] = i; + } + } + CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + } + + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + + // Cell solution + { + CeedInt ind_u[num_elem * p_coarse * p_coarse]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p_coarse - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p_coarse * p_coarse, *elem_nodes = ind_u + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p_coarse - 1) + r_node % p_coarse) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p_coarse; + } + elem_nodes[n] = g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p_coarse * p_coarse, 1, 1, num_nodes_coarse, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, + &elem_restriction_u_coarse); + } + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_coarse, q, CEED_GAUSS, &basis_u_coarse); + { + CeedInt ind_u[num_elem * p_fine * p_fine]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p_fine - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } 
+ } + CeedInt num_nodes_in_elem = p_fine * p_fine, *elem_nodes = ind_u + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p_fine - 1) + r_node % p_fine) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p_fine; + } + elem_nodes[n] = g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p_fine * p_fine, 1, 1, num_nodes_fine, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, + &elem_restriction_u_fine); + } + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_fine, q, CEED_GAUSS, &basis_u_fine); + + // Setup geometric scaling + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_fine); + CeedOperatorSetField(op_mass_fine, "u", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass_fine, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass_fine, "v", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass_fine); + CeedOperatorAtPointsSetPoints(op_mass_fine, elem_restriction_x_points, x_points); + + CeedVectorCreate(ceed, num_nodes_fine, &u_fine); + CeedVectorCreate(ceed, num_nodes_fine, &v_fine); + CeedVectorCreate(ceed, num_nodes_fine, &p_mult_fine); + CeedVectorCreate(ceed, num_nodes_coarse, &u_coarse); + CeedVectorCreate(ceed, num_nodes_coarse, &v_coarse); + + // Create multigrid level + CeedVectorSetValue(p_mult_fine, 1.0); + CeedOperatorMultigridLevelCreate(op_mass_fine, p_mult_fine, elem_restriction_u_coarse, basis_u_coarse, &op_mass_coarse, &op_prolong, &op_restrict); + + // Coarse problem + CeedVectorSetValue(u_coarse, 1.0); + CeedOperatorApply(op_mass_coarse, u_coarse, v_coarse, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v_coarse, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_coarse; i++) { + sum += v_array[i]; + } + CeedVectorRestoreArrayRead(v_coarse, &v_array); + if (fabs(sum - num_elem) > 1000. 
* FLT_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, num_elem); + } + + // Prolong coarse u + CeedOperatorApply(op_prolong, u_coarse, u_fine, CEED_REQUEST_IMMEDIATE); + + // Fine problem + CeedOperatorApply(op_mass_fine, u_fine, v_fine, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v_fine, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_fine; i++) { + sum += v_array[i]; + } + CeedVectorRestoreArrayRead(v_fine, &v_array); + + if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Fine Grid: %f != True Area: %f\n", sum, num_elem); + } + // Restrict state to coarse grid + CeedOperatorApply(op_restrict, v_fine, v_coarse, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v_coarse, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_coarse; i++) { + sum += v_array[i]; + } + CeedVectorRestoreArrayRead(v_coarse, &v_array); + if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, num_elem); + } + + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&x_elem); + CeedVectorDestroy(&u_coarse); + CeedVectorDestroy(&u_fine); + CeedVectorDestroy(&v_fine); + CeedVectorDestroy(&v_coarse); + CeedVectorDestroy(&p_mult_fine); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_u_coarse); + CeedElemRestrictionDestroy(&elem_restriction_u_fine); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u_coarse); + CeedBasisDestroy(&basis_u_fine); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass_coarse); + CeedOperatorDestroy(&op_mass_fine); + CeedOperatorDestroy(&op_prolong); + CeedOperatorDestroy(&op_restrict); + CeedDestroy(&ceed); + return 0; +} From 766d575ef3435d36abec09e79fac14aec0ba81bd Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Thu, 10 Jul 2025 17:24:01 -0600 Subject: [PATCH 2/5] Tweak tolerances --- interface/ceed-basis.c | 2 +- tests/t360-basis.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c index 9a99ae6611..dd313c8947 100644 --- a/interface/ceed-basis.c +++ b/interface/ceed-basis.c @@ -1303,7 +1303,7 @@ int CeedSymmetricSchurDecomposition(Ceed ceed, CeedScalar *mat, CeedScalar *lamb // Reduce sub and super diagonal CeedInt p = 0, q = 0, itr = 0, max_itr = n * n * n * n; - CeedScalar tol = CEED_EPSILON; + CeedScalar tol = 10 * CEED_EPSILON; while (itr < max_itr) { // Update p, q, size of reduced portions of diagonal diff --git a/tests/t360-basis.c b/tests/t360-basis.c index f953157e1c..5e8a3fbe2b 100644 --- a/tests/t360-basis.c +++ b/tests/t360-basis.c @@ -40,7 +40,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < p_dim; i++) area += v_array[i]; - if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 5E-6) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim)); + if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 1E-5) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim)); CeedVectorRestoreArrayRead(v, &v_array); } From 4be129e8445f86e580171452ffbb7fcb74a2deda Mon Sep 17 00:00:00 2001 From: 
Zach Atkins Date: Thu, 10 Jul 2025 18:18:06 -0600 Subject: [PATCH 3/5] change CEED_EPSILON to true constant to appease bindgen --- include/ceed/ceed-f32.h | 6 +++++- include/ceed/ceed-f64.h | 8 ++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h index 39d2fb1187..9aa5e4a226 100644 --- a/include/ceed/ceed-f32.h +++ b/include/ceed/ceed-f32.h @@ -10,6 +10,10 @@ /// Include this header in ceed.h to use float instead of double. #pragma once +#ifndef CEED_RUNNING_JIT_PASS +#include <float.h> +#endif + #define CEED_SCALAR_IS_FP32 /// Set base scalar type to FP32. (See CeedScalarType enum in ceed.h for all options.) @@ -18,4 +22,4 @@ typedef float CeedScalar; typedef CeedScalar CeedScalarCPU; /// Machine epsilon -#define CEED_EPSILON 0x1p-23 +static const CeedScalar CEED_EPSILON = FLT_EPSILON; diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h index 1e3a7fd7bf..ddfb56e6d3 100644 --- a/include/ceed/ceed-f64.h +++ b/include/ceed/ceed-f64.h @@ -10,6 +10,10 @@ /// This is the default header included in ceed.h. #pragma once +#ifndef CEED_RUNNING_JIT_PASS +#include <float.h> +#endif + #define CEED_SCALAR_IS_FP64 /// Set base scalar type to FP64. (See CeedScalarType enum in ceed.h for all options.) @@ -19,11 +23,11 @@ typedef float CeedScalar; typedef double CeedScalarCPU; /// Machine epsilon -#define CEED_EPSILON 0x1p-23 +static const CeedScalar CEED_EPSILON = FLT_EPSILON; #else typedef double CeedScalar; typedef CeedScalar CeedScalarCPU; /// Machine epsilon -#define CEED_EPSILON 0x1p-52 +static const CeedScalar CEED_EPSILON = DBL_EPSILON; #endif // CEED_RUNNING_JIT_PASS && CEED_JIT_MIXED_PRECISION From 767aa77280a7e59a5aa19f7e2fef044ad0898d8f Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Fri, 11 Jul 2025 15:13:39 -0600 Subject: [PATCH 4/5] Ensure DBL_EPSILON and FLT_EPSILON are defined --- include/ceed/jit-source/cuda/cuda-jit.h | 7 +++++++ include/ceed/jit-source/hip/hip-jit.h | 7 +++++++ include/ceed/jit-source/sycl/sycl-jit.h | 7 +++++++ 3 files changed, 21 insertions(+) diff --git a/include/ceed/jit-source/cuda/cuda-jit.h b/include/ceed/jit-source/cuda/cuda-jit.h index baa15a1e85..22a0e903b2 100644 --- a/include/ceed/jit-source/cuda/cuda-jit.h +++ b/include/ceed/jit-source/cuda/cuda-jit.h @@ -13,4 +13,11 @@ #define CeedPragmaSIMD #define CEED_Q_VLA 1 +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.22044604925031308084726333618164062e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209289550781250000000000000000000e-7F +#endif + #include "cuda-types.h" diff --git a/include/ceed/jit-source/hip/hip-jit.h b/include/ceed/jit-source/hip/hip-jit.h index 70a00416e4..03040e6908 100644 --- a/include/ceed/jit-source/hip/hip-jit.h +++ b/include/ceed/jit-source/hip/hip-jit.h @@ -13,4 +13,11 @@ #define CeedPragmaSIMD #define CEED_Q_VLA 1 +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.22044604925031308084726333618164062e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209289550781250000000000000000000e-7F +#endif + #include "hip-types.h" diff --git a/include/ceed/jit-source/sycl/sycl-jit.h b/include/ceed/jit-source/sycl/sycl-jit.h index 1a2971f4df..b42aa5304f 100644 --- a/include/ceed/jit-source/sycl/sycl-jit.h +++ b/include/ceed/jit-source/sycl/sycl-jit.h @@ -13,5 +13,12 @@ #define CeedPragmaSIMD #define CEED_Q_VLA 1 +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.22044604925031308084726333618164062e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209289550781250000000000000000000e-7F +#endif + // Need quotes for recursive header inclusion #include "sycl-types.h"
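A note on the fallback constants added in PATCH 4/5 above: the decimal literals are the exact values of the IEEE-754 machine epsilons, FLT_EPSILON = 2^-23 and DBL_EPSILON = 2^-52, i.e. the same numbers the 0x1p-23 and 0x1p-52 hex floats removed by PATCH 3/5 encoded. A minimal host-side C check of this equivalence (illustration only, not part of the patch series; assumes IEEE-754 float and double):

#include <float.h>
#include <stdio.h>

int main(void) {
  // Both lines print 1: each long decimal literal parses to exactly 2^-23f / 2^-52,
  // and those are the standard machine epsilons on IEEE-754 platforms.
  printf("%d\n", FLT_EPSILON == 0x1p-23F && 0x1p-23F == 1.19209289550781250000000000000000000e-7F);
  printf("%d\n", DBL_EPSILON == 0x1p-52 && 0x1p-52 == 2.22044604925031308084726333618164062e-16);
  return 0;
}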
inclusion #include "sycl-types.h" From 873a330eff1e7b699f162d895613cf805c07aa61 Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Fri, 11 Jul 2025 16:20:06 -0600 Subject: [PATCH 5/5] Change operator precision to a more flexible interface --- .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 38 +++++++++---------- .../hip-gen/ceed-hip-gen-operator-build.cpp | 4 +- include/ceed-impl.h | 2 +- include/ceed/ceed-f32.h | 10 +++++ include/ceed/ceed-f64.h | 4 +- include/ceed/ceed.h | 4 +- interface/ceed-operator.c | 22 ++++++----- tests/t502-operator-mixed.c | 4 +- tests/t503-operator-mixed.c | 4 +- tests/t505-operator-mixed.c | 4 +- tests/t506-operator-mixed.c | 11 ++++-- tests/t510-operator-mixed.c | 4 +- tests/t520-operator-mixed.c | 8 ++-- tests/t522-operator-mixed.c | 8 ++-- tests/t591-operator-mixed.c | 4 +- tests/t592-operator-mixed.c | 4 +- tests/t593-operator-mixed.c | 4 +- tests/t594-operator-mixed.c | 4 +- tests/t596-operator-mixed.c | 4 +- tests/t597-operator-mixed.c | 4 +- tests/t598-operator-mixed.c | 10 ++--- 21 files changed, 89 insertions(+), 72 deletions(-) diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 59fcf03d42..a0d1553635 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -1572,17 +1572,17 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b // Compile { - bool is_compile_good = false; - const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); - bool use_mixed_precision; + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + CeedScalarType precision; // Check for mixed precision - CeedCallBackend(CeedOperatorGetMixedPrecision(op, &use_mixed_precision)); + CeedCallBackend(CeedOperatorGetPrecision(op, &precision)); data->thread_1d = T_1d; - if (use_mixed_precision) { - CeedCallBackend( - CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", T_1d, "CEED_JIT_MIXED_PRECISION", 1)); + if (precision) { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", T_1d, "CEED_JIT_PRECISION", + (CeedInt)precision)); } else { CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d)); } @@ -2052,18 +2052,18 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo // Compile { - bool is_compile_good = false; - const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); - bool use_mixed_precision; + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + CeedScalarType precision; // Check for mixed precision - CeedCallBackend(CeedOperatorGetMixedPrecision(op, &use_mixed_precision)); + CeedCallBackend(CeedOperatorGetPrecision(op, &precision)); data->thread_1d = T_1d; - if (use_mixed_precision) { + if (precision) { CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", T_1d, - "CEED_JIT_MIXED_PRECISION", 1)); + "CEED_JIT_PRECISION", (CeedInt)precision)); } else { CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, is_full ? 
&data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d)); @@ -2642,17 +2642,17 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera // Compile { - bool is_compile_good = false; - const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); - bool use_mixed_precision; + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + CeedScalarType precision; // Check for mixed precision - CeedCallBackend(CeedOperatorGetMixedPrecision(op, &use_mixed_precision)); + CeedCallBackend(CeedOperatorGetPrecision(op, &precision)); data->thread_1d = T_1d; - if (use_mixed_precision) { + if (precision) { CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", T_1d, - "CEED_JIT_MIXED_PRECISION", 1)); + "CEED_JIT_PRECISION", (CeedInt)precision)); } else { CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d)); } diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp index 674f94b428..f4ed5313d6 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp +++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp @@ -2483,8 +2483,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { - code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" - << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; + code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" + << "dim_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } else { code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } diff --git a/include/ceed-impl.h b/include/ceed-impl.h index e8f6976736..d555444c62 100644 --- a/include/ceed-impl.h +++ b/include/ceed-impl.h @@ -381,7 +381,7 @@ struct CeedOperator_private { bool is_composite; bool is_at_points; bool has_restriction; - bool use_mixed_precision; + CeedScalarType precision; CeedQFunctionAssemblyData qf_assembled; CeedOperatorAssemblyData op_assembled; CeedOperator *sub_operators; diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h index 9aa5e4a226..2382b10c95 100644 --- a/include/ceed/ceed-f32.h +++ b/include/ceed/ceed-f32.h @@ -18,8 +18,18 @@ /// Set base scalar type to FP32. (See CeedScalarType enum in ceed.h for all options.) #define CEED_SCALAR_TYPE CEED_SCALAR_FP32 +#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE) +#if CEED_JIT_PRECISION == CEED_SCALAR_FP64 +typedef double CeedScalar; +typedef float CeedScalarCPU; + +/// Machine epsilon +static const CeedScalar CEED_EPSILON = DBL_EPSILON; +#endif // CEED_JIT_PRECISION +#else typedef float CeedScalar; typedef CeedScalar CeedScalarCPU; /// Machine epsilon static const CeedScalar CEED_EPSILON = FLT_EPSILON; +#endif diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h index ddfb56e6d3..22c7e694f5 100644 --- a/include/ceed/ceed-f64.h +++ b/include/ceed/ceed-f64.h @@ -18,12 +18,14 @@ /// Set base scalar type to FP64. 
(See CeedScalarType enum in ceed.h for all options.) #define CEED_SCALAR_TYPE CEED_SCALAR_FP64 -#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_MIXED_PRECISION) +#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE) +#if CEED_JIT_PRECISION == CEED_SCALAR_FP32 typedef float CeedScalar; typedef double CeedScalarCPU; /// Machine epsilon static const CeedScalar CEED_EPSILON = FLT_EPSILON; +#endif // CEED_JIT_PRECISION #else typedef double CeedScalar; typedef CeedScalar CeedScalarCPU; diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h index 46d22b5dab..fbd5075333 100644 --- a/include/ceed/ceed.h +++ b/include/ceed/ceed.h @@ -427,8 +427,8 @@ CEED_EXTERN int CeedOperatorCheckReady(CeedOperator op); CEED_EXTERN int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size); CEED_EXTERN int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data); CEED_EXTERN int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs_data_update); -CEED_EXTERN int CeedOperatorSetMixedPrecision(CeedOperator op); -CEED_EXTERN int CeedOperatorGetMixedPrecision(CeedOperator op, bool *use_mixed_precision); +CEED_EXTERN int CeedOperatorSetPrecision(CeedOperator op, CeedScalarType precision); +CEED_EXTERN int CeedOperatorGetPrecision(CeedOperator op, CeedScalarType *precision); CEED_EXTERN int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request); CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request); diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c index 6c2a0cc5cd..902f62ffae 100644 --- a/interface/ceed-operator.c +++ b/interface/ceed-operator.c @@ -637,25 +637,27 @@ int CeedOperatorIsSetupDone(CeedOperator op, bool *is_setup_done) { /** - @brief Set a `CeedOperator` to use reduced precision for operator application + @brief Set the precision a `CeedOperator` uses for operator application - @param[in] op `CeedOperator` + @param[in] op `CeedOperator` + @param[in] precision `CeedScalarType` to use for operator application @return An error code: 0 - success, otherwise - failure @ref User **/ -int CeedOperatorSetMixedPrecision(CeedOperator op) { +int CeedOperatorSetPrecision(CeedOperator op, CeedScalarType precision) { bool is_immutable, is_composite, supports_mixed_precision; Ceed ceed; CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsImmutable(op, &is_immutable)); - CeedCheck(!is_immutable, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetMixedPrecision must be called before operator is finalized"); + CeedCheck(!is_immutable, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetPrecision must be called before operator is finalized"); CeedCall(CeedOperatorIsComposite(op, &is_composite)); - CeedCheck(!is_composite, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetMixedPrecision should be set on single operators"); + CeedCheck(!is_composite, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetPrecision must be called on single, non-composite operators"); CeedCall(CeedGetSupportsMixedPrecision(ceed, &supports_mixed_precision)); - CeedCheck(supports_mixed_precision, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement mixed precision operators"); + CeedCheck(precision == CEED_SCALAR_TYPE || supports_mixed_precision, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement mixed precision operators"); - op->use_mixed_precision = true; + op->precision = precision; 
CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -663,15 +665,15 @@ int CeedOperatorSetMixedPrecision(CeedOperator op) { /** - @brief Get whether a `CeedOperator` is set to use reduced precision for operator application + @brief Get the precision a `CeedOperator` is set to use for operator application - @param[in] op `CeedOperator` - @param[out] use_mixed_precision Variable to store `CeedQFunction` + @param[in] op `CeedOperator` + @param[out] precision Variable to store operator precision @return An error code: 0 - success, otherwise - failure @ref User **/ -int CeedOperatorGetMixedPrecision(CeedOperator op, bool *use_mixed_precision) { - *use_mixed_precision = op->use_mixed_precision; +int CeedOperatorGetPrecision(CeedOperator op, CeedScalarType *precision) { + *precision = op->precision; return CEED_ERROR_SUCCESS; } diff --git a/tests/t502-operator-mixed.c b/tests/t502-operator-mixed.c index 4218b82b1d..8a44ee36ed 100644 --- a/tests/t502-operator-mixed.c +++ b/tests/t502-operator-mixed.c @@ -70,13 +70,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); diff --git a/tests/t503-operator-mixed.c b/tests/t503-operator-mixed.c index 7042c7846c..b195b0629e 100644 --- a/tests/t503-operator-mixed.c +++ b/tests/t503-operator-mixed.c @@ -71,13 +71,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, x); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, u); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, v); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // Note - It is atypical to use only passive fields; this test is intended // as a test for all passive input modes rather than as an example. 
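Each of these mixed-precision tests uses the same idiom: the operator requests the precision opposite to the library's native CEED_SCALAR_TYPE, so one test source exercises FP32 compute on a default FP64 build and FP64 compute on an FP32 build. A minimal usage sketch of the interface from this series (the variable name jit_precision is illustrative, not from the patch):

  // Request the non-native precision for one operator. CeedOperatorSetPrecision
  // must be called before the operator is finalized, and requesting the native
  // CEED_SCALAR_TYPE is always allowed, even without backend mixed-precision support.
  CeedScalarType jit_precision = (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32;
  CeedOperatorSetPrecision(op_mass, jit_precision);
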
diff --git a/tests/t505-operator-mixed.c b/tests/t505-operator-mixed.c index 2efe5bd05d..bb2a62fda4 100644 --- a/tests/t505-operator-mixed.c +++ b/tests/t505-operator-mixed.c @@ -69,13 +69,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); diff --git a/tests/t506-operator-mixed.c b/tests/t506-operator-mixed.c index 0069f451ff..8aae3d335a 100644 --- a/tests/t506-operator-mixed.c +++ b/tests/t506-operator-mixed.c @@ -76,26 +76,26 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_small, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_small, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_small, "x", elem_restriction_x, basis_x_small, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_small, "rho", elem_restriction_q_data_small, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup_small); + CeedOperatorSetPrecision(op_setup_small, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_small); CeedOperatorSetField(op_mass_small, "rho", elem_restriction_q_data_small, CEED_BASIS_NONE, q_data_small); CeedOperatorSetField(op_mass_small, "u", elem_restriction_u, basis_u_small, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_small, "v", elem_restriction_u, basis_u_small, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass_small); + CeedOperatorSetPrecision(op_mass_small, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // 'Large' operators CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_large); CeedOperatorSetField(op_setup_large, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_large, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_large, "x", elem_restriction_x, basis_x_large, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_large, "rho", elem_restriction_q_data_large, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup_large); + CeedOperatorSetPrecision(op_setup_large, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? 
CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_large); CeedOperatorSetField(op_mass_large, "rho", elem_restriction_q_data_large, CEED_BASIS_NONE, q_data_large); CeedOperatorSetField(op_mass_large, "u", elem_restriction_u, basis_u_large, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_large, "v", elem_restriction_u, basis_u_large, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass_large); + CeedOperatorSetPrecision(op_mass_large, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // Setup CeedOperatorApply(op_setup_small, x, q_data_small, CEED_REQUEST_IMMEDIATE); diff --git a/tests/t510-operator-mixed.c b/tests/t510-operator-mixed.c index 42853f7165..4fdad2af8e 100644 --- a/tests/t510-operator-mixed.c +++ b/tests/t510-operator-mixed.c @@ -91,13 +91,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); diff --git a/tests/t520-operator-mixed.c b/tests/t520-operator-mixed.c index ea641fb604..7ab5faefa0 100644 --- a/tests/t520-operator-mixed.c +++ b/tests/t520-operator-mixed.c @@ -107,14 +107,14 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_tet, "_weight", CEED_ELEMRESTRICTION_NONE, basis_x_tet, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_tet, "dx", elem_restriction_x_tet, basis_x_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); - CeedOperatorSetMixedPrecision(op_setup_tet); + CeedOperatorSetPrecision(op_setup_tet, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // ---- Mass Tet CeedOperatorCreate(ceed, qf_mass_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_tet); CeedOperatorSetField(op_mass_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); CeedOperatorSetField(op_mass_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetName(op_mass_tet, "mass tet"); - CeedOperatorSetMixedPrecision(op_mass_tet); + CeedOperatorSetPrecision(op_mass_tet, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? 
CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // Set up Hex Elements // -- Restrictions @@ -153,14 +153,14 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_hex, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_hex, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_hex, "dx", elem_restriction_x_hex, basis_x_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); - CeedOperatorSetMixedPrecision(op_setup_hex); + CeedOperatorSetPrecision(op_setup_hex, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_hex); CeedOperatorSetField(op_mass_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); CeedOperatorSetField(op_mass_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetName(op_mass_hex, "mass hex"); - CeedOperatorSetMixedPrecision(op_mass_hex); + CeedOperatorSetPrecision(op_mass_hex, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // Set up Composite Operators // -- Create diff --git a/tests/t522-operator-mixed.c b/tests/t522-operator-mixed.c index 071c979790..3478107f57 100644 --- a/tests/t522-operator-mixed.c +++ b/tests/t522-operator-mixed.c @@ -109,13 +109,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_tet, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_tet, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_tet, "dx", elem_restriction_x_tet, basis_x_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); - CeedOperatorSetMixedPrecision(op_setup_tet); + CeedOperatorSetPrecision(op_setup_tet, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // ---- Diff Tet CeedOperatorCreate(ceed, qf_diff_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_tet); CeedOperatorSetField(op_diff_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); CeedOperatorSetField(op_diff_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_diff_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_diff_tet); + CeedOperatorSetPrecision(op_diff_tet, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // Hex Elements // -- Restrictions @@ -155,13 +155,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_hex, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_hex, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_hex, "dx", elem_restriction_x_hex, basis_x_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); - CeedOperatorSetMixedPrecision(op_setup_hex); + CeedOperatorSetPrecision(op_setup_hex, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? 
CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_diff_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_hex); CeedOperatorSetField(op_diff_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); CeedOperatorSetField(op_diff_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_diff_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_diff_hex); + CeedOperatorSetPrecision(op_diff_hex, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // Composite Operators CeedCompositeOperatorCreate(ceed, &op_setup); diff --git a/tests/t591-operator-mixed.c b/tests/t591-operator-mixed.c index d0bc270977..068d029196 100644 --- a/tests/t591-operator-mixed.c +++ b/tests/t591-operator-mixed.c @@ -140,7 +140,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -155,7 +155,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); diff --git a/tests/t592-operator-mixed.c b/tests/t592-operator-mixed.c index 897c16299b..d80811f60a 100644 --- a/tests/t592-operator-mixed.c +++ b/tests/t592-operator-mixed.c @@ -140,7 +140,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -155,7 +155,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? 
CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); diff --git a/tests/t593-operator-mixed.c b/tests/t593-operator-mixed.c index dfd778fe18..de49c38abf 100644 --- a/tests/t593-operator-mixed.c +++ b/tests/t593-operator-mixed.c @@ -95,7 +95,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -109,7 +109,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); diff --git a/tests/t594-operator-mixed.c b/tests/t594-operator-mixed.c index fee1b94857..e802ab893b 100644 --- a/tests/t594-operator-mixed.c +++ b/tests/t594-operator-mixed.c @@ -94,7 +94,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -109,7 +109,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); diff --git a/tests/t596-operator-mixed.c b/tests/t596-operator-mixed.c index 4341d16912..a9b0cc039d 100644 --- a/tests/t596-operator-mixed.c +++ b/tests/t596-operator-mixed.c @@ -112,14 +112,14 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? 
CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); // Apply Setup Operator diff --git a/tests/t597-operator-mixed.c b/tests/t597-operator-mixed.c index 90d36eda69..105a963d3e 100644 --- a/tests/t597-operator-mixed.c +++ b/tests/t597-operator-mixed.c @@ -100,7 +100,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "q data", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); // Apply Setup Operator @@ -125,7 +125,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_diff, "du", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_diff, "q data", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_diff); + CeedOperatorSetPrecision(op_diff, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_diff, elem_restriction_x_points, x_points); // Fully assemble operator diff --git a/tests/t598-operator-mixed.c b/tests/t598-operator-mixed.c index 2d8bb2b8a5..08e8bb0e38 100644 --- a/tests/t598-operator-mixed.c +++ b/tests/t598-operator-mixed.c @@ -174,7 +174,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -189,7 +189,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass_fine, "u", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_fine, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass_fine, "v", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass_fine); + CeedOperatorSetPrecision(op_mass_fine, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? 
CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_mass_fine, elem_restriction_x_points, x_points); CeedVectorCreate(ceed, num_nodes_fine, &u_fine); @@ -216,7 +216,7 @@ int main(int argc, char **argv) { sum += v_array[i]; } CeedVectorRestoreArrayRead(v_coarse, &v_array); - if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, num_elem); + if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, (float)num_elem); } // Prolong coarse u @@ -236,7 +236,7 @@ int main(int argc, char **argv) { } CeedVectorRestoreArrayRead(v_fine, &v_array); - if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Fine Grid: %f != True Area: %f\n", sum, num_elem); + if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Fine Grid: %f != True Area: %f\n", sum, (float)num_elem); } // Restrict state to coarse grid CeedOperatorApply(op_restrict, v_fine, v_coarse, CEED_REQUEST_IMMEDIATE); @@ -251,7 +251,7 @@ int main(int argc, char **argv) { sum += v_array[i]; } CeedVectorRestoreArrayRead(v_coarse, &v_array); - if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, num_elem); + if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, (float)num_elem); } CeedVectorDestroy(&x_points);
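
Taken together, the series defines a small contract between the backends and the JIT-ed scalar headers: the *-gen backends pass CEED_JIT_PRECISION as the integer value of the requested CeedScalarType, and ceed-f32.h/ceed-f64.h switch CeedScalar to the requested width while CeedScalarCPU keeps the native one. A self-contained preprocessor sketch of the ceed-f64.h selection follows; the numeric macro values are illustrative assumptions only, and note that an identifier unknown to the preprocessor evaluates to 0 inside #if, so the CEED_SCALAR_FP32/FP64 names must be visible as numeric values during the JIT pass for the comparison to be meaningful, which the sketch makes explicit.

  /* Illustrative stand-ins for what the JIT compile step would provide. */
  #define CEED_SCALAR_FP32 0                  /* assumed CeedScalarType ordering */
  #define CEED_SCALAR_FP64 1
  #define CEED_SCALAR_TYPE CEED_SCALAR_FP64   /* native FP64 build */
  #define CEED_RUNNING_JIT_PASS 1
  #define CEED_JIT_PRECISION CEED_SCALAR_FP32 /* backend requested FP32 */

  #if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE)
  typedef float  CeedScalar;    /* QFunction arithmetic runs in reduced precision */
  typedef double CeedScalarCPU; /* data passed from the host keeps the native width */
  #else
  typedef double CeedScalar;
  typedef CeedScalar CeedScalarCPU;
  #endif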