Skip to content

gen - add support for mixed precision operators #1853

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 51 additions & 23 deletions backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1285,7 +1285,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
code << tab << "// -----------------------------------------------------------------------------\n";
code << tab << "extern \"C\" __global__ void " << operator_name
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda "
"points) {\n";
tab.push();

Expand All @@ -1295,11 +1295,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b

CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
code << tab << "const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
}
}
for (CeedInt i = 0; i < num_output_fields; i++) {
code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
code << tab << "CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
}

code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
Expand Down Expand Up @@ -1572,11 +1572,20 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b

// Compile
{
bool is_compile_good = false;
const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
bool is_compile_good = false;
const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
CeedScalarType precision;

// Check for mixed precision
CeedCallBackend(CeedOperatorGetPrecision(op, &precision));

data->thread_1d = T_1d;
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d));
if (precision) {
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", T_1d, "CEED_JIT_PRECISION",
(CeedInt)precision));
} else {
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d));
}
if (is_compile_good) {
*is_good_build = true;
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
Expand Down Expand Up @@ -1689,8 +1698,8 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
code << tab << "// -----------------------------------------------------------------------------\n";
code << tab << "extern \"C\" __global__ void " << operator_name
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
"points, CeedScalar *__restrict__ values_array) {\n";
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda "
"points, CeedScalarCPU *__restrict__ values_array) {\n";
tab.push();

// Scratch buffers
Expand All @@ -1699,11 +1708,11 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo

CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
code << tab << "const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
}
}
for (CeedInt i = 0; i < num_output_fields; i++) {
code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
code << tab << "CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
}

code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
Expand Down Expand Up @@ -2043,12 +2052,22 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo

// Compile
{
bool is_compile_good = false;
const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
bool is_compile_good = false;
const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
CeedScalarType precision;

// Check for mixed precision
CeedCallBackend(CeedOperatorGetPrecision(op, &precision));

data->thread_1d = T_1d;
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good,
is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d));
if (precision) {
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good,
is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", T_1d,
"CEED_JIT_PRECISION", (CeedInt)precision));
} else {
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good,
is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d));
}
if (is_compile_good) {
*is_good_build = true;
CeedCallBackend(CeedGetKernel_Cuda(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
Expand Down Expand Up @@ -2221,8 +2240,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
code << tab << "// -----------------------------------------------------------------------------\n";
code << tab << "extern \"C\" __global__ void " << operator_name
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
"points, CeedScalar *__restrict__ values_array) {\n";
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda "
"points, CeedScalarCPU *__restrict__ values_array) {\n";
tab.push();

// Scratch buffers
Expand All @@ -2231,11 +2250,11 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera

CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
code << tab << "const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
}
}
for (CeedInt i = 0; i < num_output_fields; i++) {
code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
code << tab << "CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
}

code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
Expand Down Expand Up @@ -2485,8 +2504,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
if (eval_mode == CEED_EVAL_GRAD) {
code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
<< (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*"
<< "dim_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
} else {
code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
}
Expand Down Expand Up @@ -2623,11 +2642,20 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera

// Compile
{
bool is_compile_good = false;
const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
bool is_compile_good = false;
const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
CeedScalarType precision;

// Check for mixed precision
CeedCallBackend(CeedOperatorGetPrecision(op, &precision));

data->thread_1d = T_1d;
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d));
if (precision) {
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", T_1d,
"CEED_JIT_PRECISION", (CeedInt)precision));
} else {
CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d));
}
if (is_compile_good) {
*is_good_build = true;
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
Expand Down
1 change: 1 addition & 0 deletions backends/cuda-gen/ceed-cuda-gen.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
CeedCallBackend(CeedCalloc(1, &data));
CeedCallBackend(CeedSetData(ceed, data));
CeedCallBackend(CeedInit_Cuda(ceed, resource));
CeedCallBackend(CeedSetSupportsMixedPrecision(ceed, true));

CeedCallBackend(CeedInit("/gpu/cuda/shared", &ceed_shared));
CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
Expand Down
4 changes: 2 additions & 2 deletions backends/hip-gen/ceed-hip-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2483,8 +2483,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat
CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
if (eval_mode == CEED_EVAL_GRAD) {
code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
<< (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*"
<< "dim_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
} else {
code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
}
Expand Down
2 changes: 2 additions & 0 deletions include/ceed-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ struct Ceed_private {
bool is_debug;
bool has_valid_op_fallback_resource;
bool is_deterministic;
bool supports_mixed_precision;
char err_msg[CEED_MAX_RESOURCE_LEN];
FOffset *f_offsets;
CeedWorkVectors work_vectors;
Expand Down Expand Up @@ -380,6 +381,7 @@ struct CeedOperator_private {
bool is_composite;
bool is_at_points;
bool has_restriction;
CeedScalarType precision;
CeedQFunctionAssemblyData qf_assembled;
CeedOperatorAssemblyData op_assembled;
CeedOperator *sub_operators;
Expand Down
1 change: 1 addition & 0 deletions include/ceed/backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ CEED_EXTERN int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource
CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed);
CEED_EXTERN int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource);
CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic);
CEED_INTERN int CeedSetSupportsMixedPrecision(Ceed ceed, bool supports_mixed_precision);
CEED_EXTERN int CeedSetBackendFunctionImpl(Ceed ceed, const char *type, void *object, const char *func_name, void (*f)(void));
CEED_EXTERN int CeedGetData(Ceed ceed, void *data);
CEED_EXTERN int CeedSetData(Ceed ceed, void *data);
Expand Down
19 changes: 17 additions & 2 deletions include/ceed/ceed-f32.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,26 @@
/// Include this header in ceed.h to use float instead of double.
#pragma once

#ifndef CEED_RUNNING_JIT_PASS
#include <float.h>
#endif

#define CEED_SCALAR_IS_FP32

/// Set base scalar type to FP32. (See CeedScalarType enum in ceed.h for all options.)
#define CEED_SCALAR_TYPE CEED_SCALAR_FP32
typedef float CeedScalar;
#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE)
#if CEED_JIT_PRECISION == CEED_SCALAR_FP64
typedef double CeedScalar;
typedef float CeedScalarCPU;

/// Machine epsilon
static const CeedScalar CEED_EPSILON = DBL_EPSILON;
#endif // CEED_JIT_PRECISION
#else
typedef float CeedScalar;
typedef CeedScalar CeedScalarCPU;

/// Machine epsilon
#define CEED_EPSILON 6e-08
static const CeedScalar CEED_EPSILON = FLT_EPSILON;
#endif
19 changes: 17 additions & 2 deletions include/ceed/ceed-f64.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,26 @@
/// This is the default header included in ceed.h.
#pragma once

#ifndef CEED_RUNNING_JIT_PASS
#include <float.h>
#endif

#define CEED_SCALAR_IS_FP64

/// Set base scalar type to FP64. (See CeedScalarType enum in ceed.h for all options.)
#define CEED_SCALAR_TYPE CEED_SCALAR_FP64
typedef double CeedScalar;
#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE)
#if CEED_JIT_PRECISION == CEED_SCALAR_FP32
typedef float CeedScalar;
typedef double CeedScalarCPU;

/// Machine epsilon
static const CeedScalar CEED_EPSILON = FLT_EPSILON;
#endif // CEED_JIT_PRECISION
#else
typedef double CeedScalar;
typedef CeedScalar CeedScalarCPU;

/// Machine epsilon
#define CEED_EPSILON 1e-16
static const CeedScalar CEED_EPSILON = DBL_EPSILON;
#endif // CEED_RUNNING_JIT_PASS && CEED_JIT_PRECISION
3 changes: 3 additions & 0 deletions include/ceed/ceed.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ CEED_EXTERN int CeedSetStream(Ceed ceed, void *handle);
CEED_EXTERN int CeedReferenceCopy(Ceed ceed, Ceed *ceed_copy);
CEED_EXTERN int CeedGetResource(Ceed ceed, const char **resource);
CEED_EXTERN int CeedIsDeterministic(Ceed ceed, bool *is_deterministic);
CEED_EXTERN int CeedGetSupportsMixedPrecision(Ceed ceed, bool *supports_mixed_precision);
CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root);
CEED_EXTERN int CeedAddJitDefine(Ceed ceed, const char *jit_define);
CEED_EXTERN int CeedView(Ceed ceed, FILE *stream);
Expand Down Expand Up @@ -426,6 +427,8 @@ CEED_EXTERN int CeedOperatorCheckReady(CeedOperator op);
CEED_EXTERN int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size);
CEED_EXTERN int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data);
CEED_EXTERN int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs_data_update);
CEED_EXTERN int CeedOperatorSetPrecision(CeedOperator op, CeedScalarType precision);
CEED_EXTERN int CeedOperatorGetPrecision(CeedOperator op, CeedScalarType *precision);
CEED_EXTERN int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request);
CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
CeedRequest *request);
Expand Down
Loading