From 4b254d5a5e68784e43d89bda8df363baa8a18280 Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Thu, 10 Jul 2025 17:08:05 -0600 Subject: [PATCH 1/5] gen - add support for mixed precision CUDA operators --- .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 62 ++-- backends/cuda-gen/ceed-cuda-gen.c | 1 + include/ceed-impl.h | 2 + include/ceed/backend.h | 1 + include/ceed/ceed-f32.h | 5 +- include/ceed/ceed-f64.h | 13 +- include/ceed/ceed.h | 3 + .../ceed/jit-source/cuda/cuda-gen-templates.h | 143 +++++---- .../cuda-shared-basis-nontensor-templates.h | 33 +- .../cuda/cuda-shared-basis-nontensor.h | 24 +- .../cuda-shared-basis-read-write-templates.h | 50 ++-- ...-shared-basis-tensor-at-points-templates.h | 72 ++--- .../cuda/cuda-shared-basis-tensor-at-points.h | 34 ++- ...-shared-basis-tensor-flattened-templates.h | 162 +++++----- .../cuda/cuda-shared-basis-tensor-templates.h | 152 +++++----- .../cuda/cuda-shared-basis-tensor.h | 25 +- include/ceed/jit-source/cuda/cuda-types.h | 12 +- interface/ceed-operator.c | 45 ++- interface/ceed.c | 30 ++ tests/t502-operator-mixed.c | 125 ++++++++ tests/t503-operator-mixed.c | 114 +++++++ tests/t505-operator-mixed.c | 128 ++++++++ tests/t506-operator-mixed.c | 173 +++++++++++ tests/t510-operator-mixed.c | 133 +++++++++ tests/t520-operator-mixed.c | 234 +++++++++++++++ tests/t522-operator-mixed.c | 221 ++++++++++++++ tests/t591-operator-mixed.c | 197 ++++++++++++ tests/t592-operator-mixed.c | 249 ++++++++++++++++ tests/t593-operator-mixed.c | 155 ++++++++++ tests/t594-operator-mixed.c | 182 +++++++++++ tests/t596-operator-mixed.c | 205 +++++++++++++ tests/t597-operator-mixed.c | 206 +++++++++++++ tests/t598-operator-mixed.c | 282 ++++++++++++++++++ 33 files changed, 3094 insertions(+), 379 deletions(-) create mode 100644 tests/t502-operator-mixed.c create mode 100644 tests/t503-operator-mixed.c create mode 100644 tests/t505-operator-mixed.c create mode 100644 tests/t506-operator-mixed.c create mode 100644 tests/t510-operator-mixed.c create mode 100644 tests/t520-operator-mixed.c create mode 100644 tests/t522-operator-mixed.c create mode 100644 tests/t591-operator-mixed.c create mode 100644 tests/t592-operator-mixed.c create mode 100644 tests/t593-operator-mixed.c create mode 100644 tests/t594-operator-mixed.c create mode 100644 tests/t596-operator-mixed.c create mode 100644 tests/t597-operator-mixed.c create mode 100644 tests/t598-operator-mixed.c diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 481c358466..59fcf03d42 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -1285,7 +1285,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; code << tab << "// -----------------------------------------------------------------------------\n"; code << tab << "extern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda " "points) {\n"; tab.push(); @@ -1295,11 +1295,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) 
{ // Skip CEED_EVAL_WEIGHT - code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + code << tab << "const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; } } for (CeedInt i = 0; i < num_output_fields; i++) { - code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + code << tab << "CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; } code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; @@ -1574,9 +1574,18 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b { bool is_compile_good = false; const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + bool use_mixed_precision; + + // Check for mixed precision + CeedCallBackend(CeedOperatorGetMixedPrecision(op, &use_mixed_precision)); data->thread_1d = T_1d; - CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d)); + if (use_mixed_precision) { + CeedCallBackend( + CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", T_1d, "CEED_JIT_MIXED_PRECISION", 1)); + } else { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d)); + } if (is_compile_good) { *is_good_build = true; CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); @@ -1689,8 +1698,8 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; code << tab << "// -----------------------------------------------------------------------------\n"; code << tab << "extern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " - "points, CeedScalar *__restrict__ values_array) {\n"; + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda " + "points, CeedScalarCPU *__restrict__ values_array) {\n"; tab.push(); // Scratch buffers @@ -1699,11 +1708,11 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT - code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + code << tab << "const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; } } for (CeedInt i = 0; i < num_output_fields; i++) { - code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + code << tab << "CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; } code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; @@ -2045,10 +2054,20 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo { bool is_compile_good = false; const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + bool use_mixed_precision; + + // Check for mixed precision + CeedCallBackend(CeedOperatorGetMixedPrecision(op, &use_mixed_precision)); data->thread_1d = T_1d; - CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, - is_full ? 
&data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d)); + if (use_mixed_precision) { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, + is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", T_1d, + "CEED_JIT_MIXED_PRECISION", 1)); + } else { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, + is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d)); + } if (is_compile_good) { *is_good_build = true; CeedCallBackend(CeedGetKernel_Cuda(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(), @@ -2221,8 +2240,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; code << tab << "// -----------------------------------------------------------------------------\n"; code << tab << "extern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " - "points, CeedScalar *__restrict__ values_array) {\n"; + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarCPU *W, Points_Cuda " + "points, CeedScalarCPU *__restrict__ values_array) {\n"; tab.push(); // Scratch buffers @@ -2231,11 +2250,11 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT - code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + code << tab << "const CeedScalarCPU *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; } } for (CeedInt i = 0; i < num_output_fields; i++) { - code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + code << tab << "CeedScalarCPU *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; } code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; @@ -2485,8 +2504,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { - code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" - << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; + code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" + << "dim_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } else { code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } @@ -2625,9 +2644,18 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera { bool is_compile_good = false; const CeedInt T_1d = CeedIntMax(is_all_tensor ? 
Q_1d : Q, data->max_P_1d); + bool use_mixed_precision; + + // Check for mixed precision + CeedCallBackend(CeedOperatorGetMixedPrecision(op, &use_mixed_precision)); data->thread_1d = T_1d; - CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d)); + if (use_mixed_precision) { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", T_1d, + "CEED_JIT_MIXED_PRECISION", 1)); + } else { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d)); + } if (is_compile_good) { *is_good_build = true; CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction)); diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c index f38b700225..ee5068e72c 100644 --- a/backends/cuda-gen/ceed-cuda-gen.c +++ b/backends/cuda-gen/ceed-cuda-gen.c @@ -29,6 +29,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) { CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Cuda(ceed, resource)); + CeedCallBackend(CeedSetSupportsMixedPrecision(ceed, true)); CeedCallBackend(CeedInit("/gpu/cuda/shared", &ceed_shared)); CeedCallBackend(CeedSetDelegate(ceed, ceed_shared)); diff --git a/include/ceed-impl.h b/include/ceed-impl.h index 95c920604d..e8f6976736 100644 --- a/include/ceed-impl.h +++ b/include/ceed-impl.h @@ -128,6 +128,7 @@ struct Ceed_private { bool is_debug; bool has_valid_op_fallback_resource; bool is_deterministic; + bool supports_mixed_precision; char err_msg[CEED_MAX_RESOURCE_LEN]; FOffset *f_offsets; CeedWorkVectors work_vectors; @@ -380,6 +381,7 @@ struct CeedOperator_private { bool is_composite; bool is_at_points; bool has_restriction; + bool use_mixed_precision; CeedQFunctionAssemblyData qf_assembled; CeedOperatorAssemblyData op_assembled; CeedOperator *sub_operators; diff --git a/include/ceed/backend.h b/include/ceed/backend.h index e6b608e571..d4bfa22f35 100644 --- a/include/ceed/backend.h +++ b/include/ceed/backend.h @@ -250,6 +250,7 @@ CEED_EXTERN int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed); CEED_EXTERN int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource); CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic); +CEED_INTERN int CeedSetSupportsMixedPrecision(Ceed ceed, bool supports_mixed_precision); CEED_EXTERN int CeedSetBackendFunctionImpl(Ceed ceed, const char *type, void *object, const char *func_name, void (*f)(void)); CEED_EXTERN int CeedGetData(Ceed ceed, void *data); CEED_EXTERN int CeedSetData(Ceed ceed, void *data); diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h index 0bce734257..39d2fb1187 100644 --- a/include/ceed/ceed-f32.h +++ b/include/ceed/ceed-f32.h @@ -14,7 +14,8 @@ /// Set base scalar type to FP32. (See CeedScalarType enum in ceed.h for all options.) 
#define CEED_SCALAR_TYPE CEED_SCALAR_FP32

-typedef float CeedScalar;
+typedef float      CeedScalar;
+typedef CeedScalar CeedScalarCPU;

 /// Machine epsilon
-#define CEED_EPSILON 6e-08
+#define CEED_EPSILON 0x1p-23
diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h
index b74d867c18..1e3a7fd7bf 100644
--- a/include/ceed/ceed-f64.h
+++ b/include/ceed/ceed-f64.h
@@ -14,7 +14,16 @@
 /// Set base scalar type to FP64. (See CeedScalarType enum in ceed.h for all options.)
 #define CEED_SCALAR_TYPE CEED_SCALAR_FP64

-typedef double CeedScalar;
+#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_MIXED_PRECISION)
+typedef float  CeedScalar;
+typedef double CeedScalarCPU;

 /// Machine epsilon
-#define CEED_EPSILON 1e-16
+#define CEED_EPSILON 0x1p-23
+#else
+typedef double     CeedScalar;
+typedef CeedScalar CeedScalarCPU;
+
+/// Machine epsilon
+#define CEED_EPSILON 0x1p-52
+#endif // CEED_RUNNING_JIT_PASS && CEED_JIT_MIXED_PRECISION
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index af510065eb..46d22b5dab 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -106,6 +106,7 @@ CEED_EXTERN int CeedSetStream(Ceed ceed, void *handle);
 CEED_EXTERN int CeedReferenceCopy(Ceed ceed, Ceed *ceed_copy);
 CEED_EXTERN int CeedGetResource(Ceed ceed, const char **resource);
 CEED_EXTERN int CeedIsDeterministic(Ceed ceed, bool *is_deterministic);
+CEED_EXTERN int CeedGetSupportsMixedPrecision(Ceed ceed, bool *supports_mixed_precision);
 CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root);
 CEED_EXTERN int CeedAddJitDefine(Ceed ceed, const char *jit_define);
 CEED_EXTERN int CeedView(Ceed ceed, FILE *stream);
@@ -426,6 +427,8 @@ CEED_EXTERN int CeedOperatorCheckReady(CeedOperator op);
 CEED_EXTERN int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size);
 CEED_EXTERN int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data);
 CEED_EXTERN int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs_data_update);
+CEED_EXTERN int CeedOperatorSetMixedPrecision(CeedOperator op);
+CEED_EXTERN int CeedOperatorGetMixedPrecision(CeedOperator op, bool *use_mixed_precision);
 CEED_EXTERN int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request);
 CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
                                                                  CeedRequest *request);
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index f4dccf54ea..d78e83eeab 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -12,8 +12,8 @@
 //------------------------------------------------------------------------------
 // Load matrices for basis actions
 //------------------------------------------------------------------------------
-template <int P, int Q>
-inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+template <int P, int Q, class ScalarIn, class ScalarOut>
+inline __device__ void LoadMatrix(SharedData_Cuda &data, const ScalarIn *__restrict__ d_B, ScalarOut *B) {
   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }

@@ -24,9 +24,9 @@
 //------------------------------------------------------------------------------
 // L-vector -> single point
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int COMP_STRIDE>
+template <int NUM_COMP, int NUM_PTS, int COMP_STRIDE, class ScalarIn, class ScalarOut>
 inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
-                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+                                 const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) {
   const CeedInt ind = indices[p + elem * NUM_PTS];

   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -37,9 +37,9 @@ inline __device__ void ReadPoint(SharedData_Cuda &data, cons
 //------------------------------------------------------------------------------
 // Single point -> L-vector
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int COMP_STRIDE>
+template <int NUM_COMP, int NUM_PTS, int COMP_STRIDE, class ScalarIn, class ScalarOut>
 inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
-                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
+                                  const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_u, ScalarOut *d_u) {
   if (p < points_in_elem) {
     const CeedInt ind = indices[p + elem * NUM_PTS];

@@ -56,8 +56,8 @@ inline __device__ void WritePoint(SharedData_Cuda &data, con
 //------------------------------------------------------------------------------
 // Set E-vector value
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
-inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
+inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const ScalarIn value, ScalarOut *__restrict__ r_v) {
   const CeedInt target_comp = n / P_1D;
   const CeedInt target_node = n % P_1D;

@@ -69,9 +69,9 @@ inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const Cee
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+                                          const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = indices[node + elem * P_1D];
@@ -83,9 +83,8 @@ inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt n
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
-                                         CeedScalar *__restrict__ r_u) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -97,9 +97,9 @@ inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt el
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                           const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = indices[node + elem * P_1D];
@@ -108,10 +107,10 @@
   }
 }

-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
-                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
-                                                  CeedScalar *__restrict__ d_v) {
+                                                  const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_v,
+                                                  ScalarOut *__restrict__ d_v) {
   const CeedInt target_comp = n / P_1D;
   const CeedInt target_node = n % P_1D;

@@ -125,9 +124,9 @@ inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const C
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, full assembly
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
-                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                                    const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   const CeedInt in_comp = in / P_1D;
   const CeedInt in_node = in % P_1D;
   const CeedInt e_vec_size = P_1D * NUM_COMP;
@@ -144,9 +143,9 @@ inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, Qfunction assembly
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D>
+template <int NUM_COMP, int Q_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
-                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                                      const CeedInt output_offset, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < Q_1D) {
     const CeedInt ind = data.t_id_x + elem * Q_1D;

@@ -159,9 +158,8 @@ inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, con
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
-                                          CeedScalar *__restrict__ d_v) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -177,8 +175,8 @@ inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt e
 //------------------------------------------------------------------------------
 // Set E-vector value
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
-inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
+inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const ScalarIn value, ScalarOut *__restrict__ r_v) {
   const CeedInt target_comp = n / (P_1D * P_1D);
   const CeedInt target_node_x = n % P_1D;
   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
@@ -191,9 +189,9 @@ inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const Cee
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+                                          const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = indices[node + elem * P_1D * P_1D];
@@ -205,9 +203,8 @@ inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt n
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
-                                         CeedScalar *__restrict__ r_u) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -219,9 +216,9 @@ inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt el
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                           const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = indices[node + elem * P_1D * P_1D];
@@ -230,10 +227,10 @@
   }
 }

-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
-                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
-                                                  CeedScalar *__restrict__ d_v) {
+                                                  const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_v,
+                                                  ScalarOut *__restrict__ d_v) {
   const CeedInt target_comp = n / (P_1D * P_1D);
   const CeedInt target_node_x = n % P_1D;
   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
@@ -249,9 +246,9 @@
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, full assembly
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
-                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                                    const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   const CeedInt elem_size = P_1D * P_1D;
   const CeedInt in_comp = in / elem_size;
   const CeedInt in_node_x = in % P_1D;
@@ -273,9 +270,9 @@ inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, Qfunction assembly
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D>
+template <int NUM_COMP, int Q_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
-                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                                      const CeedInt output_offset, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;

@@ -288,9 +285,8 @@ inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, con
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
-                                          CeedScalar *__restrict__ d_v) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -306,8 +302,8 @@ inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt e
 //------------------------------------------------------------------------------
 // Set E-vector value
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
-inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
+inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const ScalarIn value, ScalarOut *__restrict__ r_v) {
   const CeedInt target_comp = n / (P_1D * P_1D * P_1D);
   const CeedInt target_node_x = n % P_1D;
   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
@@ -321,9 +317,9 @@ inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const Cee
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+                                          const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -337,9 +333,8 @@ inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt n
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
-                                         CeedScalar *__restrict__ r_u) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -353,10 +348,9 @@ inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt el
 //------------------------------------------------------------------------------
 // E-vector -> Q-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
+template <int NUM_COMP, int COMP_STRIDE, int Q_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
-                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
-                                               CeedScalar *__restrict__ r_u) {
+                                               const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
     const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D];
@@ -368,9 +362,9 @@ inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // E-vector -> Q-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
-                                              CeedScalar *__restrict__ r_u) {
+template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const ScalarIn *__restrict__ d_u,
+                                              ScalarOut *__restrict__ r_u) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
@@ -382,9 +376,9 @@ inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedI
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                           const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -395,10 +389,10 @@
   }
 }

-template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+template <int NUM_COMP, int COMP_STRIDE, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
-                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
-                                                  CeedScalar *__restrict__ d_v) {
+                                                  const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_v,
+                                                  ScalarOut *__restrict__ d_v) {
   const CeedInt target_comp = n / (P_1D * P_1D * P_1D);
   const CeedInt target_node_x = n % P_1D;
   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
@@ -415,9 +409,9 @@ inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const C
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, full assembly
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
-                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                                    const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   const CeedInt elem_size = P_1D * P_1D * P_1D;
   const CeedInt in_comp = in / elem_size;
   const CeedInt in_node_x = in % P_1D;
@@ -442,9 +436,9 @@ inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, Qfunction assembly
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D>
+template <int NUM_COMP, int Q_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
-                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+                                                      const CeedInt output_offset, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt z = 0; z < Q_1D; z++) {
       const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;
@@ -459,9 +453,8 @@ inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, con
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
-                                          CeedScalar *__restrict__ d_v) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM, class ScalarIn, class ScalarOut>
+inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -475,9 +468,9 @@ inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt e
 //------------------------------------------------------------------------------
 // 3D collocated derivatives computation
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D>
-inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                        CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G,
+                                        ScalarOut *__restrict__ r_V) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       __syncthreads();
@@ -505,9 +498,9 @@ inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q,
 //------------------------------------------------------------------------------
 // 3D collocated derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1D>
-inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                                 CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G,
+                                                 ScalarOut *__restrict__ r_V) {
   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       __syncthreads();
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
index e5b31970ff..04a7718b90 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
@@ -12,8 +12,8 @@
 //------------------------------------------------------------------------------
 // 1D tensor contraction
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
-inline __device__ void Contract1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+template <int NUM_COMP, int P_1D, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void Contract1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) {
   data.slice[data.t_id_x] = *U;
   __syncthreads();
   *V = 0.0;
@@ -28,8 +28,8 @@ inline __device__ void Contract1d(SharedData_Cuda &data, const CeedScalar *U, co
 //------------------------------------------------------------------------------
 // 1D transpose tensor contraction
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
-inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+template <int NUM_COMP, int P_1D, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) {
   data.slice[data.t_id_x] = *U;
   __syncthreads();
   if (data.t_id_x < P_1D) {
@@ -43,9 +43,8 @@
 //------------------------------------------------------------------------------
 // Interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P, int Q>
-inline __device__ void InterpNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                       CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int P, int Q, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void InterpNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
   }
@@ -54,9 +53,9 @@
 //------------------------------------------------------------------------------
 // Interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P, int Q>
-inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-                                                CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int P, int Q, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B,
+                                                ScalarOut *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     r_V[comp] = 0.0;
     ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
@@ -66,8 +65,8 @@
 //------------------------------------------------------------------------------
 // Derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int DIM, int P, int Q>
-inline __device__ void GradNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int DIM, int P, int Q, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void GradNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G, ScalarOut *__restrict__ r_V) {
   for (CeedInt dim = 0; dim < DIM; dim++) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]);
@@ -78,9 +77,9 @@
 //------------------------------------------------------------------------------
 // Derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int DIM, int P, int Q>
-inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                              CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int DIM, int P, int Q, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G,
+                                              ScalarOut *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0;
   for (CeedInt dim = 0; dim < DIM; dim++) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -92,7 +91,7 @@
 //------------------------------------------------------------------------------
 // Quadrature weights
 //------------------------------------------------------------------------------
-template <int Q>
-inline __device__ void WeightNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
+template <int Q, class ScalarIn, class ScalarOut>
+inline __device__ void WeightNonTensor(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight, ScalarOut *w) {
   *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
 }
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
index ec7102ea2c..5a4cdff8cf 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
@@ -15,7 +15,8 @@
 //------------------------------------------------------------------------------
 // Interp kernels
 //------------------------------------------------------------------------------
-extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *__restrict__ d_U,
+                                  CeedScalarCPU *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
@@ -41,8 +42,8 @@
   }
 }

-extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
-                                           CeedScalar *__restrict__ d_V) {
+extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *__restrict__ d_U,
+                                           CeedScalarCPU *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
@@ -68,8 +69,8 @@
   }
 }

-extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
-                                              CeedScalar *__restrict__ d_V) {
+extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *__restrict__ d_U,
+                                              CeedScalarCPU *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
@@ -98,7 +99,8 @@
 //------------------------------------------------------------------------------
 // Grad kernels
 //------------------------------------------------------------------------------
-extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalarCPU *c_G, const CeedScalarCPU *__restrict__ d_U,
+                                CeedScalarCPU *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
@@ -124,8 +126,8 @@
   }
 }

-extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
-                                         CeedScalar *__restrict__ d_V) {
+extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalarCPU *c_G, const CeedScalarCPU *__restrict__ d_U,
+                                         CeedScalarCPU *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
@@ -151,8 +153,8 @@
   }
 }

-extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
-                                            CeedScalar *__restrict__ d_V) {
+extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalarCPU *c_G, const CeedScalarCPU *__restrict__ d_U,
+                                            CeedScalarCPU *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
@@ -181,7 +183,7 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
 //------------------------------------------------------------------------------
 // Weight kernel
 //------------------------------------------------------------------------------
-extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
+extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalarCPU *__restrict__ q_weight, CeedScalarCPU *__restrict__ d_W) {
   extern __shared__ CeedScalar slice[];

   SharedData_Cuda data;
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
index cb62c4f80b..74cbeb6809 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
@@ -12,8 +12,8 @@
 //------------------------------------------------------------------------------
 // Load matrices for basis actions
 //------------------------------------------------------------------------------
-template <int P, int Q>
-inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+template <int P, int Q, class ScalarIn, class ScalarOut>
+inline __device__ void LoadMatrix(SharedData_Cuda &data, const ScalarIn *__restrict__ d_B, ScalarOut *B) {
   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }

@@ -24,9 +24,9 @@
 //------------------------------------------------------------------------------
 // E-vector -> single element
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                            const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+                                            const CeedInt strides_elem, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = node * strides_node + elem * strides_elem;
@@ -40,9 +40,9 @@ inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // Single element -> E-vector
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                             const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+                                             const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = node * strides_node + elem * strides_elem;
@@ -53,9 +53,9 @@
 }

-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+                                           const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) {
   if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind = node * strides_node + elem * strides_elem;
@@ -73,9 +73,9 @@ inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // E-vector -> single element
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                            const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+                                            const CeedInt strides_elem, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = node * strides_node + elem * strides_elem;
@@ -89,9 +89,9 @@ inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // Single element -> E-vector
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                             const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+                                             const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = node * strides_node + elem * strides_elem;
@@ -102,9 +102,9 @@
 }

-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+                                           const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind = node * strides_node + elem * strides_elem;
@@ -122,9 +122,9 @@ inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // E-vector -> single element
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                            const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+                                            const CeedInt strides_elem, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -140,9 +140,9 @@ inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // Single element -> E-vector
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                             const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+                                             const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -155,9 +155,9 @@ inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedIn
 }

-template <int NUM_COMP, int P_1D>
+template <int NUM_COMP, int P_1D, class ScalarIn, class ScalarOut>
 inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
-                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+                                           const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) {
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
     for (CeedInt z = 0; z < P_1D; z++) {
       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
@@ -177,10 +177,10 @@ inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt
 //------------------------------------------------------------------------------
 // E-vector -> single point
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS>
+template <int NUM_COMP, int NUM_PTS, class ScalarIn, class ScalarOut>
 inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem,
-                                 const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+                                 const ScalarIn *__restrict__ d_u, ScalarOut *r_u) {
   const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;

   if (p < points_in_elem) {
@@ -197,10 +197,10 @@ inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, cons
 //------------------------------------------------------------------------------
 // Single point -> E-vector
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS>
+template <int NUM_COMP, int NUM_PTS, class ScalarIn, class ScalarOut>
 inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
-                                  const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v,
-                                  CeedScalar *d_v) {
+                                  const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const ScalarIn *r_v,
+                                  ScalarOut *d_v) {
   if (p < points_in_elem) {
     const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
index a9cd1209ef..a9e0258dec 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -40,9 +40,9 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 //------------------------------------------------------------------------------
 // 1D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
-                                        CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_PTS, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X,
+                                        ScalarOut *__restrict__ r_V) {
   CeedScalar chebyshev_x[Q_1D];

   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
@@ -61,9 +61,9 @@
 //------------------------------------------------------------------------------
 // 1D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
-                                                 CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_PTS, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X,
+                                                 ScalarOut *__restrict__ r_C) {
   CeedScalar chebyshev_x[Q_1D];

   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -86,9 +86,9 @@
 //------------------------------------------------------------------------------
 // 1D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
-                                      CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_PTS, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X,
+                                      ScalarOut *__restrict__ r_V) {
   CeedScalar chebyshev_x[Q_1D];

   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -108,9 +108,9 @@
 //------------------------------------------------------------------------------
 // 1D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
-                                               CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_PTS, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X,
+                                               ScalarOut *__restrict__ r_C) {
   CeedScalar chebyshev_x[Q_1D];

   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -137,9 +137,9 @@
 //------------------------------------------------------------------------------
 // 2D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
-                                        CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int NUM_PTS, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X,
+                                        ScalarOut *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
@@ -168,9 +168,9 @@
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_PTS, int Q_1D>
-inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
-                                                 CeedScalar *__restrict__ r_C) {
+template <int NUM_COMP, int NUM_PTS, int Q_1D, class ScalarIn1, class ScalarIn2, class ScalarOut>
+inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X,
+                                                 ScalarOut *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
@@ -206,9 +206,9 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda
&data, const Ce //------------------------------------------------------------------------------ // 2D derivatives at points //------------------------------------------------------------------------------ -template -inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { CeedScalar buffer[Q_1D]; @@ -241,9 +241,9 @@ inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, co //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; @@ -287,9 +287,9 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D interpolate to points //------------------------------------------------------------------------------ -template -inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; @@ -324,9 +324,9 @@ inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; @@ -368,9 +368,9 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce //------------------------------------------------------------------------------ // 3D derivatives at points //------------------------------------------------------------------------------ -template -inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradAtPoints3d(SharedData_Cuda &data, 
const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; @@ -415,9 +415,9 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h index dcb1763e38..0e39830c4d 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h @@ -20,8 +20,9 @@ //------------------------------------------------------------------------------ // Interp //------------------------------------------------------------------------------ -extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, - const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalarCPU *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalarCPU *__restrict__ d_X, const CeedScalarCPU *__restrict__ d_U, + CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -75,9 +76,9 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal } } -extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalarCPU *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarCPU *__restrict__ d_X, + const CeedScalarCPU *__restrict__ d_U, CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -145,9 +146,9 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const } } -extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalarCPU *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarCPU *__restrict__ d_X, + const CeedScalarCPU *__restrict__ d_U, CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -207,8 +208,9 @@ extern "C" __global__ void 
InterpTransposeAddAtPoints(const CeedInt num_elem, co //------------------------------------------------------------------------------ // Grad //------------------------------------------------------------------------------ -extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, - const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalarCPU *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalarCPU *__restrict__ d_X, const CeedScalarCPU *__restrict__ d_U, + CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -262,9 +264,9 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar } } -extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalarCPU *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarCPU *__restrict__ d_X, + const CeedScalarCPU *__restrict__ d_U, CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -333,9 +335,9 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C } } -extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalarCPU *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarCPU *__restrict__ d_X, + const CeedScalarCPU *__restrict__ d_U, CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h index 4f76825d50..3429fcc76f 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h @@ -16,9 +16,9 @@ //------------------------------------------------------------------------------ // 2D tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, - CeedScalar *V) { +template +inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, const ScalarIn2 *B, + ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -33,9 +33,9 @@ inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 2D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void 
ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, - CeedScalar *V) { +template +inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, const ScalarIn2 *B, + ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -50,9 +50,9 @@ inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 2D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -67,9 +67,9 @@ inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 2D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -84,9 +84,9 @@ inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 2D transpose tensor contract and add x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -100,8 +100,8 @@ inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 2D pack/unpack quadrature values //------------------------------------------------------------------------------ -template -inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { +template +inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, ScalarOut *U) { const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -112,8 +112,8 @@ inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const in } } -template -inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { +template +inline __device__ void 
QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, ScalarOut *U) { const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -127,9 +127,9 @@ inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const //------------------------------------------------------------------------------ // 2D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -146,9 +146,9 @@ inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar //------------------------------------------------------------------------------ // 2D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -164,9 +164,9 @@ inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, C //------------------------------------------------------------------------------ // 2D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -185,9 +185,9 @@ inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar * //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -205,8 +205,8 @@ inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, Cee //------------------------------------------------------------------------------ // 2D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void 
WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; @@ -219,9 +219,9 @@ inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -236,9 +236,9 @@ inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 3D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -253,9 +253,9 @@ inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -270,9 +270,9 @@ inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -287,9 +287,9 @@ inline __device__ void 
ContractTransposeZ3dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 3D transpose tensor contract z //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, - const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -303,9 +303,9 @@ inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -320,9 +320,9 @@ inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, - const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -336,9 +336,9 @@ inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 3D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -353,9 +353,9 @@ inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 3D transpose tensor contract add x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, - const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); 
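For context, every hunk in this file applies one pattern: the device-side contraction helpers gain independent ScalarIn*/ScalarOut template type parameters so E-vector data can enter and leave in reduced precision, while the intermediate registers (CeedScalar buffer[Q_1D], CeedScalar r_t[1], ...) stay in full CeedScalar precision. A minimal standalone sketch of that pattern, assuming CeedScalar is double (the FP64 build); the names ContractSketch1d, kP, and kQ are illustrative only, not libCEED identifiers:

// Sketch only -- not part of this patch. Assumes the FP64 configuration,
// where CeedScalar is double; kP/kQ/ContractSketch1d are hypothetical names.
typedef double CeedScalar;

template <int kP, int kQ, typename ScalarIn1, typename ScalarIn2, typename ScalarOut>
inline __device__ void ContractSketch1d(const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) {
  for (int q = 0; q < kQ; q++) {
    CeedScalar sum = 0.0;  // accumulate in full precision regardless of the I/O types

    for (int p = 0; p < kP; p++) sum += (CeedScalar)B[p + q * kP] * (CeedScalar)U[p];
    V[q] = (ScalarOut)sum;  // narrow (e.g. to float) only on the final store
  }
}

// Example instantiation mixing single-precision inputs with a double output:
//   ContractSketch1d<3, 4, float, float, double>(r_u, s_B, r_v);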
@@ -369,8 +369,8 @@ inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 3D pack/unpack quadrature values //------------------------------------------------------------------------------ -template -inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { +template +inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, ScalarOut *U) { const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -381,8 +381,8 @@ inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const in } } -template -inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { +template +inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, ScalarOut *U) { const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D); for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -396,9 +396,9 @@ inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const //------------------------------------------------------------------------------ // 3D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -416,9 +416,9 @@ inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -435,9 +435,9 @@ inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, C //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ 
r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -461,9 +461,9 @@ inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar * //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -486,9 +486,9 @@ inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, Cee //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -509,9 +509,9 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, Ce //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + const ScalarIn3 *c_G, ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -531,8 +531,8 @@ inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda //------------------------------------------------------------------------------ // 3D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? 
q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h index f4f701505a..b453d66596 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h @@ -16,8 +16,8 @@ //------------------------------------------------------------------------------ // 1D tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x] = *U; __syncthreads(); @@ -32,8 +32,8 @@ inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 1D transpose tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x] = *U; __syncthreads(); @@ -48,8 +48,8 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 1D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { +template +inline __device__ void Interp1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX1d(data, &r_U[comp], c_B, &r_V[comp]); } @@ -58,9 +58,9 @@ inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restr //------------------------------------------------------------------------------ // 1D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeX1d(data, &r_U[comp], c_B, &r_V[comp]); } @@ -69,9 +69,9 @@ inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar //------------------------------------------------------------------------------ // 1D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void Grad1d(SharedData_Cuda &data, const 
ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX1d(data, &r_U[comp], c_G, &r_V[comp]); } @@ -80,9 +80,9 @@ inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restric //------------------------------------------------------------------------------ // 1D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTranspose1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeX1d(data, &r_U[comp], c_G, &r_V[comp]); } @@ -91,8 +91,8 @@ inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar * //------------------------------------------------------------------------------ // 1D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void Weight1d(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; } @@ -103,8 +103,8 @@ inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restr //------------------------------------------------------------------------------ // 2D tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -119,8 +119,8 @@ inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 2D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractY2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -135,8 +135,8 @@ inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 2D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -151,8 +151,8 @@ inline __device__ void 
ContractTransposeY2d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 2D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -167,8 +167,8 @@ inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 2D transpose tensor contract and add x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -182,9 +182,8 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 2D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX2d(data, &r_U[comp], c_B, r_t); @@ -195,9 +194,9 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 2D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeY2d(data, &r_U[comp], c_B, r_t); @@ -208,9 +207,9 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 2D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX2d(data, &r_U[comp], c_G, r_t); @@ 
-223,9 +222,9 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeY2d(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); @@ -238,8 +237,8 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc //------------------------------------------------------------------------------ // 2D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor2d(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; } @@ -250,8 +249,8 @@ inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 3D tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { r_B[i] = B[i + data.t_id_x * P_1D]; @@ -273,8 +272,8 @@ inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 3D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractY3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { r_B[i] = B[i + data.t_id_y * P_1D]; @@ -296,8 +295,8 @@ inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractZ3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { for (CeedInt k = 0; k < Q_1D; k++) { V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { @@ -311,8 +310,8 @@ inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, c 
//------------------------------------------------------------------------------ // 3D transpose tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { for (CeedInt k = 0; k < P_1D; k++) { V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { @@ -326,8 +325,8 @@ inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_y + i * P_1D]; @@ -349,8 +348,8 @@ inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_y + i * P_1D]; @@ -371,8 +370,8 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_x + i * P_1D]; @@ -394,8 +393,8 @@ inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract add x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_x + i * P_1D]; @@ -416,9 +415,8 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline 
__device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -431,9 +429,9 @@ inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -446,9 +444,9 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -467,9 +465,9 @@ inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__r //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -488,9 +486,9 @@ inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedSc //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -506,9 +504,9 @@ inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const 
CeedS //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + const ScalarIn3 *c_G, ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -524,8 +522,8 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, co //------------------------------------------------------------------------------ // 3D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor3d(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; for (CeedInt q = 0; q < Q_1D; q++) { diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h index 1252c8197d..d21fec19f5 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h @@ -15,7 +15,8 @@ //------------------------------------------------------------------------------ // Interp kernel by dim //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *__restrict__ d_U, + CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -53,8 +54,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, } } -extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *__restrict__ d_U, + CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -92,8 +93,8 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca } } -extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *__restrict__ d_U, + CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -134,8 +135,8 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed //------------------------------------------------------------------------------ // Grad kernel by dim 
//------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *c_G, const CeedScalarCPU *__restrict__ d_U, + CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -177,8 +178,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, c } } -extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *c_G, + const CeedScalarCPU *__restrict__ d_U, CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -220,8 +221,8 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala } } -extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalarCPU *c_B, const CeedScalarCPU *c_G, + const CeedScalarCPU *__restrict__ d_U, CeedScalarCPU *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -266,7 +267,7 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc //------------------------------------------------------------------------------ // Weight kernels by dim //------------------------------------------------------------------------------ -extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) { +extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalarCPU *__restrict__ q_weight_1d, CeedScalarCPU *__restrict__ d_W) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; diff --git a/include/ceed/jit-source/cuda/cuda-types.h b/include/ceed/jit-source/cuda/cuda-types.h index 9acb0064a3..969cbb1d90 100644 --- a/include/ceed/jit-source/cuda/cuda-types.h +++ b/include/ceed/jit-source/cuda/cuda-types.h @@ -14,8 +14,8 @@ #define CEED_CUDA_NUMBER_FIELDS 16 typedef struct { - const CeedScalar *inputs[CEED_CUDA_NUMBER_FIELDS]; - CeedScalar *outputs[CEED_CUDA_NUMBER_FIELDS]; + const CeedScalarCPU *inputs[CEED_CUDA_NUMBER_FIELDS]; + CeedScalarCPU *outputs[CEED_CUDA_NUMBER_FIELDS]; } Fields_Cuda; typedef struct { @@ -24,10 +24,10 @@ typedef struct { } FieldsInt_Cuda; typedef struct { - CeedInt num_elem; - const CeedInt *num_per_elem; - const CeedInt *indices; - const CeedScalar *coords; + CeedInt num_elem; + const CeedInt *num_per_elem; + const CeedInt *indices; + const CeedScalarCPU *coords; } Points_Cuda; typedef struct { diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c index d0545ba7e6..6c2a0cc5cd 100644 --- a/interface/ceed-operator.c +++ b/interface/ceed-operator.c @@ -634,6 +634,47 @@ int CeedOperatorIsSetupDone(CeedOperator op, bool *is_setup_done) { return CEED_ERROR_SUCCESS; } +/** + @brief Set a `CeedOperator` to use reduced precision for operator application + + @param[in] op `CeedOperator` + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ 
+int CeedOperatorSetMixedPrecision(CeedOperator op) { + bool is_immutable, is_composite, supports_mixed_precision; + Ceed ceed; + + CeedCall(CeedOperatorGetCeed(op, &ceed)); + CeedCall(CeedOperatorIsImmutable(op, &is_immutable)); + CeedCheck(!is_immutable, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetMixedPrecision must be called before the operator is finalized"); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + CeedCheck(!is_composite, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetMixedPrecision must be called on non-composite operators"); + CeedCall(CeedGetSupportsMixedPrecision(ceed, &supports_mixed_precision)); + CeedCheck(supports_mixed_precision, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement mixed precision operators"); + + op->use_mixed_precision = true; + CeedCall(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get whether a `CeedOperator` is set to use reduced precision for operator application + + @param[in] op `CeedOperator` + @param[out] use_mixed_precision Variable to store mixed precision status + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedOperatorGetMixedPrecision(CeedOperator op, bool *use_mixed_precision) { + *use_mixed_precision = op->use_mixed_precision; + return CEED_ERROR_SUCCESS; +} + /** @brief Get the `CeedQFunction` associated with a `CeedOperator` @@ -1076,10 +1117,10 @@ int CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_poin /** @brief Get a boolean value indicating if the `CeedOperator` was created with `CeedOperatorCreateAtPoints` - + @param[in] op `CeedOperator` @param[out] is_at_points Variable to store at points status - + @return An error code: 0 - success, otherwise - failure @ref User
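Together, the setter and getter above define the user-facing opt-in: request reduced precision after the operator's fields are set but before the first apply finalizes it, then query the flag if needed. A minimal usage sketch, assuming `op_mass` is a non-composite operator on a backend that advertises support and `u`, `v` are compatible vectors:

  bool use_mixed = false;

  CeedOperatorSetMixedPrecision(op_mass);  // must precede the first CeedOperatorApply()
  CeedOperatorGetMixedPrecision(op_mass, &use_mixed);
  // use_mixed is now true; the operator is applied as usual
  CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE);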
diff --git a/interface/ceed.c b/interface/ceed.c index 39a8d3a911..5b8bdc994c 100644 --- a/interface/ceed.c +++ b/interface/ceed.c @@ -733,6 +733,21 @@ int CeedSetDeterministic(Ceed ceed, bool is_deterministic) { return CEED_ERROR_SUCCESS; } +/** + @brief Flag `Ceed` context as being able to create mixed precision operators + + @param[in] ceed `Ceed` to flag as supporting mixed precision + @param[in] supports_mixed_precision Mixed precision support status to set + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedSetSupportsMixedPrecision(Ceed ceed, bool supports_mixed_precision) { + ceed->supports_mixed_precision = supports_mixed_precision; + return CEED_ERROR_SUCCESS; +} + /** @brief Set a backend function. @@ -1446,6 +1461,21 @@ int CeedIsDeterministic(Ceed ceed, bool *is_deterministic) { return CEED_ERROR_SUCCESS; } +/** + @brief Get mixed precision support status of `Ceed` context + + @param[in] ceed `Ceed` context + @param[out] supports_mixed_precision Variable to store mixed precision support status + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedGetSupportsMixedPrecision(Ceed ceed, bool *supports_mixed_precision) { + *supports_mixed_precision = ceed->supports_mixed_precision; + return CEED_ERROR_SUCCESS; +} + /** @brief Set additional JiT source root for `Ceed` context
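A backend advertises this capability once, during initialization, and user code can make the opt-in conditional on the query. A minimal sketch, assuming a hypothetical backend init function CeedInit_MyBackend (the name and surrounding setup are illustrative, not part of this patch):

  static int CeedInit_MyBackend(const char *resource, Ceed ceed) {
    // Advertise mixed precision support before any operators are created
    CeedCallBackend(CeedSetSupportsMixedPrecision(ceed, true));
    // ... remaining backend setup ...
    return CEED_ERROR_SUCCESS;
  }

On the user side, CeedGetSupportsMixedPrecision(ceed, &supports) performs the same check that CeedOperatorSetMixedPrecision applies internally before accepting the request.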
diff --git a/tests/t502-operator-mixed.c b/tests/t502-operator-mixed.c new file mode 100644 index 0000000000..4218b82b1d --- /dev/null +++ b/tests/t502-operator-mixed.c @@ -0,0 +1,125 @@ +/// @file +/// Test creation, action, and destruction for mass matrix operator with multiple components with mixed precision +/// \test Test creation, action, and destruction for mass matrix operator with multiple components with mixed precision +#include "t502-operator.h" + +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, x, u, v; + CeedInt num_elem = 15, p = 5, q = 8; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, num_nodes_x, &x); + { + CeedScalar x_array[num_nodes_x]; + + for (CeedInt i = 0; i < num_nodes_x; i++) x_array[i] = (CeedScalar)i / (num_nodes_x - 1); + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, 2 * num_nodes_u, &u); + CeedVectorCreate(ceed, 2 * num_nodes_u, &v); + CeedVectorCreate(ceed, num_elem * q, &q_data); + + // Restrictions + for (CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = 2 * (i * (p - 1) + j); + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, 2, 1, 2 * num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + CeedInt strides_q_data[3] = {1, q, q}; + CeedElemRestrictionCreateStrided(ceed, num_elem, q, 1, q * num_elem, strides_q_data, &elem_restriction_q_data); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 2, p, q, CEED_GAUSS, &basis_u); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "dx", 1 * 1, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", 2, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", 2, CEED_EVAL_INTERP); + + // Operators + CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + + CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + + CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); + + { + CeedScalar *u_array; + + CeedVectorGetArrayWrite(u, CEED_MEM_HOST, &u_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + u_array[2 * i] = 1.0; + u_array[2 * i + 1] = 2.0; + } + CeedVectorRestoreArray(u, &u_array); + } + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum_1 = 0., sum_2 = 0.; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + sum_1 += v_array[2 * i]; + sum_2 += v_array[2 * i + 1]; + } + CeedVectorRestoreArrayRead(v, &v_array); + if (fabs(sum_1 - 1.) > 1000. * FLT_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. * FLT_EPSILON) printf("Computed Area: %f != True Area: 2.0\n", sum_2); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t503-operator-mixed.c b/tests/t503-operator-mixed.c new file mode 100644 index 0000000000..7042c7846c --- /dev/null +++ b/tests/t503-operator-mixed.c @@ -0,0 +1,114 @@ +/// @file +/// Test creation, action, and destruction for mass matrix operator with passive inputs and outputs with mixed precision +/// \test Test creation, action, and destruction for mass matrix operator with passive inputs and outputs with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t500-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, x, u, v; + CeedInt num_elem = 15, p = 5, q = 8; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + + CeedInit(argv[1], &ceed); + + // Vectors + CeedVectorCreate(ceed, num_nodes_x, &x); + { + CeedScalar x_array[num_nodes_x]; + + for (CeedInt i = 0; i < num_nodes_x; i++) x_array[i] = (CeedScalar)i / (num_nodes_x - 1); + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_nodes_u, &u); + CeedVectorCreate(ceed, num_nodes_u, &v); + CeedVectorCreate(ceed, num_elem * q, &q_data); + + // Restrictions + for (CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, 
num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = i * (p - 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + CeedInt strides_q_data[3] = {1, q, q}; + CeedElemRestrictionCreateStrided(ceed, num_elem, q, 1, q * num_elem, strides_q_data, &elem_restriction_q_data); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, p, q, CEED_GAUSS, &basis_u); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "dx", 1, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + // Operators + CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, x); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetMixedPrecision(op_setup); + + CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, u); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, v); + CeedOperatorSetMixedPrecision(op_mass); + + // Note - It is atypical to use only passive fields; this test is intended + // as a test for all passive input modes rather than as an example. + CeedOperatorApply(op_setup, CEED_VECTOR_NONE, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); + CeedVectorSetValue(u, 1.0); + CeedOperatorApply(op_mass, CEED_VECTOR_NONE, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) sum += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + if (fabs(sum - 1.) > 1000. 
* FLT_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t505-operator-mixed.c b/tests/t505-operator-mixed.c new file mode 100644 index 0000000000..2efe5bd05d --- /dev/null +++ b/tests/t505-operator-mixed.c @@ -0,0 +1,128 @@ +/// @file +/// Test CeedOperatorApplyAdd with mixed precision +/// \test Test CeedOperatorApplyAdd with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t500-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, x, u, v; + CeedInt num_elem = 15, p = 5, q = 8; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, num_nodes_x, &x); + { + CeedScalar x_array[num_nodes_x]; + for (CeedInt i = 0; i < num_nodes_x; i++) x_array[i] = (CeedScalar)i / (num_nodes_x - 1); + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_nodes_u, &u); + CeedVectorCreate(ceed, num_nodes_u, &v); + CeedVectorCreate(ceed, num_elem * q, &q_data); + + // Restrictions + for (CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = i * (p - 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + CeedInt strides_q_data[3] = {1, q, q}; + CeedElemRestrictionCreateStrided(ceed, num_elem, q, 1, q * num_elem, strides_q_data, &elem_restriction_q_data); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, p, q, CEED_GAUSS, &basis_u); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "dx", 1, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + // Operators + CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + 
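// Mixed precision must be requested before the operator is finalized by its first apply + 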
CeedOperatorSetMixedPrecision(op_setup); + + CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + + CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); + + // Apply with V = 0 + CeedVectorSetValue(u, 1.0); + CeedVectorSetValue(v, 0.0); + CeedOperatorApplyAdd(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) sum += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + if (fabs(sum - 1.) > 1000. * FLT_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); + } + + // Apply with V = 1 + CeedVectorSetValue(v, 1.0); + CeedOperatorApplyAdd(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = -num_nodes_u; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) sum += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + if (fabs(sum - 1.) > 1000. * FLT_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t506-operator-mixed.c b/tests/t506-operator-mixed.c new file mode 100644 index 0000000000..0069f451ff --- /dev/null +++ b/tests/t506-operator-mixed.c @@ -0,0 +1,173 @@ +/// @file +/// Test creation and reuse of the same QFunction for multiple operators with mixed precision +/// \test Test creation and reuse of the same QFunction for multiple operators with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t502-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x, elem_restriction_u, elem_restriction_q_data_small, elem_restriction_q_data_large; + CeedBasis basis_x_small, basis_x_large, basis_u_small, basis_u_large; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup_small, op_mass_small, op_setup_large, op_mass_large; + CeedVector q_data_small, q_data_large, x, u, v; + CeedInt num_elem = 15, p = 5, q = 8, scale = 3, num_comp = 2; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, num_nodes_x, &x); + { + CeedScalar x_array[num_nodes_x]; + + for (CeedInt i = 0; i < num_nodes_x; i++) x_array[i] = (CeedScalar)i / (num_nodes_x - 1); + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_comp * num_nodes_u, &u); + CeedVectorCreate(ceed, num_comp * num_nodes_u, &v); + CeedVectorCreate(ceed, num_elem * q, &q_data_small); + CeedVectorCreate(ceed, num_elem * q * scale, &q_data_large); + + // Restrictions + for 
(CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, num_comp, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = num_comp * (i * (p - 1) + j); + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, num_comp, 1, num_comp * num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + CeedInt strides_q_data_small[3] = {1, q, q}; + CeedElemRestrictionCreateStrided(ceed, num_elem, q, 1, q * num_elem, strides_q_data_small, &elem_restriction_q_data_small); + + CeedInt strides_q_data_large[3] = {1, q * scale, q * scale}; + CeedElemRestrictionCreateStrided(ceed, num_elem, q * scale, 1, q * num_elem * scale, strides_q_data_large, &elem_restriction_q_data_large); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, q, CEED_GAUSS, &basis_x_small); + CeedBasisCreateTensorH1Lagrange(ceed, 1, num_comp, p, q, CEED_GAUSS, &basis_u_small); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, q * scale, CEED_GAUSS, &basis_x_large); + CeedBasisCreateTensorH1Lagrange(ceed, 1, num_comp, p, q * scale, CEED_GAUSS, &basis_u_large); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "x", 1, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", num_comp, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", num_comp, CEED_EVAL_INTERP); + + // 'Small' Operators + CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_small); + CeedOperatorSetField(op_setup_small, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_small, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_small, "x", elem_restriction_x, basis_x_small, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_small, "rho", elem_restriction_q_data_small, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup_small); + + CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_small); + CeedOperatorSetField(op_mass_small, "rho", elem_restriction_q_data_small, CEED_BASIS_NONE, q_data_small); + CeedOperatorSetField(op_mass_small, "u", elem_restriction_u, basis_u_small, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass_small, "v", elem_restriction_u, basis_u_small, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass_small); + + // 'Large' operators + CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_large); + CeedOperatorSetField(op_setup_large, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_large, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_large, "x", elem_restriction_x, basis_x_large, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_large, "rho", elem_restriction_q_data_large, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup_large); + + CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_large); + CeedOperatorSetField(op_mass_large, "rho", elem_restriction_q_data_large, CEED_BASIS_NONE, q_data_large); + CeedOperatorSetField(op_mass_large, "u", elem_restriction_u, basis_u_large, CEED_VECTOR_ACTIVE); + 
CeedOperatorSetField(op_mass_large, "v", elem_restriction_u, basis_u_large, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass_large); + + // Setup + CeedOperatorApply(op_setup_small, x, q_data_small, CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_setup_large, x, q_data_large, CEED_REQUEST_IMMEDIATE); + + { + CeedScalar *u_array; + + CeedVectorGetArrayWrite(u, CEED_MEM_HOST, &u_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + u_array[num_comp * i] = 1.0; + u_array[num_comp * i + 1] = 2.0; + } + CeedVectorRestoreArray(u, &u_array); + } + + // 'Small' operator + CeedOperatorApply(op_mass_small, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum_1 = 0., sum_2 = 0.; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + sum_1 += v_array[num_comp * i]; + sum_2 += v_array[num_comp * i + 1]; + } + CeedVectorRestoreArrayRead(v, &v_array); + if (fabs(sum_1 - 1.) > 1000. * FLT_EPSILON) printf("Small Problem, Component 1: Computed Area %f != True Area 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. * FLT_EPSILON) printf("Small Problem, Component 2: Computed Area %f != True Area 2.0\n", sum_2); + } + + // 'Large' operator + CeedOperatorApply(op_mass_large, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum_1 = 0., sum_2 = 0.; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + sum_1 += v_array[num_comp * i]; + sum_2 += v_array[num_comp * i + 1]; + } + CeedVectorRestoreArrayRead(v, &v_array); + + if (fabs(sum_1 - 1.) > 1000. * FLT_EPSILON) printf("Large Problem, Component 1: Computed Area %f != True Area 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. * FLT_EPSILON) printf("Large Problem, Component 2: Computed Area %f != True Area 2.0\n", sum_2); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data_small); + CeedVectorDestroy(&q_data_large); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_q_data_small); + CeedElemRestrictionDestroy(&elem_restriction_q_data_large); + CeedBasisDestroy(&basis_u_small); + CeedBasisDestroy(&basis_x_small); + CeedBasisDestroy(&basis_u_large); + CeedBasisDestroy(&basis_x_large); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup_small); + CeedOperatorDestroy(&op_mass_small); + CeedOperatorDestroy(&op_setup_large); + CeedOperatorDestroy(&op_mass_large); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t510-operator-mixed.c b/tests/t510-operator-mixed.c new file mode 100644 index 0000000000..42853f7165 --- /dev/null +++ b/tests/t510-operator-mixed.c @@ -0,0 +1,133 @@ +/// @file +/// Test creation, action, and destruction for mass matrix operator with mixed precision +/// \test Test creation, action, and destruction for mass matrix operator with mixed precision +#include "t510-operator.h" + +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t320-basis.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, x, u, v; + CeedInt num_elem = 12, dim = 2, p = 6, q = 4; + CeedInt nx = 3, ny = 2; + CeedInt row, col, offset; + CeedInt num_dofs = (nx * 2 + 1) 
* (ny * 2 + 1), num_qpts = num_elem * q; + CeedInt ind_x[num_elem * p]; + CeedScalar q_ref[dim * q], q_weight[q]; + CeedScalar interp[p * q], grad[dim * p * q]; + + CeedInit(argv[1], &ceed); + + CeedVectorCreate(ceed, dim * num_dofs, &x); + { + CeedScalar x_array[dim * num_dofs]; + + for (CeedInt i = 0; i < num_dofs; i++) { + x_array[i] = (1. / (nx * 2)) * (CeedScalar)(i % (nx * 2 + 1)); + x_array[i + num_dofs] = (1. / (ny * 2)) * (CeedScalar)(i / (nx * 2 + 1)); + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_qpts, &q_data); + CeedVectorCreate(ceed, num_dofs, &u); + CeedVectorCreate(ceed, num_dofs, &v); + + // Restrictions + for (CeedInt i = 0; i < num_elem / 2; i++) { + col = i % nx; + row = i / nx; + offset = col * 2 + row * (nx * 2 + 1) * 2; + + ind_x[i * 2 * p + 0] = 2 + offset; + ind_x[i * 2 * p + 1] = 9 + offset; + ind_x[i * 2 * p + 2] = 16 + offset; + ind_x[i * 2 * p + 3] = 1 + offset; + ind_x[i * 2 * p + 4] = 8 + offset; + ind_x[i * 2 * p + 5] = 0 + offset; + + ind_x[i * 2 * p + 6] = 14 + offset; + ind_x[i * 2 * p + 7] = 7 + offset; + ind_x[i * 2 * p + 8] = 0 + offset; + ind_x[i * 2 * p + 9] = 15 + offset; + ind_x[i * 2 * p + 10] = 8 + offset; + ind_x[i * 2 * p + 11] = 16 + offset; + } + CeedElemRestrictionCreate(ceed, num_elem, p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_u); + + CeedInt strides_q_data[3] = {1, q, q}; + CeedElemRestrictionCreateStrided(ceed, num_elem, q, 1, num_qpts, strides_q_data, &elem_restriction_q_data); + + // Bases + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, dim, p, q, interp, grad, q_ref, q_weight, &basis_x); + + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis_u); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + // Operators + CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + + CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + + CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); + + CeedVectorSetValue(u, 0.0); + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output 
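+ // u was set to zero above, so every entry of v must vanish to within FLT_EPSILON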
+ { + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_dofs; i++) { + if (fabs(v_array[i]) > FLT_EPSILON) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); + } + CeedVectorRestoreArrayRead(v, &v_array); + } + + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t520-operator-mixed.c b/tests/t520-operator-mixed.c new file mode 100644 index 0000000000..ea641fb604 --- /dev/null +++ b/tests/t520-operator-mixed.c @@ -0,0 +1,234 @@ +/// @file +/// Test creation, action, and destruction for composite mass matrix operator with mixed precision +/// \test Test creation, action, and destruction for composite mass matrix operator with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t320-basis.h" +#include "t510-operator.h" + +/* The mesh consists of two rows of 3 quadrilaterals followed by one row + of 6 triangles: + _ _ _ + |_|_|_| + |_|_|_| + |/|/|/| + +*/ + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x_tet, elem_restriction_u_tet, elem_restriction_q_data_tet, elem_restriction_x_hex, elem_restriction_u_hex, + elem_restriction_q_data_hex; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_mass_tet, qf_setup_hex, qf_mass_hex; + CeedOperator op_setup_tet, op_mass_tet, op_setup_hex, op_mass_hex, op_setup, op_mass; + CeedVector q_data_tet, q_data_hex, x, u, v; + CeedInt num_elem_tet = 6, p_tet = 6, q_tet = 4, num_elem_hex = 6, p_hex = 3, q_hex = 4, dim = 2; + CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * q_tet, num_qpts_hex = num_elem_hex * q_hex * q_hex; + CeedInt ind_x_tet[num_elem_tet * p_tet], ind_x_hex[num_elem_hex * p_hex * p_hex]; + CeedScalar q_ref[dim * q_tet], q_weight[q_tet]; + CeedScalar interp[p_tet * q_tet], grad[dim * p_tet * q_tet]; + + CeedInit(argv[1], &ceed); + + // Vectors + CeedVectorCreate(ceed, dim * num_dofs, &x); + { + CeedScalar x_array[dim * num_dofs]; + + for (CeedInt i = 0; i < n_y * 2 + 1; i++) { + for (CeedInt j = 0; j < n_x * 2 + 1; j++) { + x_array[i + j * (n_y * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_y); + x_array[i + j * (n_y * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_x); + } + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_dofs, &u); + CeedVectorCreate(ceed, num_dofs, &v); + CeedVectorCreate(ceed, num_qpts_tet, &q_data_tet); + CeedVectorCreate(ceed, num_qpts_hex, &q_data_hex); + + // Set up Tet Elements + // -- Restrictions + for (CeedInt i = 0; i < num_elem_tet / 2; i++) { + col = i % n_x_tet; + row = i / n_x_tet; + offset = col * 2 + row * (n_x_tet * 2 + 1) * 2; + + ind_x_tet[i * 2 * p_tet + 0] = 2 + offset; + ind_x_tet[i * 2 * p_tet + 1] = 9 + offset; + ind_x_tet[i * 2 * p_tet + 2] = 16 + offset; + ind_x_tet[i * 2 * p_tet + 3] = 1 + offset; + ind_x_tet[i * 2 * p_tet + 4] = 8 + offset; + ind_x_tet[i * 2 * p_tet + 5] = 0 + 
offset; + + ind_x_tet[i * 2 * p_tet + 6] = 14 + offset; + ind_x_tet[i * 2 * p_tet + 7] = 7 + offset; + ind_x_tet[i * 2 * p_tet + 8] = 0 + offset; + ind_x_tet[i * 2 * p_tet + 9] = 15 + offset; + ind_x_tet[i * 2 * p_tet + 10] = 8 + offset; + ind_x_tet[i * 2 * p_tet + 11] = 16 + offset; + } + CeedElemRestrictionCreate(ceed, num_elem_tet, p_tet, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_tet, + &elem_restriction_x_tet); + CeedElemRestrictionCreate(ceed, num_elem_tet, p_tet, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_tet, &elem_restriction_u_tet); + + CeedInt strides_q_data_tet[3] = {1, q_tet, q_tet}; + CeedElemRestrictionCreateStrided(ceed, num_elem_tet, q_tet, 1, num_qpts_tet, strides_q_data_tet, &elem_restriction_q_data_tet); + + // -- Bases + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, dim, p_tet, q_tet, interp, grad, q_ref, q_weight, &basis_x_tet); + + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p_tet, q_tet, interp, grad, q_ref, q_weight, &basis_u_tet); + + // -- QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup_tet); + CeedQFunctionAddInput(qf_setup_tet, "_weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup_tet, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup_tet, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass_tet); + CeedQFunctionAddInput(qf_mass_tet, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass_tet, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass_tet, "v", 1, CEED_EVAL_INTERP); + + // -- Operators + // ---- Setup Tet + CeedOperatorCreate(ceed, qf_setup_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_tet); + CeedOperatorSetField(op_setup_tet, "_weight", CEED_ELEMRESTRICTION_NONE, basis_x_tet, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_tet, "dx", elem_restriction_x_tet, basis_x_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); + CeedOperatorSetMixedPrecision(op_setup_tet); + // ---- Mass Tet + CeedOperatorCreate(ceed, qf_mass_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_tet); + CeedOperatorSetField(op_mass_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); + CeedOperatorSetField(op_mass_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetName(op_mass_tet, "mass tet"); + CeedOperatorSetMixedPrecision(op_mass_tet); + + // Set up Hex Elements + // -- Restrictions + for (CeedInt i = 0; i < num_elem_hex; i++) { + col = i % n_x_hex; + row = i / n_x_hex; + offset = (n_x_tet * 2 + 1) * (n_y_tet * 2) * (1 + row) + col * 2; + for (CeedInt j = 0; j < p_hex; j++) { + for (CeedInt k = 0; k < p_hex; k++) ind_x_hex[p_hex * (p_hex * i + k) + j] = offset + k * (n_x_hex * 2 + 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem_hex, p_hex * p_hex, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_hex, + &elem_restriction_x_hex); + CeedElemRestrictionCreate(ceed, num_elem_hex, p_hex * p_hex, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_hex, &elem_restriction_u_hex); + + CeedInt strides_q_data_hex[3] = {1, q_hex * q_hex, q_hex * q_hex}; + CeedElemRestrictionCreateStrided(ceed, num_elem_hex, q_hex * q_hex, 1, num_qpts_hex, strides_q_data_hex, 
&elem_restriction_q_data_hex); + + // -- Bases + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p_hex, q_hex, CEED_GAUSS, &basis_x_hex); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_hex, q_hex, CEED_GAUSS, &basis_u_hex); + + // -- QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup_hex); + CeedQFunctionAddInput(qf_setup_hex, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup_hex, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup_hex, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass_hex); + CeedQFunctionAddInput(qf_mass_hex, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass_hex, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass_hex, "v", 1, CEED_EVAL_INTERP); + + // -- Operators + CeedOperatorCreate(ceed, qf_setup_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_hex); + CeedOperatorSetField(op_setup_hex, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_hex, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_hex, "dx", elem_restriction_x_hex, basis_x_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); + CeedOperatorSetMixedPrecision(op_setup_hex); + + CeedOperatorCreate(ceed, qf_mass_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_hex); + CeedOperatorSetField(op_mass_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); + CeedOperatorSetField(op_mass_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetName(op_mass_hex, "mass hex"); + CeedOperatorSetMixedPrecision(op_mass_hex); + + // Set up Composite Operators + // -- Create + CeedCompositeOperatorCreate(ceed, &op_setup); + // -- Add SubOperators + CeedCompositeOperatorAddSub(op_setup, op_setup_tet); + CeedCompositeOperatorAddSub(op_setup, op_setup_hex); + + // -- Create + CeedCompositeOperatorCreate(ceed, &op_mass); + // -- Add SubOperators + CeedCompositeOperatorAddSub(op_mass, op_mass_tet); + CeedCompositeOperatorAddSub(op_mass, op_mass_hex); + + { // Test CeedCompositeOperatorGetSubByName + CeedOperator op_byname; + + CeedCompositeOperatorGetSubByName(op_mass, "mass hex", &op_byname); + if (op_byname != op_mass_hex) printf("CeedCompositeOperatorGetSubByName returned incorrect Sub Operator\n"); + + CeedCompositeOperatorGetSubByName(op_mass, "asdf", &op_byname); + if (op_byname != NULL) printf("CeedCompositeOperatorGetSubByName returned non-NULL for non-existent Sub Operator\n"); + } + + // Apply Setup Operator + CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); + + // Apply Mass Operator + CeedVectorSetValue(u, 0.0); + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_dofs; i++) { + if (fabs(v_array[i]) > FLT_EPSILON) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); + } + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Cleanup + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data_tet); + CeedVectorDestroy(&q_data_hex); + CeedElemRestrictionDestroy(&elem_restriction_u_tet); + CeedElemRestrictionDestroy(&elem_restriction_x_tet); + CeedElemRestrictionDestroy(&elem_restriction_q_data_tet); + CeedElemRestrictionDestroy(&elem_restriction_u_hex); + CeedElemRestrictionDestroy(&elem_restriction_x_hex); + CeedElemRestrictionDestroy(&elem_restriction_q_data_hex); + CeedBasisDestroy(&basis_u_tet); + CeedBasisDestroy(&basis_x_tet); + CeedBasisDestroy(&basis_u_hex); + CeedBasisDestroy(&basis_x_hex); + CeedQFunctionDestroy(&qf_setup_tet); + CeedQFunctionDestroy(&qf_mass_tet); + CeedOperatorDestroy(&op_setup_tet); + CeedOperatorDestroy(&op_mass_tet); + CeedQFunctionDestroy(&qf_setup_hex); + CeedQFunctionDestroy(&qf_mass_hex); + CeedOperatorDestroy(&op_setup_hex); + CeedOperatorDestroy(&op_mass_hex); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +}
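Because CeedOperatorSetMixedPrecision rejects composite operators, mixed precision is requested on each sub-operator before it is added, the pattern both t520 above and t522 below follow. Schematically:

  CeedOperatorSetMixedPrecision(op_mass_tet);
  CeedOperatorSetMixedPrecision(op_mass_hex);
  CeedCompositeOperatorCreate(ceed, &op_mass);
  CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
  CeedCompositeOperatorAddSub(op_mass, op_mass_hex);

Keeping the flag per sub-operator means a composite operator can, in principle, mix reduced- and full-precision sub-operators.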
diff --git a/tests/t522-operator-mixed.c b/tests/t522-operator-mixed.c new file mode 100644 index 0000000000..071c979790 --- /dev/null +++ b/tests/t522-operator-mixed.c @@ -0,0 +1,221 @@ +/// @file +/// Test creation, action, and destruction for diffusion matrix operator with mixed precision +/// \test Test creation, action, and destruction for diffusion matrix operator with mixed precision +#include "t522-operator.h" + +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t320-basis.h" + +/* The mesh consists of two rows of 3 quadrilaterals followed by one row + of 6 triangles: + _ _ _ + |_|_|_| + |_|_|_| + |/|/|/| + +*/ + +int main(int argc, char **argv) { + Ceed ceed; + CeedElemRestriction elem_restriction_x_tet, elem_restriction_u_tet, elem_restriction_q_data_tet, elem_restriction_x_hex, elem_restriction_u_hex, + elem_restriction_q_data_hex; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_diff_tet, qf_setup_hex, qf_diff_hex; + CeedOperator op_setup_tet, op_diff_tet, op_setup_hex, op_diff_hex, op_setup, op_diff; + CeedVector q_data_tet, q_data_hex, x, u, v; + CeedInt num_elem_tet = 6, p_tet = 6, q_tet = 4, num_elem_hex = 6, p_hex = 3, q_hex = 4, dim = 2; + CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * q_tet, num_qpts_hex = num_elem_hex * q_hex * q_hex; + CeedInt ind_x_tet[num_elem_tet * p_tet], ind_x_hex[num_elem_hex * p_hex * p_hex]; + CeedScalar q_ref[dim * q_tet], q_weight[q_tet]; + CeedScalar interp[p_tet * q_tet], grad[dim * p_tet * q_tet]; + + CeedInit(argv[1], &ceed); + + // Vectors + CeedVectorCreate(ceed, dim * num_dofs, &x); + { + CeedScalar x_array[dim * num_dofs]; + + for (CeedInt i = 0; i < n_y * 2 + 1; i++) { + for (CeedInt j = 0; j < n_x * 2 + 1; j++) { + x_array[i + j * (n_y * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_y); + x_array[i + j * (n_y * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_x); + } + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_dofs, &u); + CeedVectorCreate(ceed, num_dofs, &v); + CeedVectorCreate(ceed, num_qpts_tet * dim * (dim + 1) / 2, &q_data_tet); + CeedVectorCreate(ceed, num_qpts_hex * dim * (dim + 1) / 2, &q_data_hex); + + // Tet Elements + // -- Restrictions + for (CeedInt i = 0; i < num_elem_tet / 2; i++) { + col = i % n_x_tet; + row = i / n_x_tet; + offset = col * 2 + row * (n_x_tet * 2 + 1) * 2; + + ind_x_tet[i * 2 * p_tet + 0] = 2 + offset; + ind_x_tet[i * 2 * p_tet + 1] = 9 + offset; + ind_x_tet[i * 2 * p_tet + 2] = 16 + offset; + ind_x_tet[i * 2 * p_tet + 3] = 1 + offset; + ind_x_tet[i * 2 * p_tet + 4] = 8 + offset; + ind_x_tet[i * 2 * p_tet + 5] = 0 + offset; + + ind_x_tet[i * 2 * p_tet + 6] = 14 + offset; + ind_x_tet[i * 2 * p_tet + 7] = 7 + 
offset; + ind_x_tet[i * 2 * p_tet + 8] = 0 + offset; + ind_x_tet[i * 2 * p_tet + 9] = 15 + offset; + ind_x_tet[i * 2 * p_tet + 10] = 8 + offset; + ind_x_tet[i * 2 * p_tet + 11] = 16 + offset; + } + CeedElemRestrictionCreate(ceed, num_elem_tet, p_tet, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_tet, + &elem_restriction_x_tet); + CeedElemRestrictionCreate(ceed, num_elem_tet, p_tet, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_tet, &elem_restriction_u_tet); + + CeedInt strides_q_data_tet[3] = {1, q_tet, q_tet * dim * (dim + 1) / 2}; + CeedElemRestrictionCreateStrided(ceed, num_elem_tet, q_tet, dim * (dim + 1) / 2, dim * (dim + 1) / 2 * num_qpts_tet, strides_q_data_tet, + &elem_restriction_q_data_tet); + + // -- Bases + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, dim, p_tet, q_tet, interp, grad, q_ref, q_weight, &basis_x_tet); + + Build2DSimplex(q_ref, q_weight, interp, grad); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p_tet, q_tet, interp, grad, q_ref, q_weight, &basis_u_tet); + + // -- QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup_tet); + CeedQFunctionAddInput(qf_setup_tet, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup_tet, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup_tet, "rho", dim * (dim + 1) / 2, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, diff, diff_loc, &qf_diff_tet); + CeedQFunctionAddInput(qf_diff_tet, "rho", dim * (dim + 1) / 2, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_diff_tet, "u", dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_diff_tet, "v", dim, CEED_EVAL_GRAD); + + // -- Operators + // ---- Setup Tet + CeedOperatorCreate(ceed, qf_setup_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_tet); + CeedOperatorSetField(op_setup_tet, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_tet, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_tet, "dx", elem_restriction_x_tet, basis_x_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); + CeedOperatorSetMixedPrecision(op_setup_tet); + // ---- Diff Tet + CeedOperatorCreate(ceed, qf_diff_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_tet); + CeedOperatorSetField(op_diff_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); + CeedOperatorSetField(op_diff_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_diff_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_diff_tet); + + // Hex Elements + // -- Restrictions + for (CeedInt i = 0; i < num_elem_hex; i++) { + col = i % n_x_hex; + row = i / n_x_hex; + offset = (n_x_tet * 2 + 1) * (n_y_tet * 2) * (1 + row) + col * 2; + for (CeedInt j = 0; j < p_hex; j++) { + for (CeedInt k = 0; k < p_hex; k++) ind_x_hex[p_hex * (p_hex * i + k) + j] = offset + k * (n_x_hex * 2 + 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem_hex, p_hex * p_hex, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_hex, + &elem_restriction_x_hex); + CeedElemRestrictionCreate(ceed, num_elem_hex, p_hex * p_hex, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x_hex, &elem_restriction_u_hex); + + CeedInt strides_q_data_hex[3] = {1, q_hex * q_hex, q_hex * q_hex * dim * (dim + 1) / 2}; + CeedElemRestrictionCreateStrided(ceed, num_elem_hex, q_hex * q_hex, dim * (dim + 1) / 2, dim * (dim + 1) / 2 * num_qpts_hex, 
strides_q_data_hex, + &elem_restriction_q_data_hex); + + // -- Bases + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p_hex, q_hex, CEED_GAUSS, &basis_x_hex); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_hex, q_hex, CEED_GAUSS, &basis_u_hex); + + // -- QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup_hex); + CeedQFunctionAddInput(qf_setup_hex, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup_hex, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup_hex, "rho", dim * (dim + 1) / 2, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, diff, diff_loc, &qf_diff_hex); + CeedQFunctionAddInput(qf_diff_hex, "rho", dim * (dim + 1) / 2, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_diff_hex, "u", dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_diff_hex, "v", dim, CEED_EVAL_GRAD); + + // -- Operators + CeedOperatorCreate(ceed, qf_setup_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_hex); + CeedOperatorSetField(op_setup_hex, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_hex, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_hex, "dx", elem_restriction_x_hex, basis_x_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); + CeedOperatorSetMixedPrecision(op_setup_hex); + + CeedOperatorCreate(ceed, qf_diff_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_hex); + CeedOperatorSetField(op_diff_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); + CeedOperatorSetField(op_diff_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_diff_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_diff_hex); + + // Composite Operators + CeedCompositeOperatorCreate(ceed, &op_setup); + CeedCompositeOperatorAddSub(op_setup, op_setup_tet); + CeedCompositeOperatorAddSub(op_setup, op_setup_hex); + + CeedCompositeOperatorCreate(ceed, &op_diff); + CeedCompositeOperatorAddSub(op_diff, op_diff_tet); + CeedCompositeOperatorAddSub(op_diff, op_diff_hex); + + // Apply Setup Operator + CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); + + // Apply diff Operator + CeedVectorSetValue(u, 1.0); + CeedOperatorApply(op_diff, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_dofs; i++) { + if (fabs(v_array[i]) > FLT_EPSILON) printf("Computed: %f != True: 0.0\n", v_array[i]); + } + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Cleanup + CeedVectorDestroy(&x); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&q_data_tet); + CeedVectorDestroy(&q_data_hex); + CeedElemRestrictionDestroy(&elem_restriction_u_tet); + CeedElemRestrictionDestroy(&elem_restriction_x_tet); + CeedElemRestrictionDestroy(&elem_restriction_q_data_tet); + CeedElemRestrictionDestroy(&elem_restriction_u_hex); + CeedElemRestrictionDestroy(&elem_restriction_x_hex); + CeedElemRestrictionDestroy(&elem_restriction_q_data_hex); + CeedBasisDestroy(&basis_u_tet); + CeedBasisDestroy(&basis_x_tet); + CeedBasisDestroy(&basis_u_hex); + CeedBasisDestroy(&basis_x_hex); + CeedQFunctionDestroy(&qf_setup_tet); + CeedQFunctionDestroy(&qf_diff_tet); + CeedOperatorDestroy(&op_setup_tet); + CeedOperatorDestroy(&op_diff_tet); + CeedQFunctionDestroy(&qf_setup_hex); + CeedQFunctionDestroy(&qf_diff_hex); + CeedOperatorDestroy(&op_setup_hex); + 
CeedOperatorDestroy(&op_diff_hex); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_diff); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t591-operator-mixed.c b/tests/t591-operator-mixed.c new file mode 100644 index 0000000000..d0bc270977 --- /dev/null +++ b/tests/t591-operator-mixed.c @@ -0,0 +1,197 @@ +/// @file +/// Test creation, action, and destruction for mass matrix operator at points with mixed precision +/// \test Test creation, action, and destruction for mass matrix operator at points with mixed precision +#include "t591-operator.h" + +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p = 3, q = 5; + CeedInt num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedVector x_points, x_elem, q_data, u, v; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + bool is_at_points; + + CeedInit(argv[1], &ceed); + + // Point reference coordinates + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + + // Cell coordinates + { + CeedInt p = 2, num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1); + CeedInt ind_x[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_x + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = p * g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, 1, dim * num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_x); + CeedVectorCreate(ceed, dim * num_nodes, &x_elem); + { + CeedScalar x_array[dim * num_nodes]; + + for (CeedInt i = 0; i <= num_elem_1d; i++) { + for (CeedInt j = 0; j <= num_elem_1d; j++) { + x_array[(i * 
(num_elem_1d + 1) + j) * dim + 0] = j; + x_array[(i * (num_elem_1d + 1) + j) * dim + 1] = i; + } + } + CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + } + + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + + // Cell solution + { + CeedInt ind_u[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_u + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, &elem_restriction_u); + } + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + + // Setup geometric scaling + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + CeedOperatorIsAtPoints(op_mass, &is_at_points); + if (!is_at_points) printf("Error: Operator should be at points\n"); + + CeedVectorCreate(ceed, num_nodes, &u); + CeedVectorSetValue(u, 1.0); + CeedVectorCreate(ceed, num_nodes, &v); + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + { + CeedScalar sum = 0.0; + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes; i++) sum += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + // Summing 9 reference elements + if (fabs(sum - 1.0 * num_elem) > 5000. 
* FLT_EPSILON) printf("Incorrect area computed, %f != %f\n", sum, 1.0 * num_elem); + } + + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&x_elem); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t592-operator-mixed.c b/tests/t592-operator-mixed.c new file mode 100644 index 0000000000..897c16299b --- /dev/null +++ b/tests/t592-operator-mixed.c @@ -0,0 +1,249 @@ +/// @file +/// Test assembly of mass matrix operator QFunction at points with mixed precision +/// \test Test assembly of mass matrix operator QFunction at points with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t591-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p = 3, q = 5; + CeedInt num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedVector x_points, x_elem, q_data, u, v, qf_assembled; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u, elem_restriction_assembled; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + bool is_at_points; + + CeedInit(argv[1], &ceed); + + // Point reference coordinates + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ?
0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + + // Cell coordinates + { + CeedInt p = 2, num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1); + CeedInt ind_x[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_x + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = p * g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, 1, dim * num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_x); + CeedVectorCreate(ceed, dim * num_nodes, &x_elem); + { + CeedScalar x_array[dim * num_nodes]; + + for (CeedInt i = 0; i <= num_elem_1d; i++) { + for (CeedInt j = 0; j <= num_elem_1d; j++) { + x_array[(i * (num_elem_1d + 1) + j) * dim + 0] = j; + x_array[(i * (num_elem_1d + 1) + j) * dim + 1] = i; + } + } + CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + } + + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + + // Cell solution + { + CeedInt ind_u[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_u + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, &elem_restriction_u); + } + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + + // Setup geometric scaling + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "x", 
elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + CeedOperatorIsAtPoints(op_mass, &is_at_points); + if (!is_at_points) printf("Error: Operator should be at points\n"); + + CeedVectorCreate(ceed, num_nodes, &u); + CeedVectorSetValue(u, 1.0); + CeedVectorCreate(ceed, num_nodes, &v); + + // Assemble QFunction + CeedOperatorSetQFunctionAssemblyReuse(op_mass, true); + CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(op_mass, true); + CeedOperatorLinearAssembleQFunction(op_mass, &qf_assembled, &elem_restriction_assembled, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *assembled_array, *q_data_array; + + CeedVectorGetArrayRead(qf_assembled, CEED_MEM_HOST, &assembled_array); + CeedVectorGetArrayRead(q_data, CEED_MEM_HOST, &q_data_array); + for (CeedInt i = 0; i < num_points; i++) { + if (fabs(q_data_array[i] - assembled_array[i]) > FLT_EPSILON) { + // LCOV_EXCL_START + printf("Error: qf_assembled[%" CeedInt_FMT "] = %f != %f\n", i, assembled_array[i], q_data_array[i]); + // LCOV_EXCL_STOP + } + } + CeedVectorRestoreArrayRead(qf_assembled, &assembled_array); + CeedVectorRestoreArrayRead(q_data, &q_data_array); + } + + // Apply original Mass Operator + CeedVectorSetValue(u, 1.0); + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar area = 0.0; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes; i++) area += v_array[i]; + if (fabs(area - 1.0 * num_elem) > FLT_EPSILON) printf("Error: True operator computed area = %f != 1.0\n", area); + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Switch to new q_data + { + const CeedScalar *assembled_array; + + CeedVectorGetArrayRead(qf_assembled, CEED_MEM_HOST, &assembled_array); + CeedVectorSetArray(q_data, CEED_MEM_HOST, CEED_COPY_VALUES, (CeedScalar *)assembled_array); + CeedVectorRestoreArrayRead(qf_assembled, &assembled_array); + } + + // Apply new Mass Operator + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar area = 0.0; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes; i++) area += v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + if (fabs(area - 1.0 * num_elem) > FLT_EPSILON) printf("Error: Linearized operator computed area = %f != 1.0\n", area); + } + + // 
Cleanup + CeedVectorDestroy(&qf_assembled); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&x_elem); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_assembled); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t593-operator-mixed.c b/tests/t593-operator-mixed.c new file mode 100644 index 0000000000..dfd778fe18 --- /dev/null +++ b/tests/t593-operator-mixed.c @@ -0,0 +1,155 @@ +/// @file +/// Test 1D mass matrix operator at points with heterogeneous points per element with mixed precision +/// \test Test 1D mass matrix operator at points with heterogeneous points per element with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t500-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem = 3, dim = 1, p = 3, q = 5; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1, num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p], ind_x_points[num_elem + 1 + num_points]; + CeedScalar x_array_mesh[num_nodes_x], x_array_points[num_points]; + CeedVector x_points = NULL, x_elem = NULL, q_data = NULL, u = NULL, v = NULL; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + bool is_at_points; + + CeedInit(argv[1], &ceed); + + // Mesh coordinates + for (CeedInt i = 0; i < num_nodes_x; i++) x_array_mesh[i] = (CeedScalar)i / (num_nodes_x - 1); + for (CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedVectorCreate(ceed, num_nodes_x, &x_elem); + CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_USE_POINTER, x_array_mesh); + + // U mesh + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = i * (p - 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + // Point reference coordinates + { + CeedScalar weight_tmp[num_points_per_elem + 1]; + CeedInt current_index = 0; + + // Use num_points_per_elem + 1 to test non-uniform quadrature + CeedGaussQuadrature(num_points_per_elem + 1, x_array_points, weight_tmp); + ind_x_points[0] = num_elem + 1; + for (CeedInt p = 0; p < num_points_per_elem + 1; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + // Use num_points_per_elem for middle elements + for (CeedInt e = 1; e < num_elem - 1; e++) { + CeedGaussQuadrature(num_points_per_elem, &x_array_points[current_index], weight_tmp); + ind_x_points[e] = num_elem + 1 + current_index; + for (CeedInt p = 0; p < num_points_per_elem; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + } + // Use num_points_per_elem - 1 to test
non-uniform quadrature + CeedGaussQuadrature(num_points_per_elem - 1, &x_array_points[current_index], weight_tmp); + ind_x_points[num_elem - 1] = num_elem + 1 + current_index; + for (CeedInt p = 0; p < num_points_per_elem - 1; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + ind_x_points[num_elem] = num_elem + 1 + current_index; + + CeedVectorCreate(ceed, num_elem * num_points_per_elem, &x_points); + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_USE_POINTER, x_array_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points, + &elem_restriction_q_data); + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + } + + // Basis creation + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + + // Setup geometric scaling + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + CeedOperatorSetMixedPrecision(op_setup); + + CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + CeedOperatorIsAtPoints(op_mass, &is_at_points); + if (!is_at_points) printf("Error: Operator should be at points\n"); + + CeedVectorCreate(ceed, num_nodes_u, &u); + CeedVectorSetValue(u, 0.0); + CeedVectorCreate(ceed, num_nodes_u, &v); + + // Assemble QFunction + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + if (fabs(v_array[i]) > FLT_EPSILON) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); + } + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Cleanup + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&x_elem); + 
CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t594-operator-mixed.c b/tests/t594-operator-mixed.c new file mode 100644 index 0000000000..fee1b94857 --- /dev/null +++ b/tests/t594-operator-mixed.c @@ -0,0 +1,182 @@ +/// @file +/// Test diagonal assembly of mass matrix operator at points with mixed precision +/// \test Test diagonal assembly of mass matrix operator at points with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t500-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem = 3, dim = 1, p = 3, q = 5; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1, num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * p], ind_x_points[num_elem + 1 + num_points]; + CeedScalar x_array_mesh[num_nodes_x], x_array_points[num_points], assembled_true[num_nodes_u]; + CeedVector x_points = NULL, x_elem = NULL, q_data = NULL, u = NULL, v = NULL, assembled = NULL; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + bool is_at_points; + + CeedInit(argv[1], &ceed); + + // Mesh coordinates + for (CeedInt i = 0; i < num_nodes_x; i++) x_array_mesh[i] = (CeedScalar)i / (num_nodes_x - 1); + for (CeedInt i = 0; i < num_elem; i++) { + ind_x[2 * i + 0] = i; + ind_x[2 * i + 1] = i + 1; + } + CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedVectorCreate(ceed, num_nodes_x, &x_elem); + CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_USE_POINTER, x_array_mesh); + + // U mesh + for (CeedInt i = 0; i < num_elem; i++) { + for (CeedInt j = 0; j < p; j++) { + ind_u[p * i + j] = i * (p - 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u); + + // Point reference coordinates + { + CeedScalar weight_tmp[num_points_per_elem + 1]; + CeedInt current_index = 0; + + // Use num_points_per_elem + 1 to test non-uniform quadrature + CeedGaussQuadrature(num_points_per_elem + 1, x_array_points, weight_tmp); + ind_x_points[0] = num_elem + 1; + for (CeedInt p = 0; p < num_points_per_elem + 1; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + // Use num_points_per_elem for middle elements + for (CeedInt e = 1; e < num_elem - 1; e++) { + CeedGaussQuadrature(num_points_per_elem, &x_array_points[current_index], weight_tmp); + ind_x_points[e] = num_elem + 1 + current_index; + for (CeedInt p = 0; p < num_points_per_elem; p++, current_index++) { + ind_x_points[num_elem + 1 + current_index] = current_index; + } + } + // Use num_points_per_elem - 1 to test non-uniform quadrature + CeedGaussQuadrature(num_points_per_elem - 1, &x_array_points[current_index], weight_tmp); + ind_x_points[num_elem - 1] = num_elem + 1 + current_index; + for (CeedInt p = 0; p < num_points_per_elem - 1; p++, current_index++) { +
ind_x_points[num_elem + 1 + current_index] = current_index; + } + ind_x_points[num_elem] = num_elem + 1 + current_index; + + CeedVectorCreate(ceed, num_elem * num_points_per_elem, &x_points); + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_USE_POINTER, x_array_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points, + &elem_restriction_q_data); + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + } + + // Basis creation + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u); + + // Setup geometric scaling + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + CeedOperatorIsAtPoints(op_mass, &is_at_points); + if (!is_at_points) printf("Error: Operator should be at points\n"); + + CeedVectorCreate(ceed, num_nodes_u, &u); + CeedVectorSetValue(u, 0.0); + CeedVectorCreate(ceed, num_nodes_u, &v); + + // Assemble diagonal + CeedVectorCreate(ceed, num_nodes_u, &assembled); + CeedOperatorLinearAssembleDiagonal(op_mass, assembled, CEED_REQUEST_IMMEDIATE); + + // Manually assemble diagonal + CeedVectorSetValue(u, 0.0); + for (CeedInt i = 0; i < num_nodes_u; i++) { + CeedScalar *u_array; + const CeedScalar *v_array; + + // Set input + CeedVectorGetArray(u, CEED_MEM_HOST, &u_array); + u_array[i] = 1.0; + if (i) u_array[i - 1] = 0.0; + CeedVectorRestoreArray(u, &u_array); + + // Compute diag entry for DoF i + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + // Retrieve entry + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + assembled_true[i] = v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Check output + { + const CeedScalar *assembled_array; + + CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, 
&assembled_array); + for (CeedInt i = 0; i < num_nodes_u; i++) { + if (fabs(assembled_array[i] - assembled_true[i]) > FLT_EPSILON) { + // LCOV_EXCL_START + printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, assembled_array[i], assembled_true[i]); + // LCOV_EXCL_STOP + } + } + CeedVectorRestoreArrayRead(assembled, &assembled_array); + } + + // Cleanup + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&x_elem); + CeedVectorDestroy(&assembled); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t596-operator-mixed.c b/tests/t596-operator-mixed.c new file mode 100644 index 0000000000..4341d16912 --- /dev/null +++ b/tests/t596-operator-mixed.c @@ -0,0 +1,205 @@ +/// @file +/// Test full assembly of mass matrix operator AtPoints with mixed precision +/// \test Test full assembly of mass matrix operator AtPoints with mixed precision +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "t596-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) { + CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, x, x_points, u, v; + CeedInt p = 3, q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt ind_x[num_elem * p * p]; + CeedScalar assembled_values[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar assembled_true[num_comp * num_comp * num_dofs * num_dofs]; + + // Points + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ?
0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Vectors + CeedVectorCreate(ceed, dim * num_dofs, &x); + { + CeedScalar x_array[dim * num_dofs]; + + for (CeedInt i = 0; i < n_x * 2 + 1; i++) { + for (CeedInt j = 0; j < n_y * 2 + 1; j++) { + x_array[i + j * (n_x * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_x); + x_array[i + j * (n_x * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_y); + } + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_comp * num_dofs, &u); + CeedVectorCreate(ceed, num_comp * num_dofs, &v); + CeedVectorCreate(ceed, num_points, &q_data); + + // Restrictions + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt col, row, offset; + col = i % n_x; + row = i / n_x; + offset = col * (p - 1) + row * (n_x * 2 + 1) * (p - 1); + for (CeedInt j = 0; j < p; j++) { + for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (n_x * 2 + 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedElemRestrictionCreate(ceed, num_elem, p * p, num_comp, num_dofs, num_comp * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, + &elem_restriction_u); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis_u); + + // QFunctions + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddInput(qf_setup, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_mass, "u", num_comp, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_mass, "v", num_comp, CEED_EVAL_INTERP); + { + CeedQFunctionContext qf_context; + + CeedQFunctionContextCreate(ceed, &qf_context); + CeedQFunctionContextSetData(qf_context, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(CeedInt), &num_comp); + CeedQFunctionSetContext(qf_mass, qf_context); + CeedQFunctionContextDestroy(&qf_context); + } + + // Operators + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); + CeedOperatorSetField(op_mass, 
"rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); + + // Apply Setup Operator + CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); + + // Fully assemble operator + CeedSize num_entries; + CeedInt *rows; + CeedInt *cols; + CeedVector assembled; + + for (CeedInt k = 0; k < num_comp * num_comp * num_dofs * num_dofs; ++k) { + assembled_values[k] = 0.0; + assembled_true[k] = 0.0; + } + CeedOperatorLinearAssembleSymbolic(op_mass, &num_entries, &rows, &cols); + CeedVectorCreate(ceed, num_entries, &assembled); + CeedOperatorLinearAssemble(op_mass, assembled); + { + const CeedScalar *assembled_array; + + CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array); + for (CeedInt k = 0; k < num_entries; k++) { + assembled_values[rows[k] * num_comp * num_dofs + cols[k]] += assembled_array[k]; + } + CeedVectorRestoreArrayRead(assembled, &assembled_array); + } + + // Manually assemble operator + CeedVectorSetValue(u, 0.0); + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + CeedScalar *u_array; + const CeedScalar *v_array; + + // Set input + CeedVectorGetArray(u, CEED_MEM_HOST, &u_array); + u_array[j] = 1.0; + if (j) u_array[j - 1] = 0.0; + CeedVectorRestoreArray(u, &u_array); + + // Compute entries for column j + CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_comp * num_dofs; i++) assembled_true[i * num_comp * num_dofs + j] = v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Check output + for (CeedInt i = 0; i < num_comp * num_dofs; i++) { + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + if (fabs(assembled_values[i * num_dofs * num_comp + j] - assembled_true[i * num_dofs * num_comp + j]) > FLT_EPSILON) { + // LCOV_EXCL_START + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_dofs * num_comp + j], + assembled_true[i * num_dofs * num_comp + j]); + // LCOV_EXCL_STOP + } + } + } + + // Cleanup + free(rows); + free(cols); + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&assembled); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t597-operator-mixed.c b/tests/t597-operator-mixed.c new file mode 100644 index 0000000000..90d36eda69 --- /dev/null +++ b/tests/t597-operator-mixed.c @@ -0,0 +1,206 @@ +/// @file +/// Test full assembly of Poisson operator AtPoints with mixed precision +/// \test Test full assembly of Poisson operator AtPoints with mixed precision +#include +#include +#include +#include +#include + +#include "t597-operator.h" + +int main(int argc, char **argv) { + Ceed ceed; + + CeedInit(argv[1], &ceed); + + for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) { + 
CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_diff; + CeedOperator op_setup, op_diff; + CeedVector q_data, x, x_points, u, v; + CeedInt p = 3, q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt ind_x[num_elem * p * p]; + CeedScalar assembled_values[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar assembled_true[num_comp * num_comp * num_dofs * num_dofs]; + + // Points + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim * (dim + 1) / 2, num_points * dim * (dim + 1) / 2, CEED_MEM_HOST, + CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Vectors + CeedVectorCreate(ceed, dim * num_dofs, &x); + { + CeedScalar x_array[dim * num_dofs]; + + for (CeedInt i = 0; i < n_x * 2 + 1; i++) { + for (CeedInt j = 0; j < n_y * 2 + 1; j++) { + x_array[i + j * (n_x * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_x); + x_array[i + j * (n_x * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_y); + } + } + CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + CeedVectorCreate(ceed, num_comp * num_dofs, &u); + CeedVectorCreate(ceed, num_comp * num_dofs, &v); + CeedVectorCreate(ceed, num_points * dim * (dim + 1) / 2, &q_data); + + // Restrictions + for (CeedInt i = 0; i < num_elem; i++) { + CeedInt col, row, offset; + col = i % n_x; + row = i / n_x; + offset = col * (p - 1) + row * (n_x * 2 + 1) * (p - 1); + for (CeedInt j = 0; j < p; j++) { + for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (n_x * 2 + 1) + j; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x); + CeedElemRestrictionCreate(ceed, num_elem, p * p, num_comp, num_dofs, num_comp * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, + &elem_restriction_u); + + // Bases + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p, q, CEED_GAUSS, &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis_u); + + // QFunction - setup + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "dx", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_setup, "q data", dim * (dim + 1) / 2, CEED_EVAL_NONE); + + // Operator - setup + CeedOperatorCreateAtPoints(ceed, qf_setup, 
CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "q data", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + // Apply Setup Operator + CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); + + // QFunction - apply + CeedQFunctionCreateInterior(ceed, 1, diff, diff_loc, &qf_diff); + CeedQFunctionAddInput(qf_diff, "du", num_comp * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_diff, "q data", dim * (dim + 1) / 2, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_diff, "dv", num_comp * dim, CEED_EVAL_GRAD); + { + CeedQFunctionContext qf_context; + + CeedQFunctionContextCreate(ceed, &qf_context); + CeedQFunctionContextSetData(qf_context, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(CeedInt), &num_comp); + CeedQFunctionSetContext(qf_diff, qf_context); + CeedQFunctionContextDestroy(&qf_context); + } + + // Operator - apply + CeedOperatorCreateAtPoints(ceed, qf_diff, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff); + CeedOperatorSetField(op_diff, "du", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_diff, "q data", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_diff); + CeedOperatorAtPointsSetPoints(op_diff, elem_restriction_x_points, x_points); + + // Fully assemble operator + CeedSize num_entries; + CeedInt *rows; + CeedInt *cols; + CeedVector assembled; + + for (CeedInt k = 0; k < num_comp * num_comp * num_dofs * num_dofs; ++k) { + assembled_values[k] = 0.0; + assembled_true[k] = 0.0; + } + CeedOperatorLinearAssembleSymbolic(op_diff, &num_entries, &rows, &cols); + CeedVectorCreate(ceed, num_entries, &assembled); + CeedOperatorLinearAssemble(op_diff, assembled); + { + const CeedScalar *assembled_array; + + CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array); + for (CeedInt k = 0; k < num_entries; k++) assembled_values[rows[k] * num_comp * num_dofs + cols[k]] += assembled_array[k]; + CeedVectorRestoreArrayRead(assembled, &assembled_array); + } + + // Manually assemble operator + CeedVectorSetValue(u, 0.0); + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + CeedScalar *u_array; + const CeedScalar *v_array; + + // Set input + CeedVectorGetArray(u, CEED_MEM_HOST, &u_array); + u_array[j] = 1.0; + if (j) u_array[j - 1] = 0.0; + CeedVectorRestoreArray(u, &u_array); + + // Compute entries for column j + CeedOperatorApply(op_diff, u, v, CEED_REQUEST_IMMEDIATE); + + CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_comp * num_dofs; i++) assembled_true[i * num_comp * num_dofs + j] = v_array[i]; + CeedVectorRestoreArrayRead(v, &v_array); + } + + // Check output + for (CeedInt i = 0; i < num_comp * num_dofs; i++) { + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + if (fabs(assembled_values[i * num_comp * num_dofs + j] - assembled_true[i * num_comp * num_dofs + j]) > FLT_EPSILON) { + // LCOV_EXCL_START + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_comp * num_dofs + j], + assembled_true[i * num_comp * num_dofs + j]); + // LCOV_EXCL_STOP + } + } + } + + // Cleanup + 
free(rows); + free(cols); + CeedVectorDestroy(&x); + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&u); + CeedVectorDestroy(&v); + CeedVectorDestroy(&assembled); + CeedElemRestrictionDestroy(&elem_restriction_u); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedBasisDestroy(&basis_u); + CeedBasisDestroy(&basis_x); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_diff); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_diff); + } + CeedDestroy(&ceed); + return 0; +} diff --git a/tests/t598-operator-mixed.c b/tests/t598-operator-mixed.c new file mode 100644 index 0000000000..2d8bb2b8a5 --- /dev/null +++ b/tests/t598-operator-mixed.c @@ -0,0 +1,282 @@ +/// @file +/// Test creation, action, and destruction for mass matrix operator AtPoints with mixed precision +/// \test Test creation, action, and destruction for mass matrix operator AtPoints with mixed precision +#include "t591-operator.h" + +#include <ceed.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +int main(int argc, char **argv) { + Ceed ceed; + CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p_coarse = 2, p_fine = 3, q = 5; + CeedInt num_points_per_elem = 4, num_points = num_elem * num_points_per_elem; + CeedInt num_nodes_coarse = (num_elem_1d * (p_coarse - 1) + 1) * (num_elem_1d * (p_coarse - 1) + 1); + CeedInt num_nodes_fine = (num_elem_1d * (p_fine - 1) + 1) * (num_elem_1d * (p_fine - 1) + 1); + CeedVector x_points, x_elem, q_data, u_coarse, u_fine, v_coarse, v_fine, p_mult_fine; + CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u_coarse, elem_restriction_u_fine; + CeedBasis basis_x, basis_u_coarse, basis_u_fine; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass_coarse, op_mass_fine, op_prolong, op_restrict; + + CeedInit(argv[1], &ceed); + + // Point reference coordinates + CeedVectorCreate(ceed, dim * num_points, &x_points); + { + CeedScalar x_array[dim * num_points]; + + for (CeedInt e = 0; e < num_elem; e++) { + for (CeedInt d = 0; d < dim; d++) { + x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25; + x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25; + x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ?
0.25 : -0.25; + x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25; + } + } + CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + { + CeedInt ind_x[num_elem + 1 + num_points]; + + for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem; + for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i; + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, + &elem_restriction_x_points); + CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data); + } + + // Q data + CeedVectorCreate(ceed, num_points, &q_data); + + // Cell coordinates + { + CeedInt p = 2, num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1); + CeedInt ind_x[num_elem * p * p]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_x + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p; + } + elem_nodes[n] = p * g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, 1, dim * num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_x); + CeedVectorCreate(ceed, dim * num_nodes, &x_elem); + { + CeedScalar x_array[dim * num_nodes]; + + for (CeedInt i = 0; i <= num_elem_1d; i++) { + for (CeedInt j = 0; j <= num_elem_1d; j++) { + x_array[(i * (num_elem_1d + 1) + j) * dim + 0] = j; + x_array[(i * (num_elem_1d + 1) + j) * dim + 1] = i; + } + } + CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_COPY_VALUES, x_array); + } + } + + CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x); + + // Cell solution + { + CeedInt ind_u[num_elem * p_coarse * p_coarse]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p_coarse - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } + } + CeedInt num_nodes_in_elem = p_coarse * p_coarse, *elem_nodes = ind_u + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p_coarse - 1) + r_node % p_coarse) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p_coarse; + } + elem_nodes[n] = g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p_coarse * p_coarse, 1, 1, num_nodes_coarse, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, + &elem_restriction_u_coarse); + } + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_coarse, q, CEED_GAUSS, &basis_u_coarse); + { + CeedInt ind_u[num_elem * p_fine * p_fine]; + + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0}; + + for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p_fine - 1) + 1; + { + CeedInt r_e = e; + + for (CeedInt d = 0; d < dim; d++) { + elem_xy[d] = r_e % num_elem_1d; + r_e /= num_elem_1d; + } 
+ } + CeedInt num_nodes_in_elem = p_fine * p_fine, *elem_nodes = ind_u + e * num_nodes_in_elem; + + for (CeedInt n = 0; n < num_nodes_in_elem; n++) { + CeedInt g_node = 0, g_node_stride = 1, r_node = n; + + for (CeedInt d = 0; d < dim; d++) { + g_node += (elem_xy[d] * (p_fine - 1) + r_node % p_fine) * g_node_stride; + g_node_stride *= n_d[d]; + r_node /= p_fine; + } + elem_nodes[n] = g_node; + } + } + CeedElemRestrictionCreate(ceed, num_elem, p_fine * p_fine, 1, 1, num_nodes_fine, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, + &elem_restriction_u_fine); + } + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_fine, q, CEED_GAUSS, &basis_u_fine); + + // Setup geometric scaling + CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup); + CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE); + + CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup); + CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); + + CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); + + // Mass operator + CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass); + CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); + + CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_fine); + CeedOperatorSetField(op_mass_fine, "u", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass_fine, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetField(op_mass_fine, "v", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); + CeedOperatorSetMixedPrecision(op_mass_fine); + CeedOperatorAtPointsSetPoints(op_mass_fine, elem_restriction_x_points, x_points); + + CeedVectorCreate(ceed, num_nodes_fine, &u_fine); + CeedVectorCreate(ceed, num_nodes_fine, &v_fine); + CeedVectorCreate(ceed, num_nodes_fine, &p_mult_fine); + CeedVectorCreate(ceed, num_nodes_coarse, &u_coarse); + CeedVectorCreate(ceed, num_nodes_coarse, &v_coarse); + + // Create multigrid level + CeedVectorSetValue(p_mult_fine, 1.0); + CeedOperatorMultigridLevelCreate(op_mass_fine, p_mult_fine, elem_restriction_u_coarse, basis_u_coarse, &op_mass_coarse, &op_prolong, &op_restrict); + + // Coarse problem + CeedVectorSetValue(u_coarse, 1.0); + CeedOperatorApply(op_mass_coarse, u_coarse, v_coarse, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v_coarse, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_coarse; i++) { + sum += v_array[i]; + } + CeedVectorRestoreArrayRead(v_coarse, &v_array); + if (fabs(sum - num_elem) > 1000. 
* FLT_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, num_elem); + } + + // Prolong coarse u + CeedOperatorApply(op_prolong, u_coarse, u_fine, CEED_REQUEST_IMMEDIATE); + + // Fine problem + CeedOperatorApply(op_mass_fine, u_fine, v_fine, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v_fine, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_fine; i++) { + sum += v_array[i]; + } + CeedVectorRestoreArrayRead(v_fine, &v_array); + + if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Fine Grid: %f != True Area: %f\n", sum, num_elem); + } + // Restrict state to coarse grid + CeedOperatorApply(op_restrict, v_fine, v_coarse, CEED_REQUEST_IMMEDIATE); + + // Check output + { + const CeedScalar *v_array; + CeedScalar sum = 0.; + + CeedVectorGetArrayRead(v_coarse, CEED_MEM_HOST, &v_array); + for (CeedInt i = 0; i < num_nodes_coarse; i++) { + sum += v_array[i]; + } + CeedVectorRestoreArrayRead(v_coarse, &v_array); + if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, num_elem); + } + + CeedVectorDestroy(&x_points); + CeedVectorDestroy(&q_data); + CeedVectorDestroy(&x_elem); + CeedVectorDestroy(&u_coarse); + CeedVectorDestroy(&u_fine); + CeedVectorDestroy(&v_fine); + CeedVectorDestroy(&v_coarse); + CeedVectorDestroy(&p_mult_fine); + CeedElemRestrictionDestroy(&elem_restriction_x_points); + CeedElemRestrictionDestroy(&elem_restriction_q_data); + CeedElemRestrictionDestroy(&elem_restriction_x); + CeedElemRestrictionDestroy(&elem_restriction_u_coarse); + CeedElemRestrictionDestroy(&elem_restriction_u_fine); + CeedBasisDestroy(&basis_x); + CeedBasisDestroy(&basis_u_coarse); + CeedBasisDestroy(&basis_u_fine); + CeedQFunctionDestroy(&qf_setup); + CeedQFunctionDestroy(&qf_mass); + CeedOperatorDestroy(&op_setup); + CeedOperatorDestroy(&op_mass_coarse); + CeedOperatorDestroy(&op_mass_fine); + CeedOperatorDestroy(&op_prolong); + CeedOperatorDestroy(&op_restrict); + CeedDestroy(&ceed); + return 0; +} From 766d575ef3435d36abec09e79fac14aec0ba81bd Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Thu, 10 Jul 2025 17:24:01 -0600 Subject: [PATCH 2/5] Tweak tolerances --- interface/ceed-basis.c | 2 +- tests/t360-basis.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c index 9a99ae6611..dd313c8947 100644 --- a/interface/ceed-basis.c +++ b/interface/ceed-basis.c @@ -1303,7 +1303,7 @@ int CeedSymmetricSchurDecomposition(Ceed ceed, CeedScalar *mat, CeedScalar *lamb // Reduce sub and super diagonal CeedInt p = 0, q = 0, itr = 0, max_itr = n * n * n * n; - CeedScalar tol = CEED_EPSILON; + CeedScalar tol = 10 * CEED_EPSILON; while (itr < max_itr) { // Update p, q, size of reduced portions of diagonal diff --git a/tests/t360-basis.c b/tests/t360-basis.c index f953157e1c..5e8a3fbe2b 100644 --- a/tests/t360-basis.c +++ b/tests/t360-basis.c @@ -40,7 +40,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < p_dim; i++) area += v_array[i]; - if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 5E-6) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim)); + if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 1E-5) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim)); CeedVectorRestoreArrayRead(v, &v_array); } From 4be129e8445f86e580171452ffbb7fcb74a2deda Mon Sep 17 00:00:00 2001 From: 
Zach Atkins Date: Thu, 10 Jul 2025 18:18:06 -0600 Subject: [PATCH 3/5] change CEED_EPSILON to true constant to appease bindgen --- include/ceed/ceed-f32.h | 6 +++++- include/ceed/ceed-f64.h | 8 ++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h index 39d2fb1187..9aa5e4a226 100644 --- a/include/ceed/ceed-f32.h +++ b/include/ceed/ceed-f32.h @@ -10,6 +10,10 @@ /// Include this header in ceed.h to use float instead of double. #pragma once +#ifndef CEED_RUNNING_JIT_PASS +#include <float.h> +#endif + #define CEED_SCALAR_IS_FP32 /// Set base scalar type to FP32. (See CeedScalarType enum in ceed.h for all options.) @@ -18,4 +22,4 @@ typedef float CeedScalar; typedef CeedScalar CeedScalarCPU; /// Machine epsilon -#define CEED_EPSILON 0x1p-23 +static const CeedScalar CEED_EPSILON = FLT_EPSILON; diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h index 1e3a7fd7bf..ddfb56e6d3 100644 --- a/include/ceed/ceed-f64.h +++ b/include/ceed/ceed-f64.h @@ -10,6 +10,10 @@ /// This is the default header included in ceed.h. #pragma once +#ifndef CEED_RUNNING_JIT_PASS +#include <float.h> +#endif + #define CEED_SCALAR_IS_FP64 /// Set base scalar type to FP64. (See CeedScalarType enum in ceed.h for all options.) @@ -19,11 +23,11 @@ typedef float CeedScalar; typedef double CeedScalarCPU; /// Machine epsilon -#define CEED_EPSILON 0x1p-23 +static const CeedScalar CEED_EPSILON = FLT_EPSILON; #else typedef double CeedScalar; typedef CeedScalar CeedScalarCPU; /// Machine epsilon -#define CEED_EPSILON 0x1p-52 +static const CeedScalar CEED_EPSILON = DBL_EPSILON; #endif // CEED_RUNNING_JIT_PASS && CEED_JIT_MIXED_PRECISION From 767aa77280a7e59a5aa19f7e2fef044ad0898d8f Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Fri, 11 Jul 2025 15:13:39 -0600 Subject: [PATCH 4/5] Ensure DBL_EPSILON and FLT_EPSILON are defined --- include/ceed/jit-source/cuda/cuda-jit.h | 7 +++++++ include/ceed/jit-source/hip/hip-jit.h | 7 +++++++ include/ceed/jit-source/sycl/sycl-jit.h | 7 +++++++ 3 files changed, 21 insertions(+) diff --git a/include/ceed/jit-source/cuda/cuda-jit.h b/include/ceed/jit-source/cuda/cuda-jit.h index baa15a1e85..22a0e903b2 100644 --- a/include/ceed/jit-source/cuda/cuda-jit.h +++ b/include/ceed/jit-source/cuda/cuda-jit.h @@ -13,4 +13,11 @@ #define CeedPragmaSIMD #define CEED_Q_VLA 1 +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.22044604925031308084726333618164062e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209289550781250000000000000000000e-7F +#endif + #include "cuda-types.h" diff --git a/include/ceed/jit-source/hip/hip-jit.h b/include/ceed/jit-source/hip/hip-jit.h index 70a00416e4..03040e6908 100644 --- a/include/ceed/jit-source/hip/hip-jit.h +++ b/include/ceed/jit-source/hip/hip-jit.h @@ -13,4 +13,11 @@ #define CeedPragmaSIMD #define CEED_Q_VLA 1 +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.22044604925031308084726333618164062e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209289550781250000000000000000000e-7F +#endif + #include "hip-types.h" diff --git a/include/ceed/jit-source/sycl/sycl-jit.h b/include/ceed/jit-source/sycl/sycl-jit.h index 1a2971f4df..b42aa5304f 100644 --- a/include/ceed/jit-source/sycl/sycl-jit.h +++ b/include/ceed/jit-source/sycl/sycl-jit.h @@ -13,5 +13,12 @@ #define CeedPragmaSIMD #define CEED_Q_VLA 1 +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.22044604925031308084726333618164062e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209289550781250000000000000000000e-7F +#endif + // Need quotes for recursive header inclusion #include "sycl-types.h"
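A note on the fallback constants added in PATCH 4/5 above: the decimal literals are the exact values of the IEEE-754 machine epsilons, FLT_EPSILON = 2^-23 and DBL_EPSILON = 2^-52, i.e. the same numbers the 0x1p-23 and 0x1p-52 hex floats removed by PATCH 3/5 encoded. A minimal host-side C check of this equivalence (illustration only, not part of the patch series; assumes IEEE-754 float and double):

#include <float.h>
#include <stdio.h>

int main(void) {
  // Both lines print 1: each long decimal literal parses to exactly 2^-23f / 2^-52,
  // and those are the standard machine epsilons on IEEE-754 platforms.
  printf("%d\n", FLT_EPSILON == 0x1p-23F && 0x1p-23F == 1.19209289550781250000000000000000000e-7F);
  printf("%d\n", DBL_EPSILON == 0x1p-52 && 0x1p-52 == 2.22044604925031308084726333618164062e-16);
  return 0;
}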
inclusion #include "sycl-types.h" From 873a330eff1e7b699f162d895613cf805c07aa61 Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Fri, 11 Jul 2025 16:20:06 -0600 Subject: [PATCH 5/5] Change operator precision to a more flexible interface --- .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 38 +++++++++---------- .../hip-gen/ceed-hip-gen-operator-build.cpp | 4 +- include/ceed-impl.h | 2 +- include/ceed/ceed-f32.h | 10 +++++ include/ceed/ceed-f64.h | 4 +- include/ceed/ceed.h | 4 +- interface/ceed-operator.c | 22 ++++++----- tests/t502-operator-mixed.c | 4 +- tests/t503-operator-mixed.c | 4 +- tests/t505-operator-mixed.c | 4 +- tests/t506-operator-mixed.c | 11 ++++-- tests/t510-operator-mixed.c | 4 +- tests/t520-operator-mixed.c | 8 ++-- tests/t522-operator-mixed.c | 8 ++-- tests/t591-operator-mixed.c | 4 +- tests/t592-operator-mixed.c | 4 +- tests/t593-operator-mixed.c | 4 +- tests/t594-operator-mixed.c | 4 +- tests/t596-operator-mixed.c | 4 +- tests/t597-operator-mixed.c | 4 +- tests/t598-operator-mixed.c | 10 ++--- 21 files changed, 89 insertions(+), 72 deletions(-) diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 59fcf03d42..a0d1553635 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -1572,17 +1572,17 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b // Compile { - bool is_compile_good = false; - const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); - bool use_mixed_precision; + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + CeedScalarType precision; // Check for mixed precision - CeedCallBackend(CeedOperatorGetMixedPrecision(op, &use_mixed_precision)); + CeedCallBackend(CeedOperatorGetPrecision(op, &precision)); data->thread_1d = T_1d; - if (use_mixed_precision) { - CeedCallBackend( - CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", T_1d, "CEED_JIT_MIXED_PRECISION", 1)); + if (precision) { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", T_1d, "CEED_JIT_PRECISION", + (CeedInt)precision)); } else { CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d)); } @@ -2052,18 +2052,18 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo // Compile { - bool is_compile_good = false; - const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); - bool use_mixed_precision; + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + CeedScalarType precision; // Check for mixed precision - CeedCallBackend(CeedOperatorGetMixedPrecision(op, &use_mixed_precision)); + CeedCallBackend(CeedOperatorGetPrecision(op, &precision)); data->thread_1d = T_1d; - if (use_mixed_precision) { + if (precision) { CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", T_1d, - "CEED_JIT_MIXED_PRECISION", 1)); + "CEED_JIT_PRECISION", (CeedInt)precision)); } else { CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, is_full ? 
&data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d)); @@ -2642,17 +2642,17 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera // Compile { - bool is_compile_good = false; - const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); - bool use_mixed_precision; + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + CeedScalarType precision; // Check for mixed precision - CeedCallBackend(CeedOperatorGetMixedPrecision(op, &use_mixed_precision)); + CeedCallBackend(CeedOperatorGetPrecision(op, &precision)); data->thread_1d = T_1d; - if (use_mixed_precision) { + if (precision) { CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", T_1d, - "CEED_JIT_MIXED_PRECISION", 1)); + "CEED_JIT_PRECISION", (CeedInt)precision)); } else { CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d)); } diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp index 674f94b428..f4ed5313d6 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp +++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp @@ -2483,8 +2483,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { - code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" - << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; + code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" + << "dim_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } else { code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } diff --git a/include/ceed-impl.h b/include/ceed-impl.h index e8f6976736..d555444c62 100644 --- a/include/ceed-impl.h +++ b/include/ceed-impl.h @@ -381,7 +381,7 @@ struct CeedOperator_private { bool is_composite; bool is_at_points; bool has_restriction; - bool use_mixed_precision; + CeedScalarType precision; CeedQFunctionAssemblyData qf_assembled; CeedOperatorAssemblyData op_assembled; CeedOperator *sub_operators; diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h index 9aa5e4a226..2382b10c95 100644 --- a/include/ceed/ceed-f32.h +++ b/include/ceed/ceed-f32.h @@ -18,8 +18,18 @@ /// Set base scalar type to FP32. (See CeedScalarType enum in ceed.h for all options.) #define CEED_SCALAR_TYPE CEED_SCALAR_FP32 +#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE) +#if CEED_JIT_PRECISION == CEED_SCALAR_FP64 +typedef double CeedScalar; +typedef float CeedScalarCPU; + +/// Machine epsilon +static const CeedScalar CEED_EPSILON = DBL_EPSILON; +#endif // CEED_JIT_PRECISION +#else typedef float CeedScalar; typedef CeedScalar CeedScalarCPU; /// Machine epsilon static const CeedScalar CEED_EPSILON = FLT_EPSILON; +#endif diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h index ddfb56e6d3..22c7e694f5 100644 --- a/include/ceed/ceed-f64.h +++ b/include/ceed/ceed-f64.h @@ -18,12 +18,14 @@ /// Set base scalar type to FP64. 
(See CeedScalarType enum in ceed.h for all options.) #define CEED_SCALAR_TYPE CEED_SCALAR_FP64 -#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_MIXED_PRECISION) +#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE) +#if CEED_JIT_PRECISION == CEED_SCALAR_FP32 typedef float CeedScalar; typedef double CeedScalarCPU; /// Machine epsilon static const CeedScalar CEED_EPSILON = FLT_EPSILON; +#endif // CEED_JIT_PRECISION #else typedef double CeedScalar; typedef CeedScalar CeedScalarCPU; diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h index 46d22b5dab..fbd5075333 100644 --- a/include/ceed/ceed.h +++ b/include/ceed/ceed.h @@ -427,8 +427,8 @@ CEED_EXTERN int CeedOperatorCheckReady(CeedOperator op); CEED_EXTERN int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size); CEED_EXTERN int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data); CEED_EXTERN int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs_data_update); -CEED_EXTERN int CeedOperatorSetMixedPrecision(CeedOperator op); -CEED_EXTERN int CeedOperatorGetMixedPrecision(CeedOperator op, bool *use_mixed_precision); +CEED_EXTERN int CeedOperatorSetPrecision(CeedOperator op, CeedScalarType precision); +CEED_EXTERN int CeedOperatorGetPrecision(CeedOperator op, CeedScalarType *precision); CEED_EXTERN int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request); CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request); diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c index 6c2a0cc5cd..902f62ffae 100644 --- a/interface/ceed-operator.c +++ b/interface/ceed-operator.c @@ -637,25 +637,27 @@ int CeedOperatorIsSetupDone(CeedOperator op, bool *is_setup_done) { /** - @brief Set a `CeedOperator` to use reduced precision for operator application + @brief Set the precision a `CeedOperator` uses for operator application - @param[in] op `CeedOperator` + @param[in] op `CeedOperator` + @param[in] precision `CeedScalarType` to use for operator application @return An error code: 0 - success, otherwise - failure @ref User **/ -int CeedOperatorSetMixedPrecision(CeedOperator op) { +int CeedOperatorSetPrecision(CeedOperator op, CeedScalarType precision) { bool is_immutable, is_composite, supports_mixed_precision; Ceed ceed; CeedCall(CeedOperatorGetCeed(op, &ceed)); CeedCall(CeedOperatorIsImmutable(op, &is_immutable)); - CeedCheck(!is_immutable, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetMixedPrecision must be called before operator is finalized"); + CeedCheck(!is_immutable, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetPrecision must be called before operator is finalized"); CeedCall(CeedOperatorIsComposite(op, &is_composite)); - CeedCheck(!is_composite, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetMixedPrecision should be set on single operators"); + CeedCheck(!is_composite, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetPrecision must be called on single, non-composite operators"); CeedCall(CeedGetSupportsMixedPrecision(ceed, &supports_mixed_precision)); - CeedCheck(supports_mixed_precision, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement mixed precision operators"); + CeedCheck(precision == CEED_SCALAR_TYPE || supports_mixed_precision, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement mixed precision operators"); - op->use_mixed_precision = true; + op->precision = precision; 
CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -663,15 +665,15 @@ int CeedOperatorSetMixedPrecision(CeedOperator op) { /** - @brief Get whether a `CeedOperator` is set to use reduced precision for operator application + @brief Get the precision a `CeedOperator` is set to use for operator application - @param[in] op `CeedOperator` - @param[out] use_mixed_precision Variable to store `CeedQFunction` + @param[in] op `CeedOperator` + @param[out] precision Variable to store operator precision @return An error code: 0 - success, otherwise - failure @ref User **/ -int CeedOperatorGetMixedPrecision(CeedOperator op, bool *use_mixed_precision) { - *use_mixed_precision = op->use_mixed_precision; +int CeedOperatorGetPrecision(CeedOperator op, CeedScalarType *precision) { + *precision = op->precision; return CEED_ERROR_SUCCESS; } diff --git a/tests/t502-operator-mixed.c b/tests/t502-operator-mixed.c index 4218b82b1d..8a44ee36ed 100644 --- a/tests/t502-operator-mixed.c +++ b/tests/t502-operator-mixed.c @@ -70,13 +70,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); diff --git a/tests/t503-operator-mixed.c b/tests/t503-operator-mixed.c index 7042c7846c..b195b0629e 100644 --- a/tests/t503-operator-mixed.c +++ b/tests/t503-operator-mixed.c @@ -71,13 +71,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, x); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, u); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, v); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // Note - It is atypical to use only passive fields; this test is intended // as a test for all passive input modes rather than as an example. 
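Each of these mixed-precision tests uses the same idiom: the operator requests the precision opposite to the library's native CEED_SCALAR_TYPE, so one test source exercises FP32 compute on a default FP64 build and FP64 compute on an FP32 build. A minimal usage sketch of the interface from this series (the variable name jit_precision is illustrative, not from the patch):

  // Request the non-native precision for one operator. CeedOperatorSetPrecision
  // must be called before the operator is finalized, and requesting the native
  // CEED_SCALAR_TYPE is always allowed, even without backend mixed-precision support.
  CeedScalarType jit_precision = (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32;
  CeedOperatorSetPrecision(op_mass, jit_precision);
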
diff --git a/tests/t505-operator-mixed.c b/tests/t505-operator-mixed.c index 2efe5bd05d..bb2a62fda4 100644 --- a/tests/t505-operator-mixed.c +++ b/tests/t505-operator-mixed.c @@ -69,13 +69,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); diff --git a/tests/t506-operator-mixed.c b/tests/t506-operator-mixed.c index 0069f451ff..8aae3d335a 100644 --- a/tests/t506-operator-mixed.c +++ b/tests/t506-operator-mixed.c @@ -76,26 +76,26 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_small, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_small, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_small, "x", elem_restriction_x, basis_x_small, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_small, "rho", elem_restriction_q_data_small, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup_small); + CeedOperatorSetPrecision(op_setup_small, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_small); CeedOperatorSetField(op_mass_small, "rho", elem_restriction_q_data_small, CEED_BASIS_NONE, q_data_small); CeedOperatorSetField(op_mass_small, "u", elem_restriction_u, basis_u_small, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_small, "v", elem_restriction_u, basis_u_small, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass_small); + CeedOperatorSetPrecision(op_mass_small, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // 'Large' operators CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_large); CeedOperatorSetField(op_setup_large, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_large, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_large, "x", elem_restriction_x, basis_x_large, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_large, "rho", elem_restriction_q_data_large, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup_large); + CeedOperatorSetPrecision(op_setup_large, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? 
CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_large); CeedOperatorSetField(op_mass_large, "rho", elem_restriction_q_data_large, CEED_BASIS_NONE, q_data_large); CeedOperatorSetField(op_mass_large, "u", elem_restriction_u, basis_u_large, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_large, "v", elem_restriction_u, basis_u_large, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass_large); + CeedOperatorSetPrecision(op_mass_large, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // Setup CeedOperatorApply(op_setup_small, x, q_data_small, CEED_REQUEST_IMMEDIATE); diff --git a/tests/t510-operator-mixed.c b/tests/t510-operator-mixed.c index 42853f7165..4fdad2af8e 100644 --- a/tests/t510-operator-mixed.c +++ b/tests/t510-operator-mixed.c @@ -91,13 +91,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); diff --git a/tests/t520-operator-mixed.c b/tests/t520-operator-mixed.c index ea641fb604..7ab5faefa0 100644 --- a/tests/t520-operator-mixed.c +++ b/tests/t520-operator-mixed.c @@ -107,14 +107,14 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_tet, "_weight", CEED_ELEMRESTRICTION_NONE, basis_x_tet, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_tet, "dx", elem_restriction_x_tet, basis_x_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); - CeedOperatorSetMixedPrecision(op_setup_tet); + CeedOperatorSetPrecision(op_setup_tet, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // ---- Mass Tet CeedOperatorCreate(ceed, qf_mass_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_tet); CeedOperatorSetField(op_mass_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); CeedOperatorSetField(op_mass_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetName(op_mass_tet, "mass tet"); - CeedOperatorSetMixedPrecision(op_mass_tet); + CeedOperatorSetPrecision(op_mass_tet, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? 
CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // Set up Hex Elements // -- Restrictions @@ -153,14 +153,14 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_hex, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_hex, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_hex, "dx", elem_restriction_x_hex, basis_x_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); - CeedOperatorSetMixedPrecision(op_setup_hex); + CeedOperatorSetPrecision(op_setup_hex, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_mass_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_hex); CeedOperatorSetField(op_mass_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); CeedOperatorSetField(op_mass_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetName(op_mass_hex, "mass hex"); - CeedOperatorSetMixedPrecision(op_mass_hex); + CeedOperatorSetPrecision(op_mass_hex, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // Set up Composite Operators // -- Create diff --git a/tests/t522-operator-mixed.c b/tests/t522-operator-mixed.c index 071c979790..3478107f57 100644 --- a/tests/t522-operator-mixed.c +++ b/tests/t522-operator-mixed.c @@ -109,13 +109,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_tet, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_tet, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_tet, "dx", elem_restriction_x_tet, basis_x_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); - CeedOperatorSetMixedPrecision(op_setup_tet); + CeedOperatorSetPrecision(op_setup_tet, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // ---- Diff Tet CeedOperatorCreate(ceed, qf_diff_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_tet); CeedOperatorSetField(op_diff_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); CeedOperatorSetField(op_diff_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_diff_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_diff_tet); + CeedOperatorSetPrecision(op_diff_tet, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // Hex Elements // -- Restrictions @@ -155,13 +155,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_hex, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_hex, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_hex, "dx", elem_restriction_x_hex, basis_x_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); - CeedOperatorSetMixedPrecision(op_setup_hex); + CeedOperatorSetPrecision(op_setup_hex, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? 
CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorCreate(ceed, qf_diff_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_hex); CeedOperatorSetField(op_diff_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); CeedOperatorSetField(op_diff_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_diff_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_diff_hex); + CeedOperatorSetPrecision(op_diff_hex, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); // Composite Operators CeedCompositeOperatorCreate(ceed, &op_setup); diff --git a/tests/t591-operator-mixed.c b/tests/t591-operator-mixed.c index d0bc270977..068d029196 100644 --- a/tests/t591-operator-mixed.c +++ b/tests/t591-operator-mixed.c @@ -140,7 +140,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -155,7 +155,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); diff --git a/tests/t592-operator-mixed.c b/tests/t592-operator-mixed.c index 897c16299b..d80811f60a 100644 --- a/tests/t592-operator-mixed.c +++ b/tests/t592-operator-mixed.c @@ -140,7 +140,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -155,7 +155,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? 
CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); diff --git a/tests/t593-operator-mixed.c b/tests/t593-operator-mixed.c index dfd778fe18..de49c38abf 100644 --- a/tests/t593-operator-mixed.c +++ b/tests/t593-operator-mixed.c @@ -95,7 +95,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -109,7 +109,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); diff --git a/tests/t594-operator-mixed.c b/tests/t594-operator-mixed.c index fee1b94857..e802ab893b 100644 --- a/tests/t594-operator-mixed.c +++ b/tests/t594-operator-mixed.c @@ -94,7 +94,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -109,7 +109,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); diff --git a/tests/t596-operator-mixed.c b/tests/t596-operator-mixed.c index 4341d16912..a9b0cc039d 100644 --- a/tests/t596-operator-mixed.c +++ b/tests/t596-operator-mixed.c @@ -112,14 +112,14 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? 
CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass); + CeedOperatorSetPrecision(op_mass, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); // Apply Setup Operator diff --git a/tests/t597-operator-mixed.c b/tests/t597-operator-mixed.c index 90d36eda69..105a963d3e 100644 --- a/tests/t597-operator-mixed.c +++ b/tests/t597-operator-mixed.c @@ -100,7 +100,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "q data", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); // Apply Setup Operator @@ -125,7 +125,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_diff, "du", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_diff, "q data", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_diff); + CeedOperatorSetPrecision(op_diff, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_diff, elem_restriction_x_points, x_points); // Fully assemble operator diff --git a/tests/t598-operator-mixed.c b/tests/t598-operator-mixed.c index 2d8bb2b8a5..08e8bb0e38 100644 --- a/tests/t598-operator-mixed.c +++ b/tests/t598-operator-mixed.c @@ -174,7 +174,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_setup); + CeedOperatorSetPrecision(op_setup, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -189,7 +189,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass_fine, "u", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_fine, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass_fine, "v", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); - CeedOperatorSetMixedPrecision(op_mass_fine); + CeedOperatorSetPrecision(op_mass_fine, CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ? 
CEED_SCALAR_FP64 : CEED_SCALAR_FP32); CeedOperatorAtPointsSetPoints(op_mass_fine, elem_restriction_x_points, x_points); CeedVectorCreate(ceed, num_nodes_fine, &u_fine); @@ -216,7 +216,7 @@ int main(int argc, char **argv) { sum += v_array[i]; } CeedVectorRestoreArrayRead(v_coarse, &v_array); - if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, num_elem); + if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, (float)num_elem); } // Prolong coarse u @@ -236,7 +236,7 @@ int main(int argc, char **argv) { } CeedVectorRestoreArrayRead(v_fine, &v_array); - if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Fine Grid: %f != True Area: %f\n", sum, num_elem); + if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Fine Grid: %f != True Area: %f\n", sum, (float)num_elem); } // Restrict state to coarse grid CeedOperatorApply(op_restrict, v_fine, v_coarse, CEED_REQUEST_IMMEDIATE); @@ -251,7 +251,7 @@ int main(int argc, char **argv) { sum += v_array[i]; } CeedVectorRestoreArrayRead(v_coarse, &v_array); - if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, num_elem); + if (fabs(sum - num_elem) > 1000. * FLT_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: %f\n", sum, (float)num_elem); } CeedVectorDestroy(&x_points);
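
Taken together, the series defines a small contract between the backends and the JIT-ed scalar headers: the *-gen backends pass CEED_JIT_PRECISION as the integer value of the requested CeedScalarType, and ceed-f32.h/ceed-f64.h switch CeedScalar to the requested width while CeedScalarCPU keeps the native one. A self-contained preprocessor sketch of the ceed-f64.h selection follows; the numeric macro values are illustrative assumptions only, and note that an identifier unknown to the preprocessor evaluates to 0 inside #if, so the CEED_SCALAR_FP32/FP64 names must be visible as numeric values during the JIT pass for the comparison to be meaningful, which the sketch makes explicit.

  /* Illustrative stand-ins for what the JIT compile step would provide. */
  #define CEED_SCALAR_FP32 0                  /* assumed CeedScalarType ordering */
  #define CEED_SCALAR_FP64 1
  #define CEED_SCALAR_TYPE CEED_SCALAR_FP64   /* native FP64 build */
  #define CEED_RUNNING_JIT_PASS 1
  #define CEED_JIT_PRECISION CEED_SCALAR_FP32 /* backend requested FP32 */

  #if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE)
  typedef float  CeedScalar;    /* QFunction arithmetic runs in reduced precision */
  typedef double CeedScalarCPU; /* data passed from the host keeps the native width */
  #else
  typedef double CeedScalar;
  typedef CeedScalar CeedScalarCPU;
  #endif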