Commit dc687d4

jackywang-db authored and sryza committed
[SPARK-52853][SDP] Prevent imperative PySpark methods in declarative pipelines
### What changes were proposed in this pull request?

This PR adds a context manager, `block_session_mutations()`, that prevents the execution of imperative Spark operations within declarative pipeline definitions. When a blocked method is called, users receive a clear error message with guidance on the declarative alternative.

#### Blocked Methods

##### Configuration Management
- **`spark.conf.set()`** → Use the pipeline spec or the `spark_conf` decorator parameter

##### Catalog Management
- **`spark.catalog.setCurrentCatalog()`** → Set via the pipeline spec or the dataset decorator `name` argument
- **`spark.catalog.setCurrentDatabase()`** → Set via the pipeline spec or the dataset decorator `name` argument

##### Temporary View Management
- **`spark.catalog.dropTempView()`** → Remove the temporary view definition directly
- **`spark.catalog.dropGlobalTempView()`** → Remove the temporary view definition directly
- **`DataFrame.createTempView()`** → Use the `temporary_view` decorator
- **`DataFrame.createOrReplaceTempView()`** → Use the `temporary_view` decorator
- **`DataFrame.createGlobalTempView()`** → Use the `temporary_view` decorator
- **`DataFrame.createOrReplaceGlobalTempView()`** → Use the `temporary_view` decorator

##### UDF Registration
- **`spark.udf.register()`** → Define and register UDFs before pipeline execution
- **`spark.udf.registerJavaFunction()`** → Define and register Java UDFs before pipeline execution
- **`spark.udf.registerJavaUDAF()`** → Define and register Java UDAFs before pipeline execution

### Why are the changes needed?

These are imperative constructs that can cause friction and unexpected behavior when used from within a pipeline declaration. For example, they make pipeline behavior sensitive to the order in which Python files are imported, which can be unpredictable. Declarative mechanisms for setting Spark confs for pipelines already exist: the pipeline spec and the `spark_conf` decorator argument.

### Does this PR introduce _any_ user-facing change?

Yes. Setting Spark confs (and performing the other session mutations listed above) imperatively in a pipeline definition file now raises an error.

### How was this patch tested?

Added a new test suite verifying that the context manager behaves as expected, and ran the `spark-pipelines` CLI manually.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #51590 from JiaqiWang18/SPARK-52853-prevent-py-conf-set.

Authored-by: Jacky Wang <jacky.wang@databricks.com>
Signed-off-by: Sandy Ryza <sandy.ryza@databricks.com>
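For illustration only (not part of the commit), here is a minimal sketch of how a pipeline definition file expresses these settings declaratively instead of mutating the session. The `from pyspark import pipelines as dp` import, the `dp` alias, and the exact decorator signatures are assumptions; only the `temporary_view` decorator, the `spark_conf` argument, and the error class come from the description above.

```python
# Hypothetical pipeline definition file (illustration only): the import path,
# module alias `dp`, and decorator signatures are assumptions; only the
# `temporary_view` decorator and the `spark_conf` argument are named by this PR.
from pyspark import pipelines as dp  # assumed import alias
from pyspark.sql import DataFrame, SparkSession

spark = SparkSession.active()  # assumes the spark-pipelines CLI already started a session


# Declarative replacement for df.createOrReplaceTempView("recent_clicks"):
@dp.temporary_view(name="recent_clicks", spark_conf={"spark.sql.shuffle.partitions": "10"})
def recent_clicks() -> DataFrame:
    return spark.read.table("raw_clicks")


# The imperative equivalents are now blocked while definitions are registered:
#   spark.conf.set("spark.sql.shuffle.partitions", "10")
#   spark.read.table("raw_clicks").createOrReplaceTempView("recent_clicks")
# Both raise PySparkException with an error class of the form
# SESSION_MUTATION_IN_DECLARATIVE_PIPELINE.<sub-class>.
```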
1 parent 5a9929c commit dc687d4

File tree: 5 files changed, +465 −1 lines changed

dev/sparktestsupport/modules.py

Lines changed: 1 addition & 0 deletions
@@ -1520,6 +1520,7 @@ def __hash__(self):
     source_file_regexes=["python/pyspark/pipelines"],
     python_test_goals=[
         "pyspark.pipelines.tests.test_block_connect_access",
+        "pyspark.pipelines.tests.test_block_session_mutations",
         "pyspark.pipelines.tests.test_cli",
         "pyspark.pipelines.tests.test_decorators",
         "pyspark.pipelines.tests.test_graph_element_registry",

python/pyspark/errors/error-conditions.json

Lines changed: 67 additions & 0 deletions
@@ -1007,6 +1007,73 @@
       "Cannot start a remote Spark session because there is a regular Spark session already running."
     ]
   },
+  "SESSION_MUTATION_IN_DECLARATIVE_PIPELINE": {
+    "message": [
+      "Session mutation <method> is not allowed in declarative pipelines."
+    ],
+    "sub_class": {
+      "SET_RUNTIME_CONF": {
+        "message": [
+          "Instead set configuration via the pipeline spec or use the 'spark_conf' argument in various decorators."
+        ]
+      },
+      "SET_CURRENT_CATALOG": {
+        "message": [
+          "Instead set catalog via the pipeline spec or the 'name' argument on the dataset decorators."
+        ]
+      },
+      "SET_CURRENT_DATABASE": {
+        "message": [
+          "Instead set database via the pipeline spec or the 'name' argument on the dataset decorators."
+        ]
+      },
+      "DROP_TEMP_VIEW": {
+        "message": [
+          "Instead remove the temporary view definition directly."
+        ]
+      },
+      "DROP_GLOBAL_TEMP_VIEW": {
+        "message": [
+          "Instead remove the temporary view definition directly."
+        ]
+      },
+      "CREATE_TEMP_VIEW": {
+        "message": [
+          "Instead use the @temporary_view decorator to define temporary views."
+        ]
+      },
+      "CREATE_OR_REPLACE_TEMP_VIEW": {
+        "message": [
+          "Instead use the @temporary_view decorator to define temporary views."
+        ]
+      },
+      "CREATE_GLOBAL_TEMP_VIEW": {
+        "message": [
+          "Instead use the @temporary_view decorator to define temporary views."
+        ]
+      },
+      "CREATE_OR_REPLACE_GLOBAL_TEMP_VIEW": {
+        "message": [
+          "Instead use the @temporary_view decorator to define temporary views."
+        ]
+      },
+      "REGISTER_UDF": {
+        "message": [
+          ""
+        ]
+      },
+      "REGISTER_JAVA_UDF": {
+        "message": [
+          ""
+        ]
+      },
+      "REGISTER_JAVA_UDAF": {
+        "message": [
+          ""
+        ]
+      }
+    }
+  },
   "SESSION_NEED_CONN_STR_OR_BUILDER": {
     "message": [
       "Needs either connection string or channelBuilder (mutually exclusive) to create a new SparkSession."
python/pyspark/pipelines/block_session_mutations.py

Lines changed: 135 additions & 0 deletions

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from contextlib import contextmanager
from typing import Generator, NoReturn, List, Callable

from pyspark.errors import PySparkException
from pyspark.sql.connect.catalog import Catalog
from pyspark.sql.connect.conf import RuntimeConf
from pyspark.sql.connect.dataframe import DataFrame
from pyspark.sql.connect.udf import UDFRegistration

# pyspark methods that should be blocked from executing in python pipeline definition files
ERROR_CLASS = "SESSION_MUTATION_IN_DECLARATIVE_PIPELINE"
BLOCKED_METHODS: List = [
    {
        "class": RuntimeConf,
        "method": "set",
        "error_sub_class": "SET_RUNTIME_CONF",
    },
    {
        "class": Catalog,
        "method": "setCurrentCatalog",
        "error_sub_class": "SET_CURRENT_CATALOG",
    },
    {
        "class": Catalog,
        "method": "setCurrentDatabase",
        "error_sub_class": "SET_CURRENT_DATABASE",
    },
    {
        "class": Catalog,
        "method": "dropTempView",
        "error_sub_class": "DROP_TEMP_VIEW",
    },
    {
        "class": Catalog,
        "method": "dropGlobalTempView",
        "error_sub_class": "DROP_GLOBAL_TEMP_VIEW",
    },
    {
        "class": DataFrame,
        "method": "createTempView",
        "error_sub_class": "CREATE_TEMP_VIEW",
    },
    {
        "class": DataFrame,
        "method": "createOrReplaceTempView",
        "error_sub_class": "CREATE_OR_REPLACE_TEMP_VIEW",
    },
    {
        "class": DataFrame,
        "method": "createGlobalTempView",
        "error_sub_class": "CREATE_GLOBAL_TEMP_VIEW",
    },
    {
        "class": DataFrame,
        "method": "createOrReplaceGlobalTempView",
        "error_sub_class": "CREATE_OR_REPLACE_GLOBAL_TEMP_VIEW",
    },
    {
        "class": UDFRegistration,
        "method": "register",
        "error_sub_class": "REGISTER_UDF",
    },
    {
        "class": UDFRegistration,
        "method": "registerJavaFunction",
        "error_sub_class": "REGISTER_JAVA_UDF",
    },
    {
        "class": UDFRegistration,
        "method": "registerJavaUDAF",
        "error_sub_class": "REGISTER_JAVA_UDAF",
    },
]


def _create_blocked_method(error_method_name: str, error_sub_class: str) -> Callable:
    def blocked_method(*args: object, **kwargs: object) -> NoReturn:
        raise PySparkException(
            errorClass=f"{ERROR_CLASS}.{error_sub_class}",
            messageParameters={
                "method": error_method_name,
            },
        )

    return blocked_method


@contextmanager
def block_session_mutations() -> Generator[None, None, None]:
    """
    Context manager that blocks imperative constructs found in a pipeline python definition file.
    See BLOCKED_METHODS above for a list.
    """
    # Store original methods
    original_methods = {}
    for method_info in BLOCKED_METHODS:
        cls = method_info["class"]
        method_name = method_info["method"]
        original_methods[(cls, method_name)] = getattr(cls, method_name)

    try:
        # Replace methods with blocked versions
        for method_info in BLOCKED_METHODS:
            cls = method_info["class"]
            method_name = method_info["method"]
            error_method_name = f"'{cls.__name__}.{method_name}'"
            blocked_method = _create_blocked_method(
                error_method_name, method_info["error_sub_class"]
            )
            setattr(cls, method_name, blocked_method)

        yield
    finally:
        # Restore original methods
        for method_info in BLOCKED_METHODS:
            cls = method_info["class"]
            method_name = method_info["method"]
            original_method = original_methods[(cls, method_name)]
            setattr(cls, method_name, original_method)
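A minimal usage sketch (assuming an existing Spark Connect session named `spark`, which is not part of the commit): inside the context manager the patched methods raise, and the original methods are restored on exit.

```python
from pyspark.errors import PySparkException
from pyspark.pipelines.block_session_mutations import block_session_mutations

# `spark` is assumed to be an already-created Spark Connect SparkSession.
with block_session_mutations():
    try:
        spark.conf.set("spark.sql.shuffle.partitions", "10")  # blocked
    except PySparkException as e:
        print(e.getErrorClass())
        # SESSION_MUTATION_IN_DECLARATIVE_PIPELINE.SET_RUNTIME_CONF

# Outside the context manager the original method is back in place.
spark.conf.set("spark.sql.shuffle.partitions", "10")
```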

python/pyspark/pipelines/cli.py

Lines changed: 3 additions & 1 deletion
@@ -32,6 +32,7 @@

 from pyspark.errors import PySparkException, PySparkTypeError
 from pyspark.sql import SparkSession
+from pyspark.pipelines.block_session_mutations import block_session_mutations
 from pyspark.pipelines.graph_element_registry import (
     graph_element_registration_context,
     GraphElementRegistry,
@@ -192,7 +193,8 @@ def register_definitions(
             assert (
                 module_spec.loader is not None
             ), f"Module spec has no loader for {file}"
-            module_spec.loader.exec_module(module)
+            with block_session_mutations():
+                module_spec.loader.exec_module(module)
         elif file.suffix == ".sql":
             log_with_curr_timestamp(f"Registering SQL file {file}...")
             with file.open("r") as f:
