
Commit 13a7da9

add flag
1 parent 7d0b6bd commit 13a7da9

File tree

2 files changed: +25 −2 lines changed

python/docs/source/migration_guide/pyspark_upgrade.rst

Lines changed: 1 addition & 1 deletion

@@ -22,8 +22,8 @@ Upgrading PySpark
 Upgrading from PySpark 4.0 to 4.1
 ---------------------------------
 
+* In Spark 4.1, ``DataFrame['name']`` and ``DataFrame.name`` on the Spark Connect Python Client no longer eagerly validate the column name. To restore the legacy behavior, set the ``PYSPARK_VALIDATE_COLUMN_NAME_LEGACY`` environment variable to ``1``.
 * In Spark 4.1, Arrow-optimized Python UDF supports UDT input / output instead of falling back to the regular UDF. To restore the legacy behavior, set ``spark.sql.execution.pythonUDF.arrow.legacy.fallbackOnUDT`` to ``true``.
-
 * In Spark 4.1, unnecessary conversion to pandas instances is removed when ``spark.sql.execution.pythonUDTF.arrow.enabled`` is enabled. As a result, the type coercion changes when the produced output has a schema different from the specified schema. To restore the previous behavior, enable ``spark.sql.legacy.execution.pythonUDTF.pandas.conversion.enabled``.
 
 
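A minimal sketch of the behavior change described in the new migration note, assuming a Spark Connect session (the connect URL and the DataFrame contents below are illustrative):

    import os

    # Must be set before the lookup happens; only the exact string "1"
    # restores the legacy eager validation.
    os.environ["PYSPARK_VALIDATE_COLUMN_NAME_LEGACY"] = "1"

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
    df = spark.createDataFrame([(1, "a")], ["id", "value"])

    df.id           # fine in both modes: "id" is a real column
    df["no_such"]   # with the flag set, fails eagerly on lookup; without it,
                    # the error only surfaces once the plan is analyzed or
                    # executed, e.g. on df.select("no_such").collect()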

python/pyspark/sql/connect/dataframe.py

Lines changed: 24 additions & 1 deletion

@@ -44,6 +44,7 @@
 )
 
 import copy
+import os
 import sys
 import random
 import pyarrow as pa
@@ -1703,7 +1704,10 @@ def __getattr__(self, name: str) -> "Column":
                 errorClass="JVM_ATTRIBUTE_NOT_SUPPORTED", messageParameters={"attr_name": name}
             )
 
-        if name not in self.columns:
+        if (
+            os.environ.get("PYSPARK_VALIDATE_COLUMN_NAME_LEGACY") == "1"
+            and name not in self.columns
+        ):
             raise PySparkAttributeError(
                 errorClass="ATTRIBUTE_NOT_SUPPORTED", messageParameters={"attr_name": name}
             )
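The guard compares the environment variable against the exact string ``"1"``, so values such as ``"true"`` do not re-enable the legacy check. A standalone sketch of that semantics (the helper name is hypothetical):

    import os

    def _legacy_validation_enabled() -> bool:
        # Mirrors the guard in __getattr__ above: exact match against "1";
        # any other value leaves the new lazy behavior in place.
        return os.environ.get("PYSPARK_VALIDATE_COLUMN_NAME_LEGACY") == "1"

    os.environ["PYSPARK_VALIDATE_COLUMN_NAME_LEGACY"] = "true"
    assert not _legacy_validation_enabled()

    os.environ["PYSPARK_VALIDATE_COLUMN_NAME_LEGACY"] = "1"
    assert _legacy_validation_enabled()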
@@ -1732,6 +1736,25 @@ def __getitem__(
                     )
                 )
             else:
+                # TODO: revisit classic Spark's Dataset.col
+                # if (sparkSession.sessionState.conf.supportQuotedRegexColumnName) {
+                #   colRegex(colName)
+                # } else {
+                #   ConnectColumn(addDataFrameIdToCol(resolve(colName)))
+                # }
+
+                # Validate the column name.
+                if (
+                    os.environ.get("PYSPARK_VALIDATE_COLUMN_NAME_LEGACY") == "1"
+                    and not hasattr(self._session, "is_mock_session")
+                ):
+                    from pyspark.sql.connect.types import verify_col_name
+
+                    # Try our best to verify the column name against the cached schema;
+                    # if that fails, fall back to server-side validation.
+                    if not verify_col_name(item, self._schema):
+                        self.select(item).isLocal()
+
                 return self._col(item)
         elif isinstance(item, Column):
             return self.filter(item)
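The ``__getitem__`` guard validates in two stages: a best-effort check against the client-side cached schema, then, only if that check cannot confirm the name, a lightweight server round trip that forces plan analysis without executing anything. A standalone restatement of that flow, assuming a Connect DataFrame ``df`` (the wrapper function is hypothetical; ``verify_col_name``, ``_schema``, ``select``, and ``isLocal`` are the same calls the diff uses):

    from pyspark.sql.connect.types import verify_col_name

    def validate_column_name(df, item: str) -> None:
        # Stage 1: best-effort check against the locally cached schema.
        if not verify_col_name(item, df._schema):
            # Stage 2: build a one-column plan and trigger server-side
            # analysis; an unresolvable name makes the server raise here.
            # isLocal() is used because it analyzes without executing.
            df.select(item).isLocal()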
