Commit 3b75442

[SPARK-52402][PS] Fix divide-by-zero errors in Kendall and Pearson correlation under ANSI mode
### What changes were proposed in this pull request?

Fix divide-by-zero errors in groupby().corr() for the "kendall" and "pearson" methods with ANSI mode enabled.

### Why are the changes needed?

Ensure pandas on Spark works well with ANSI mode on. Part of https://issues.apache.org/jira/browse/SPARK-52169.

### Does this PR introduce _any_ user-facing change?

Yes.

```py
>>> ps.set_option("compute.fail_on_ansi_mode", False)
>>> ps.set_option("compute.ansi_mode_support", True)
>>> df = ps.DataFrame(
...     {"A": [0, 0, 0, 1, 1, 2], "B": [-1, 2, 3, 5, 6, 0], "C": [4, 6, 5, 1, 3, 0]},
...     columns=["A", "B", "C"]
... )
```

FROM

```py
>>> df.groupby("A").corr('kendall')
25/06/04 14:40:03 ERROR Executor: Exception in task 0.0 in stage 13.0 (TID 51)
org.apache.spark.SparkArithmeticException: [DIVIDE_BY_ZERO] Division by zero. Use `try_divide` to tolerate divisor being 0 and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. SQLSTATE: 22012
== DataFrame ==
"__truediv__" was called from
...
```

TO

```py
>>> df.groupby("A").corr('kendall')
            B         C
A
0 B  1.000000  0.333333
  C  0.333333  1.000000
1 B  1.000000  1.000000
  C  1.000000  1.000000
2 B  1.000000       NaN
  C       NaN  1.000000
```

### How was this patch tested?

Unit tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #51090 from xinrong-meng/ansi_corr.

Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Xinrong Meng <[email protected]>
1 parent 2695636 commit 3b75442
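The heart of the change is swapping plain column division, which raises DIVIDE_BY_ZERO under ANSI mode, for `F.try_divide`, which returns NULL when the divisor is zero. A minimal standalone sketch of that behavior (the DataFrame and column names here are illustrative, not from the patch):

```py
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.ansi.enabled", True)

df = spark.createDataFrame([(1.0, 0.0)], ["num", "den"])

# Plain division fails at execution time under ANSI mode:
# df.select((F.col("num") / F.col("den")).alias("q")).first()
#   -> SparkArithmeticException: [DIVIDE_BY_ZERO]

# try_divide tolerates the zero divisor:
print(df.select(F.try_divide("num", "den").alias("q")).first())
# Row(q=None), i.e. NULL instead of an error
```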

File tree

4 files changed, +51 -33 lines changed


python/pyspark/pandas/correlation.py

Lines changed: 51 additions & 29 deletions
@@ -19,7 +19,7 @@
 from pyspark.sql import DataFrame as SparkDataFrame, functions as F
 from pyspark.sql.window import Window
-from pyspark.pandas.utils import verify_temp_column_name
+from pyspark.pandas.utils import verify_temp_column_name, is_ansi_mode_enabled


 CORRELATION_VALUE_1_COLUMN = "__correlation_value_1_input__"
@@ -60,6 +60,7 @@ def compute(sdf: SparkDataFrame, groupKeys: List[str], method: str) -> SparkDataFrame:
             .alias(CORRELATION_VALUE_2_COLUMN),
         ],
     )
+    spark_session = sdf.sparkSession

     if method in ["pearson", "spearman"]:
         # convert values to avg ranks for spearman correlation
@@ -125,16 +126,20 @@ def compute(sdf: SparkDataFrame, groupKeys: List[str], method: str) -> SparkDataFrame:
             )
         )

+        if is_ansi_mode_enabled(spark_session):
+            corr_expr = F.try_divide(
+                F.covar_samp(CORRELATION_VALUE_1_COLUMN, CORRELATION_VALUE_2_COLUMN),
+                F.stddev_samp(CORRELATION_VALUE_1_COLUMN)
+                * F.stddev_samp(CORRELATION_VALUE_2_COLUMN),
+            )
+        else:
+            corr_expr = F.corr(CORRELATION_VALUE_1_COLUMN, CORRELATION_VALUE_2_COLUMN)
+
         sdf = sdf.groupby(groupKeys).agg(
-            F.corr(CORRELATION_VALUE_1_COLUMN, CORRELATION_VALUE_2_COLUMN).alias(
-                CORRELATION_CORR_OUTPUT_COLUMN
+            corr_expr.alias(CORRELATION_CORR_OUTPUT_COLUMN),
+            F.count(F.when(~F.isnull(CORRELATION_VALUE_1_COLUMN), 1)).alias(
+                CORRELATION_COUNT_OUTPUT_COLUMN
             ),
-            F.count(
-                F.when(
-                    ~F.isnull(CORRELATION_VALUE_1_COLUMN),
-                    1,
-                )
-            ).alias(CORRELATION_COUNT_OUTPUT_COLUMN),
         )

         return sdf
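The Pearson branch above leans on the identity corr(x, y) = covar_samp(x, y) / (stddev_samp(x) * stddev_samp(y)), so wrapping the quotient in `try_divide` turns a zero-variance group into NULL instead of an ANSI error. A hedged standalone check of that identity (toy data and column names, not from the patch):

```py
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0, 2.0), (2.0, 4.0), (3.0, 5.0)], ["x", "y"])

row = df.agg(
    F.corr("x", "y").alias("builtin"),
    F.try_divide(
        F.covar_samp("x", "y"),
        F.stddev_samp("x") * F.stddev_samp("y"),
    ).alias("rewritten"),
).first()

print(row.builtin, row.rewritten)  # both ~0.98198 for this data
# With a constant column, stddev_samp is 0: both forms return NULL,
# while a plain division would raise DIVIDE_BY_ZERO under ANSI mode.
```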
@@ -219,6 +224,42 @@ def compute(sdf: SparkDataFrame, groupKeys: List[str], method: str) -> SparkDataFrame:
             F.col(CORRELATION_VALUE_2_COLUMN) == F.col(CORRELATION_VALUE_Y_COLUMN)
         )

+        if is_ansi_mode_enabled(spark_session):
+            corr_expr = F.try_divide(
+                F.col(CORRELATION_KENDALL_P_COLUMN) - F.col(CORRELATION_KENDALL_Q_COLUMN),
+                F.sqrt(
+                    (
+                        F.col(CORRELATION_KENDALL_P_COLUMN)
+                        + F.col(CORRELATION_KENDALL_Q_COLUMN)
+                        + F.col(CORRELATION_KENDALL_T_COLUMN)
+                    )
+                    * (
+                        F.col(CORRELATION_KENDALL_P_COLUMN)
+                        + F.col(CORRELATION_KENDALL_Q_COLUMN)
+                        + F.col(CORRELATION_KENDALL_U_COLUMN)
+                    )
+                ),
+            )
+        else:
+            corr_expr = (
+                F.col(CORRELATION_KENDALL_P_COLUMN) - F.col(CORRELATION_KENDALL_Q_COLUMN)
+            ) / F.sqrt(
+                (
+                    (
+                        F.col(CORRELATION_KENDALL_P_COLUMN)
+                        + F.col(CORRELATION_KENDALL_Q_COLUMN)
+                        + (F.col(CORRELATION_KENDALL_T_COLUMN))
+                    )
+                )
+                * (
+                    (
+                        F.col(CORRELATION_KENDALL_P_COLUMN)
+                        + F.col(CORRELATION_KENDALL_Q_COLUMN)
+                        + (F.col(CORRELATION_KENDALL_U_COLUMN))
+                    )
+                )
+            )
+
         sdf = (
             sdf.groupby(groupKeys)
             .agg(
@@ -232,26 +273,7 @@ def compute(sdf: SparkDataFrame, groupKeys: List[str], method: str) -> SparkDataFrame:
                     ).otherwise(F.lit(0))
                 ).alias(CORRELATION_COUNT_OUTPUT_COLUMN),
             )
-            .withColumn(
-                CORRELATION_CORR_OUTPUT_COLUMN,
-                (F.col(CORRELATION_KENDALL_P_COLUMN) - F.col(CORRELATION_KENDALL_Q_COLUMN))
-                / F.sqrt(
-                    (
-                        (
-                            F.col(CORRELATION_KENDALL_P_COLUMN)
-                            + F.col(CORRELATION_KENDALL_Q_COLUMN)
-                            + (F.col(CORRELATION_KENDALL_T_COLUMN))
-                        )
-                    )
-                    * (
-                        (
-                            F.col(CORRELATION_KENDALL_P_COLUMN)
-                            + F.col(CORRELATION_KENDALL_Q_COLUMN)
-                            + (F.col(CORRELATION_KENDALL_U_COLUMN))
-                        )
-                    )
-                ),
-            )
+            .withColumn(CORRELATION_CORR_OUTPUT_COLUMN, corr_expr)
         )

         sdf = sdf.select(
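The Kendall branch computes tau-b, tau_b = (P - Q) / sqrt((P + Q + T) * (P + Q + U)), where P and Q count concordant and discordant pairs and T and U count pairs tied only in the first or second variable (assuming the standard tau-b reading of the P/Q/T/U columns). A single-row group has no pairs at all, so the denominator is 0 and `try_divide` yields NULL, which surfaces as the NaN for group A == 2 in the expected output above. A pure-Python sanity check against group A == 0 (B = [-1, 2, 3], C = [4, 6, 5]):

```py
from itertools import combinations
from math import sqrt

B, C = [-1, 2, 3], [4, 6, 5]
P = Q = T = U = 0
for (b1, c1), (b2, c2) in combinations(zip(B, C), 2):
    if b1 == b2 and c1 != c2:
        T += 1  # pair tied only in B
    elif c1 == c2 and b1 != b2:
        U += 1  # pair tied only in C
    elif (b1 - b2) * (c1 - c2) > 0:
        P += 1  # concordant pair
    elif (b1 - b2) * (c1 - c2) < 0:
        Q += 1  # discordant pair

print((P - Q) / sqrt((P + Q + T) * (P + Q + U)))
# 0.3333333333333333, matching the 0.333333 for group A == 0 above
```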

python/pyspark/pandas/tests/computation/test_corr.py

Lines changed: 0 additions & 2 deletions
@@ -22,11 +22,9 @@
 from pyspark import pandas as ps
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, SPARK_CONF_ARROW_ENABLED
 from pyspark.testing.sqlutils import SQLTestUtils
-from pyspark.testing.utils import is_ansi_mode_test, ansi_mode_not_supported_message


 class FrameCorrMixin:
-    @unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
     def test_dataframe_corr(self):
         pdf = pd.DataFrame(
             index=[

python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py

Lines changed: 0 additions & 1 deletion
@@ -82,7 +82,6 @@ def tearDownClass(cls):
         reset_option("compute.ops_on_diff_frames")
         super().tearDownClass()

-    @unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
     def test_corrwith(self):
         df1 = ps.DataFrame({"A": [1, np.nan, 7, 8], "X": [5, 8, np.nan, 3], "C": [10, 4, 9, 3]})
         df2 = ps.DataFrame({"A": [5, 3, 6, 4], "B": [11, 2, 4, 3], "C": [4, 3, 8, np.nan]})

python/pyspark/pandas/tests/groupby/test_corr.py

Lines changed: 0 additions & 1 deletion
@@ -48,7 +48,6 @@ def test_corr(self):
             almost=True,
         )

-    @unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
     def test_method(self):
         for m in ["pearson", "spearman", "kendall"]:
             self.assert_eq(
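With the skip decorators gone, `test_method` now runs under ANSI mode and covers all three methods. A hedged sketch of the scenario it exercises (options and data taken from the commit message above; the real test fixtures differ):

```py
from pyspark import pandas as ps

ps.set_option("compute.fail_on_ansi_mode", False)
ps.set_option("compute.ansi_mode_support", True)

df = ps.DataFrame({"A": [0, 0, 0, 1, 1, 2], "B": [-1, 2, 3, 5, 6, 0], "C": [4, 6, 5, 1, 3, 0]})
for m in ["pearson", "spearman", "kendall"]:
    print(df.groupby("A").corr(m))  # no DIVIDE_BY_ZERO; degenerate groups give NaN
```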
