
Commit 420ac24

yuexing authored and LuciferYang committed Jun 17, 2025
[SPARK-33538][SQL] Directly push IN/NOT predicates to the Hive Metastore
### What changes were proposed in this pull request?

This PR refactors the way IN and NOT IN predicates are converted in `HiveShim`. Previously, these predicates were expanded into chains of OR/AND comparisons (e.g., `a = 1 or a = 2`). This change updates the logic to use SQL-native IN and NOT IN syntax (e.g., `a in (1, 2)`), improving readability and possibly query performance.

### Why are the changes needed?

- Efficiency: many SQL engines optimize IN/NOT IN natively, which can improve performance and reduce query complexity.
- Maintainability: simplifies the predicate-generation logic, making the codebase easier to maintain and extend.

### Does this PR introduce _any_ user-facing change?

No user-facing changes; this is an internal refactor that affects query generation logic.

### How was this patch tested?

Existing unit tests for predicate conversion and query generation cover the updated logic.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #51132 from yuexing/SPARK-33538-hive.

Lead-authored-by: xingyue <[email protected]>
Co-authored-by: Yue <[email protected]>
Signed-off-by: yangjie01 <[email protected]>
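For illustration, here is a minimal standalone Scala sketch (not part of the commit; the object name is hypothetical) contrasting the old expansion-based conversion with the new SQL-native form. The method bodies mirror those shown in the `HiveShim.scala` diff below.

```scala
// Minimal sketch contrasting the two conversion styles; bodies mirror the diff below.
object InPredicateSketch {
  // Old behavior: expand IN into a chain of OR equality comparisons.
  def convertInToOr(name: String, values: Seq[String]): String =
    values.map(value => s"$name = $value").mkString("(", " or ", ")")

  // New behavior: emit a single SQL-native IN predicate.
  def convertIn(name: String, values: Seq[String]): String =
    s"($name) in (${values.mkString(", ")})"

  def main(args: Array[String]): Unit = {
    println(convertInToOr("intcol", Seq("1", "2"))) // prints: (intcol = 1 or intcol = 2)
    println(convertIn("intcol", Seq("1", "2")))     // prints: (intcol) in (1, 2)
  }
}
```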
1 parent 243af2f commit 420ac24

File tree

- sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
- sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala

2 files changed, +19 -19 lines
 

‎sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala

Lines changed: 10 additions, 10 deletions

```diff
@@ -749,12 +749,12 @@ private[client] class Shim_v2_0 extends Shim with Logging {
       }
     }
 
-    def convertInToOr(name: String, values: Seq[String]): String = {
-      values.map(value => s"$name = $value").mkString("(", " or ", ")")
+    def convertIn(name: String, values: Seq[String]): String = {
+      s"($name) in (${values.mkString(", ")})"
     }
 
-    def convertNotInToAnd(name: String, values: Seq[String]): String = {
-      values.map(value => s"$name != $value").mkString("(", " and ", ")")
+    def convertNotIn(name: String, values: Seq[String]): String = {
+      s"($name) not in (${values.mkString(", ")})"
     }
 
     def hasNullLiteral(list: Seq[Expression]): Boolean = list.exists {
@@ -786,11 +786,11 @@ private[client] class Shim_v2_0 extends Shim with Logging {
 
       case In(ExtractAttribute(SupportedAttribute(name)), ExtractableLiterals(values))
           if useAdvanced =>
-        Some(convertInToOr(name, values))
+        Some(convertIn(name, values))
 
       case Not(In(ExtractAttribute(SupportedAttribute(name)), ExtractableLiterals(values)))
           if useAdvanced =>
-        Some(convertNotInToAnd(name, values))
+        Some(convertNotIn(name, values))
 
       case InSet(child, values) if useAdvanced && values.size > inSetThreshold =>
         val dataType = child.dataType
@@ -802,19 +802,19 @@ private[client] class Shim_v2_0 extends Shim with Logging {
 
      case InSet(child @ ExtractAttribute(SupportedAttribute(name)), ExtractableDateValues(values))
          if useAdvanced && child.dataType == DateType =>
-        Some(convertInToOr(name, values))
+        Some(convertIn(name, values))
 
      case Not(InSet(child @ ExtractAttribute(SupportedAttribute(name)),
          ExtractableDateValues(values))) if useAdvanced && child.dataType == DateType =>
-        Some(convertNotInToAnd(name, values))
+        Some(convertNotIn(name, values))
 
      case InSet(ExtractAttribute(SupportedAttribute(name)), ExtractableValues(values))
          if useAdvanced =>
-        Some(convertInToOr(name, values))
+        Some(convertIn(name, values))
 
      case Not(InSet(ExtractAttribute(SupportedAttribute(name)), ExtractableValues(values)))
          if useAdvanced =>
-        Some(convertNotInToAnd(name, values))
+        Some(convertNotIn(name, values))
 
      case op @ SpecialBinaryComparison(
          ExtractAttribute(SupportedAttribute(name)), ExtractableLiteral(value)) =>
```
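As a quick sanity check on the new string shapes, a REPL-style sketch (assuming the literal values arrive already quoted, as the shim renders string and date literals elsewhere) reproduces the expectations asserted in the updated FiltersSuite below.

```scala
// Bodies copied from the diff above; quoting of the values is assumed to have
// happened earlier in the shim's conversion pipeline.
def convertIn(name: String, values: Seq[String]): String =
  s"($name) in (${values.mkString(", ")})"

def convertNotIn(name: String, values: Seq[String]): String =
  s"($name) not in (${values.mkString(", ")})"

convertIn("strcol", Seq("\"1\"", "\"2\""))
// => (strcol) in ("1", "2")

convertNotIn("datecol", Seq("\"2021-01-01\"", "\"2021-01-02\""))
// => (datecol) not in ("2021-01-01", "2021-01-02")
```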

‎sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala

Lines changed: 9 additions, 9 deletions

```diff
@@ -71,7 +71,7 @@ class FiltersSuite extends SparkFunSuite with PlanTest {
   filterTest("date filter with IN predicate",
     (a("datecol", DateType) in
       (Literal(Date.valueOf("2019-01-01")), Literal(Date.valueOf("2019-01-07")))) :: Nil,
-    "(datecol = \"2019-01-01\" or datecol = \"2019-01-07\")")
+    "(datecol) in (\"2019-01-01\", \"2019-01-07\")")
 
   filterTest("date and string filter",
     (Literal(Date.valueOf("2019-01-01")) === a("datecol", DateType)) ::
@@ -84,7 +84,7 @@ class FiltersSuite extends SparkFunSuite with PlanTest {
 
   filterTest("string filter with InSet predicate",
     InSet(a("strcol", StringType), Set("1", "2").map(s => UTF8String.fromString(s))) :: Nil,
-    "(strcol = \"1\" or strcol = \"2\")")
+    "(strcol) in (\"1\", \"2\")")
 
   filterTest("skip varchar",
     (Literal("") === a("varchar", StringType)) :: Nil,
@@ -97,7 +97,7 @@ class FiltersSuite extends SparkFunSuite with PlanTest {
 
   filterTest("SPARK-24879 null literals should be ignored for IN constructs",
     (a("intcol", IntegerType) in (Literal(1), Literal(null))) :: Nil,
-    "(intcol = 1)")
+    "(intcol) in (1)")
 
   filterTest("NOT: int and string filters",
     (a("intcol", IntegerType) =!= Literal(1)) :: (Literal("a") =!= a("strcol", IntegerType)) :: Nil,
@@ -109,7 +109,7 @@ class FiltersSuite extends SparkFunSuite with PlanTest {
 
   filterTest("not-in, string filter",
     (Not(In(a("strcol", StringType), Seq(Literal("a"), Literal("b"))))) :: Nil,
-    """(strcol != "a" and strcol != "b")""")
+    """(strcol) not in ("a", "b")""")
 
   filterTest("not-in, string filter with null",
     (Not(In(a("strcol", StringType), Seq(Literal("a"), Literal("b"), Literal(null))))) :: Nil,
@@ -118,7 +118,7 @@ class FiltersSuite extends SparkFunSuite with PlanTest {
   filterTest("not-in, date filter",
     (Not(In(a("datecol", DateType),
       Seq(Literal(Date.valueOf("2021-01-01")), Literal(Date.valueOf("2021-01-02")))))) :: Nil,
-    """(datecol != "2021-01-01" and datecol != "2021-01-02")""")
+    """(datecol) not in ("2021-01-01", "2021-01-02")""")
 
   filterTest("not-in, date filter with null",
     (Not(In(a("datecol", DateType),
@@ -128,7 +128,7 @@ class FiltersSuite extends SparkFunSuite with PlanTest {
 
   filterTest("not-inset, string filter",
     (Not(InSet(a("strcol", StringType), Set(Literal("a").eval(), Literal("b").eval())))) :: Nil,
-    """(strcol != "a" and strcol != "b")""")
+    """(strcol) not in ("a", "b")""")
 
   filterTest("not-inset, string filter with null",
     (Not(InSet(a("strcol", StringType),
@@ -139,7 +139,7 @@ class FiltersSuite extends SparkFunSuite with PlanTest {
     (Not(InSet(a("datecol", DateType),
       Set(Literal(Date.valueOf("2020-01-01")).eval(),
         Literal(Date.valueOf("2020-01-02")).eval())))) :: Nil,
-    """(datecol != "2020-01-01" and datecol != "2020-01-02")""")
+    """(datecol) not in ("2020-01-01", "2020-01-02")""")
 
   filterTest("not-inset, date filter with null",
     (Not(InSet(a("datecol", DateType),
@@ -239,14 +239,14 @@ class FiltersSuite extends SparkFunSuite with PlanTest {
     withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING_INSET_THRESHOLD.key -> "3") {
       val intFilter = InSet(a("p", IntegerType), Set(null, 1, 2))
       val intConverted = shim.convertFilters(testTable, Seq(intFilter))
-      assert(intConverted == "(p = 1 or p = 2)")
+      assert(intConverted == "(p) in (1, 2)")
     }
 
     withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING_INSET_THRESHOLD.key -> "3") {
      val dateFilter = InSet(a("p", DateType), Set(null,
        Literal(Date.valueOf("2020-01-01")).eval(), Literal(Date.valueOf("2021-01-01")).eval()))
      val dateConverted = shim.convertFilters(testTable, Seq(dateFilter))
-      assert(dateConverted == "(p = \"2020-01-01\" or p = \"2021-01-01\")")
+      assert(dateConverted == "(p) in (\"2020-01-01\", \"2021-01-01\")")
     }
   }
 
```
