@@ -20,7 +20,9 @@ import org.apache.spark.sql.delta.actions.AddFile
20
20
import org .apache .spark .sql .delta .stats .DeltaDataSkippingType .DeltaDataSkippingType
21
21
import com .fasterxml .jackson .databind .annotation .JsonDeserialize
22
22
23
+ import org .apache .spark .sql .catalyst .analysis .UnresolvedAttribute
23
24
import org .apache .spark .sql .catalyst .expressions ._
25
+ import org .apache .spark .sql .types .StructType
24
26
25
27
/**
26
28
* DataSize describes following attributes for data that consists of a list of input files
@@ -82,4 +84,54 @@ case class DeltaScan(
82
84
val scanDurationMs : Long ,
83
85
val dataSkippingType : DeltaDataSkippingType ) {
84
86
/**
 * All filters associated with this scan: the partition filters and data filters combined
 * with the filters that were not used.
 */
def allFilters: ExpressionSet = {
  val appliedFilters = partitionFilters ++ dataFilters
  appliedFilters ++ unusedFilters
}
87
+
88
/**
 * Checks whether the given filters are equivalent to the filters of this DeltaScan.
 *
 * The filters may have been captured after optimization, in which case schema pruning can
 * leave nested fields with differing schemas. To make the comparison insensitive to that,
 * nested field accesses are rewritten to UnresolvedAttributes on both sides before comparing.
 *
 * @param other ExpressionSet to compare against this scan's filters
 * @return true when the expressions match once nested schemas are ignored
 */
def filtersMatch(other: ExpressionSet): Boolean = {
  DeltaScan.filtersMatch(source = allFilters, target = other)
}
97
+ }
98
+
99
object DeltaScan {

  /**
   * Builds a single-field schema that follows the given ordinal path through `source`.
   *
   * The first ordinal selects a field of `source`; any remaining ordinals are applied
   * recursively to that field's (struct) data type. Every level of the result keeps only the
   * selected field.
   *
   * @param source schema to select from
   * @param ordinals field positions to follow, outermost first; must be non-empty
   * @return a StructType containing only the selected field, narrowed along the path
   */
  private def constructSchema(source: StructType, ordinals: Seq[Int]): StructType = {
    val selected = source.fields(ordinals.head)
    val remaining = ordinals.tail
    val narrowedType =
      if (remaining.isEmpty) {
        // End of the path: keep the field's type untouched.
        selected.dataType
      } else {
        // More ordinals to follow, so the selected field is expected to be a struct.
        constructSchema(selected.dataType.asInstanceOf[StructType], remaining)
      }
    StructType(Seq(selected.copy(dataType = narrowedType)))
  }

  /**
   * Rewrites every nested field access recognized by NestedFieldExtraction into an
   * UnresolvedAttribute carrying the same name parts.
   */
  private def pruneExpression(expr: Expression): Expression = {
    expr.transform {
      case NestedFieldExtraction(fieldPath) => new UnresolvedAttribute(fieldPath)
    }
  }

  /**
   * Compares two filter sets after replacing nested field accesses with
   * UnresolvedAttributes on both sides.
   *
   * @param source first set of filters
   * @param target second set of filters
   * @return true when the rewritten sets are equal
   */
  private[delta] def filtersMatch(source: ExpressionSet, target: ExpressionSet): Boolean = {
    val normalizedSource = source.map(pruneExpression _)
    val normalizedTarget = target.map(pruneExpression _)
    normalizedSource == normalizedTarget
  }
}
121
+
122
/**
 * Extractor for nested field accesses: a chain of GetStructField expressions rooted at a
 * struct-typed AttributeReference. On a match it yields the name parts of the access path,
 * outermost first (the attribute name followed by each accessed field name).
 */
object NestedFieldExtraction {

  /**
   * @param e expression to inspect
   * @return the name parts of the nested access, or None when `e` is not a GetStructField
   *         chain rooted at a struct-typed AttributeReference
   */
  def unapply(e: Expression): Option[Seq[String]] = e match {
    case GetStructField(child, ordinal, _) =>
      // Resolve the child's path first and only look at its schema when the child is itself
      // part of a recognized chain. Non-matching children therefore never trigger the
      // dataType lookup / StructType cast and simply yield None.
      unapply(child).map { nameParts =>
        val childSchema = child.dataType.asInstanceOf[StructType]
        nameParts :+ childSchema.fields(ordinal).name
      }
    case a: AttributeReference if a.dataType.isInstanceOf[StructType] =>
      // Root of a chain: a top-level struct column.
      Some(Seq(a.name))
    case _ => None
  }
}
0 commit comments