[SDP] Validate streaming-ness of DFs returned by SDP table and standalone flow definitions #51208

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
Wants to merge 1 commit into base: master
27 changes: 27 additions & 0 deletions common/utils/src/main/resources/error/error-conditions.json
@@ -2719,6 +2719,33 @@
],
"sqlState" : "42000"
},
"INVALID_FLOW_RELATION_TYPE" : {
"message" : [
"Flow <flowIdentifier> returns an invalid relation type."
],
"subClass" : {
"FOR_MATERIALIZED_VIEW" : {
"message" : [
"Materialized views may only be defined by a batch relation, but the flow <flowIdentifier> attempts to write a streaming relation to the materialized view <tableIdentifier>."
]
},
"FOR_ONCE_FLOW" : {
"message" : [
"<flowIdentifier> is an append once-flow that is defined by a streaming relation. Append once-flows may only be defined by or return a batch relation."
]
},
"FOR_PERSISTED_VIEW" : {
"message" : [
"Persisted views may only be defined by a batch relation, but the flow <flowIdentifier> attempts to write a streaming relation to the persisted view <viewIdentifier>."
]
},
"FOR_STREAMING_TABLE" : {
"message" : [
"Streaming tables may only be defined by streaming relations, but the flow <flowIdentifier> attempts to write a batch relation to the streaming table <tableIdentifier>. Consider using the STREAM operator in Spark-SQL to convert the batch relation into a streaming relation, or populating the streaming table with an append once-flow instead."
]
}
}
},
"INVALID_FORMAT" : {
"message" : [
"The format is invalid: <format>."
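Editor's note: the new error conditions above encode a simple rule: streaming tables must be written by streaming relations (or by an append once-flow), while materialized views and persisted views must be written by batch relations. Below is a minimal sketch of definitions that pass the new validation. It is not part of this change; it assumes the TestGraphRegistrationContext helpers (registerTable, registerMaterializedView, registerFlow, dfFlowFunc), the MemoryStream type, and the imports used by ConnectInvalidPipelineSuite later in this diff, and the dataset names are illustrative.

// Sketch only: relies on the test helpers exercised in ConnectInvalidPipelineSuite.
val session = spark
import session.implicits._

val validGraph = new TestGraphRegistrationContext(spark) {
  // Streaming table fed by a streaming relation: satisfies the FOR_STREAMING_TABLE rule.
  registerTable("events", query = Option(dfFlowFunc(MemoryStream[Int].toDF())))

  // Materialized view fed by a batch relation: satisfies the FOR_MATERIALIZED_VIEW rule.
  registerMaterializedView("events_summary", query = dfFlowFunc(Seq(1, 2, 3).toDF()))

  // Streaming table populated only by an append once-flow over a batch relation:
  // satisfies the FOR_ONCE_FLOW rule.
  registerTable("backfilled")
  registerFlow(
    destinationName = "backfilled",
    name = "initial_load",
    query = dfFlowFunc(Seq(4, 5, 6).toDF()),
    once = true)
}.resolveToDataflowGraph()

validGraph.validate() // no INVALID_FLOW_RELATION_TYPE error is thrown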
@@ -161,7 +161,7 @@ private[connect] object PipelinesHandler extends Logging {
language = Option(Python())),
format = Option.when(dataset.hasFormat)(dataset.getFormat),
normalizedPath = None,
isStreamingTableOpt = None))
isStreamingTable = dataset.getDatasetType == proto.DatasetType.TABLE))
case proto.DatasetType.TEMPORARY_VIEW =>
val viewIdentifier =
GraphIdentifierManager.parseTableIdentifier(dataset.getDatasetName, sparkSession)
@@ -90,10 +90,11 @@ class PythonPipelineSuite
}

test("basic") {
val graph = buildGraph("""
val graph = buildGraph(
"""
|@sdp.table
|def table1():
| return spark.range(10)
| return spark.readStream.format("rate").load()
|""".stripMargin)
.resolve()
.validate()
@@ -112,11 +113,11 @@
|def c():
| return spark.readStream.table("a")
|
|@sdp.table()
|@sdp.materialized_view()
|def d():
| return spark.read.table("a")
|
|@sdp.table()
|@sdp.materialized_view()
|def a():
| return spark.range(5)
|""".stripMargin)
@@ -177,11 +178,11 @@
test("referencing external datasets") {
sql("CREATE TABLE spark_catalog.default.src AS SELECT * FROM RANGE(5)")
val graph = buildGraph("""
|@sdp.table
|@sdp.materialized_view
|def a():
| return spark.read.table("spark_catalog.default.src")
|
|@sdp.table
|@sdp.materialized_view
|def b():
| return spark.table("spark_catalog.default.src")
|
@@ -230,11 +231,11 @@
|def a():
| return spark.read.table("spark_catalog.default.src")
|
|@sdp.table
|@sdp.materialized_view
|def b():
| return spark.table("spark_catalog.default.src")
|
|@sdp.table
|@sdp.materialized_view
|def c():
| return spark.readStream.table("spark_catalog.default.src")
|""".stripMargin).resolve()
@@ -206,7 +206,7 @@ class SparkDeclarativePipelinesServerSuite
sql = Some("SELECT * FROM STREAM tableA"))
createTable(
name = "tableC",
datasetType = DatasetType.TABLE,
datasetType = DatasetType.MATERIALIZED_VIEW,
sql = Some("SELECT * FROM tableB"))
}

@@ -238,7 +238,7 @@ class SparkDeclarativePipelinesServerSuite
createView(name = "viewC", sql = "SELECT * FROM curr.tableB")
createTable(
name = "other.tableD",
datasetType = proto.DatasetType.TABLE,
datasetType = proto.DatasetType.MATERIALIZED_VIEW,
sql = Some("SELECT * FROM viewC"))
}

@@ -80,12 +80,6 @@ class CoreDataflowNodeProcessor(rawGraph: DataflowGraph) {
val resolvedFlowsToTable = flowsToTable.map { flow =>
resolvedFlowNodesMap.get(flow.identifier)
}

// Assign isStreamingTable (MV or ST) to the table based on the resolvedFlowsToTable
val tableWithType = table.copy(
isStreamingTableOpt = Option(resolvedFlowsToTable.exists(f => f.df.isStreaming))
)

// We mark all tables as virtual to ensure resolution uses incoming flows
// rather than previously materialized tables.
val virtualTableInput = VirtualTableInput(
Expand All @@ -95,7 +89,7 @@ class CoreDataflowNodeProcessor(rawGraph: DataflowGraph) {
availableFlows = resolvedFlowsToTable
)
resolvedInputs.put(table.identifier, virtualTableInput)
Seq(tableWithType)
Seq(table)
case view: View =>
// For view, add the flow to resolvedInputs and return empty.
require(upstreamNodes.size == 1, "Found multiple flows to view")
@@ -191,6 +191,7 @@ case class DataflowGraph(flows: Seq[Flow], tables: Seq[Table], views: Seq[View])
validatePersistedViewSources()
validateEveryDatasetHasFlow()
validateTablesAreResettable()
validateFlowStreamingness()
inferredSchema
}.failed

@@ -178,15 +178,15 @@ object DatasetManager extends Logging {
}

// Wipe the data if we need to
if ((isFullRefresh || !table.isStreamingTableOpt.get) && existingTableOpt.isDefined) {
if ((isFullRefresh || !table.isStreamingTable) && existingTableOpt.isDefined) {
context.spark.sql(s"TRUNCATE TABLE ${table.identifier.quotedString}")
}

// Alter the table if we need to
if (existingTableOpt.isDefined) {
val existingSchema = existingTableOpt.get.schema()

val targetSchema = if (table.isStreamingTableOpt.get && !isFullRefresh) {
val targetSchema = if (table.isStreamingTable && !isFullRefresh) {
SchemaMergingUtils.mergeSchemas(existingSchema, outputSchema)
} else {
outputSchema
@@ -55,6 +55,75 @@ trait GraphValidations extends Logging {
multiQueryTables
}

protected[graph] def validateFlowStreamingness(): Unit = {
flowsTo.foreach { case (destTableIdentifier, flows) =>
val destTableOpt = table.get(destTableIdentifier)

// If the destination identifier does not correspond to a table, it must be a view.
val destViewOpt = destTableOpt.fold(view.get(destTableIdentifier))(_ => None)

flows.foreach {
case resolvedFlow: ResolvedFlow =>
// A flow must be successfully analyzed, thus resolved, in order to determine if it is
// streaming or not. Unresolved flows will throw an exception anyway via
// [[validateSuccessfulFlowAnalysis]], so don't check them here.
if (resolvedFlow.once) {
// Once flows by definition should be batch flows, not streaming.
if (resolvedFlow.df.isStreaming) {
throw new AnalysisException(
errorClass = "INVALID_FLOW_RELATION_TYPE.FOR_ONCE_FLOW",
messageParameters = Map(
"flowIdentifier" -> resolvedFlow.identifier.quotedString
)
)
}
} else {
destTableOpt.foreach { destTable =>
if (destTable.isStreamingTable) {
if (!resolvedFlow.df.isStreaming) {
throw new AnalysisException(
errorClass = "INVALID_FLOW_RELATION_TYPE.FOR_STREAMING_TABLE",
messageParameters = Map(
"flowIdentifier" -> resolvedFlow.identifier.quotedString,
"tableIdentifier" -> destTableIdentifier.quotedString
)
)
}
} else {
if (resolvedFlow.df.isStreaming) {
// This check intentionally does NOT prevent materialized views from reading from
// a streaming table using a _batch_ read, which is still considered valid.
throw new AnalysisException(
errorClass = "INVALID_FLOW_RELATION_TYPE.FOR_MATERIALIZED_VIEW",
messageParameters = Map(
"flowIdentifier" -> resolvedFlow.identifier.quotedString,
"tableIdentifier" -> destTableIdentifier.quotedString
)
)
}
}
}

destViewOpt.foreach {
case _: PersistedView =>
if (resolvedFlow.df.isStreaming) {
throw new AnalysisException(
errorClass = "INVALID_FLOW_RELATION_TYPE.FOR_PERSISTED_VIEW",
messageParameters = Map(
"flowIdentifier" -> resolvedFlow.identifier.quotedString,
"viewIdentifier" -> destTableIdentifier.quotedString
)
)
}
case _: TemporaryView =>
// Temporary views' flows are allowed to be either streaming or batch, so no
// validation needs to be done for them
}
}
}
}
}

/** Throws an exception if the flows in this graph are not topologically sorted. */
protected[graph] def validateGraphIsTopologicallySorted(): Unit = {
val visitedNodes = mutable.Set.empty[TableIdentifier] // Set of visited nodes
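Editor's note: one subtlety called out in the comment inside validateFlowStreamingness above is that the FOR_MATERIALIZED_VIEW check only rejects streaming relations, so a materialized view may still consume a streaming table through a batch read. A rough sketch of that allowed shape, assuming the buildGraph helper and sdp decorators from PythonPipelineSuite earlier in this diff; dataset names are illustrative and this is not part of the change.

// Sketch only: mirrors the PythonPipelineSuite style of graph construction.
val graph = buildGraph("""
  |@sdp.table
  |def events():
  |    return spark.readStream.format("rate").load()
  |
  |@sdp.materialized_view
  |def events_snapshot():
  |    # Batch read of the streaming table above; allowed by design.
  |    return spark.read.table("events")
  |""".stripMargin)
  .resolve()
  .validate()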
@@ -199,7 +199,7 @@ class SqlGraphRegistrationContext(
),
format = cst.tableSpec.provider,
normalizedPath = None,
isStreamingTableOpt = None
isStreamingTable = true
)
)
}
@@ -230,7 +230,7 @@ class SqlGraphRegistrationContext(
),
format = cst.tableSpec.provider,
normalizedPath = None,
isStreamingTableOpt = None
isStreamingTable = true
)
)

@@ -281,7 +281,7 @@ class SqlGraphRegistrationContext(
),
format = cmv.tableSpec.provider,
normalizedPath = None,
isStreamingTableOpt = None
isStreamingTable = false
)
)

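Editor's note: with this change the flag is fixed at registration time rather than inferred later: the streaming-table CREATE statements handled above set isStreamingTable = true and the materialized-view statement sets it to false, and validateFlowStreamingness then enforces that the backing query matches. A sketch of a rejected and an accepted registration, reusing the createTable helper shown in SparkDeclarativePipelinesServerSuite above (enclosing test scaffolding omitted; table names are illustrative).

// Sketch only: helper usage follows SparkDeclarativePipelinesServerSuite.
// Rejected: DatasetType.TABLE declares a streaming table, but the plain SELECT
// analyzes to a batch relation, so graph validation fails with
// INVALID_FLOW_RELATION_TYPE.FOR_STREAMING_TABLE.
createTable(
  name = "tableB",
  datasetType = proto.DatasetType.TABLE,
  sql = Some("SELECT * FROM tableA"))

// Accepted: STREAM makes the source a streaming relation, matching the
// streaming-table destination.
createTable(
  name = "tableB",
  datasetType = proto.DatasetType.TABLE,
  sql = Some("SELECT * FROM STREAM tableA"))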
@@ -114,8 +114,7 @@ sealed trait TableInput extends Input {
* path (if not defined, we will normalize a managed storage path for it).
* @param properties Table Properties to set in table metadata.
* @param comment User-specified comment that can be placed on the table.
* @param isStreamingTableOpt if the table is a streaming table, will be None until we have resolved
* flows into table
* @param isStreamingTable if the table is a streaming table, as defined by the source code.
*/
case class Table(
identifier: TableIdentifier,
@@ -125,7 +124,7 @@ case class Table(
properties: Map[String, String] = Map.empty,
comment: Option[String],
baseOrigin: QueryOrigin,
isStreamingTableOpt: Option[Boolean],
isStreamingTable: Boolean,
format: Option[String]
) extends TableInput
with Output {
@@ -163,17 +162,6 @@ case class Table(
normalizedPath.get
}

/**
* Tell if a table is a streaming table or not. This property is not set until we have resolved
* the flows into the table. The exception reminds engineers that they cant call at random time.
*/
def isStreamingTable: Boolean = isStreamingTableOpt.getOrElse {
throw new IllegalStateException(
"Cannot identify whether the table is streaming table or not. You may need to resolve the " +
"flows into table."
)
}

/**
* Get the DatasetType of the table
*/
@@ -429,6 +429,77 @@ class ConnectInvalidPipelineSuite extends PipelineTest {
)
}

test("Streaming table backed by batch relation fails validation") {
val session = spark
import session.implicits._

val graph = new TestGraphRegistrationContext(spark) {
registerTable("a", query = Option(dfFlowFunc(Seq(1, 2).toDF())))
}.resolveToDataflowGraph()

val ex = intercept[AnalysisException] {
graph.validate()
}

checkError(
exception = ex,
condition = "INVALID_FLOW_RELATION_TYPE.FOR_STREAMING_TABLE",
parameters = Map(
"flowIdentifier" -> fullyQualifiedIdentifier("a").quotedString,
"tableIdentifier" -> fullyQualifiedIdentifier("a").quotedString
)
)
}

test("Materialized view backed by streaming relation fails validation") {
val session = spark
import session.implicits._

val graph = new TestGraphRegistrationContext(spark) {
registerMaterializedView("a", query = dfFlowFunc(MemoryStream[Int].toDF()))
}.resolveToDataflowGraph()

val ex = intercept[AnalysisException] {
graph.validate()
}

checkError(
exception = ex,
condition = "INVALID_FLOW_RELATION_TYPE.FOR_MATERIALIZED_VIEW",
parameters = Map(
"flowIdentifier" -> fullyQualifiedIdentifier("a").quotedString,
"tableIdentifier" -> fullyQualifiedIdentifier("a").quotedString
)
)
}

test("Once flow backed by streaming relation fails validation") {
val session = spark
import session.implicits._

val graph = new TestGraphRegistrationContext(spark) {
registerTable("a")
registerFlow(
destinationName = "a",
name = "once_flow",
query = dfFlowFunc(MemoryStream[Int].toDF()),
once = true
)
}.resolveToDataflowGraph()

val ex = intercept[AnalysisException] {
graph.validate()
}

checkError(
exception = ex,
condition = "INVALID_FLOW_RELATION_TYPE.FOR_ONCE_FLOW",
parameters = Map(
"flowIdentifier" -> fullyQualifiedIdentifier("once_flow").quotedString
)
)
}

test("Inferred schema that isn't a subset of user-specified schema") {
val session = spark
import session.implicits._