Skip to content

Commit 227515c

Browse files
pavle-martinovic_data and cloud-fan
authored and committed
[SQL][MINOR] Inline recursive CTEs
### What changes were proposed in this pull request? Remove check that prevents recursive CTEs from being inlined. ### Why are the changes needed? Because of LIMITs being able to stop recursive CTEs, we don't want to put repartition by expression above them so that LocalLimit can be pushed down to UnionLoop. This prevents infinite recursion when using LIMIT to stop infinite recursion for an rCTE called twice. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New golden file tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51170 from Pajaraja/pavle-martinovic_data/InlinerCTE. Authored-by: pavle-martinovic_data <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent 2768c2e commit 227515c

File tree

4 files changed

+289
-5
lines changed

4 files changed

+289
-5
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InlineCTE.scala

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -61,10 +61,7 @@ case class InlineCTE(
6161
// 1) It is fine to inline a CTE if it references another CTE that is non-deterministic;
6262
// 2) Any `CTERelationRef` that contains `OuterReference` would have been inlined first.
6363
refCount == 1 ||
64-
// Don't inline recursive CTEs if not necessary as recursion is very costly.
65-
// The check if cteDef is recursive is performed by checking if it contains
66-
// a UnionLoopRef with the same ID.
67-
(cteDef.deterministic && !cteDef.hasSelfReferenceAsUnionLoopRef) ||
64+
cteDef.deterministic ||
6865
cteDef.child.exists(_.expressions.exists(_.isInstanceOf[OuterReference]))
6966
}
7067

sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out

Lines changed: 157 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1854,3 +1854,160 @@ WithCTE
18541854
+- Project [n#x]
18551855
+- SubqueryAlias t1
18561856
+- CTERelationRef xxxx, true, [n#x], false, false
1857+
1858+
1859+
-- !query
1860+
WITH RECURSIVE t1(n) AS (
1861+
SELECT 1
1862+
UNION ALL
1863+
SELECT n + 1 FROM t1 WHERE n < 5
1864+
)
1865+
SELECT (SELECT SUM(n) FROM (SELECT * FROM t1)), (SELECT SUM(n) FROM (SELECT * FROM t1 LIMIT 3))
1866+
-- !query analysis
1867+
WithCTE
1868+
:- CTERelationDef xxxx, false
1869+
: +- SubqueryAlias t1
1870+
: +- Project [1#x AS n#x]
1871+
: +- UnionLoop xxxx
1872+
: :- Project [1 AS 1#x]
1873+
: : +- OneRowRelation
1874+
: +- Project [(n#x + 1) AS (n + 1)#x]
1875+
: +- Filter (n#x < 5)
1876+
: +- SubqueryAlias t1
1877+
: +- Project [1#x AS n#x]
1878+
: +- UnionLoopRef xxxx, [1#x], false
1879+
+- Project [scalar-subquery#x [] AS scalarsubquery()#xL, scalar-subquery#x [] AS scalarsubquery()#xL]
1880+
: :- Aggregate [sum(n#x) AS sum(n)#xL]
1881+
: : +- SubqueryAlias __auto_generated_subquery_name
1882+
: : +- Project [n#x]
1883+
: : +- SubqueryAlias t1
1884+
: : +- CTERelationRef xxxx, true, [n#x], false, false
1885+
: +- Aggregate [sum(n#x) AS sum(n)#xL]
1886+
: +- SubqueryAlias __auto_generated_subquery_name
1887+
: +- GlobalLimit 3
1888+
: +- LocalLimit 3
1889+
: +- Project [n#x]
1890+
: +- SubqueryAlias t1
1891+
: +- CTERelationRef xxxx, true, [n#x], false, false
1892+
+- OneRowRelation
1893+
1894+
1895+
-- !query
1896+
WITH RECURSIVE t1(n) AS (
1897+
SELECT 1
1898+
UNION ALL
1899+
SELECT n + 1 FROM t1
1900+
)
1901+
SELECT (SELECT SUM(n) FROM (SELECT * FROM t1 LIMIT 5)), (SELECT SUM(n) FROM (SELECT * FROM t1 LIMIT 3))
1902+
-- !query analysis
1903+
WithCTE
1904+
:- CTERelationDef xxxx, false
1905+
: +- SubqueryAlias t1
1906+
: +- Project [1#x AS n#x]
1907+
: +- UnionLoop xxxx
1908+
: :- Project [1 AS 1#x]
1909+
: : +- OneRowRelation
1910+
: +- Project [(n#x + 1) AS (n + 1)#x]
1911+
: +- SubqueryAlias t1
1912+
: +- Project [1#x AS n#x]
1913+
: +- UnionLoopRef xxxx, [1#x], false
1914+
+- Project [scalar-subquery#x [] AS scalarsubquery()#xL, scalar-subquery#x [] AS scalarsubquery()#xL]
1915+
: :- Aggregate [sum(n#x) AS sum(n)#xL]
1916+
: : +- SubqueryAlias __auto_generated_subquery_name
1917+
: : +- GlobalLimit 5
1918+
: : +- LocalLimit 5
1919+
: : +- Project [n#x]
1920+
: : +- SubqueryAlias t1
1921+
: : +- CTERelationRef xxxx, true, [n#x], false, false
1922+
: +- Aggregate [sum(n#x) AS sum(n)#xL]
1923+
: +- SubqueryAlias __auto_generated_subquery_name
1924+
: +- GlobalLimit 3
1925+
: +- LocalLimit 3
1926+
: +- Project [n#x]
1927+
: +- SubqueryAlias t1
1928+
: +- CTERelationRef xxxx, true, [n#x], false, false
1929+
+- OneRowRelation
1930+
1931+
1932+
-- !query
1933+
WITH RECURSIVE t1(n) AS (
1934+
SELECT 1
1935+
UNION ALL
1936+
SELECT n + 1 FROM t1
1937+
), t2(m) AS (
1938+
SELECT (SELECT SUM(n) FROM (SELECT n FROM t1 LIMIT 10) AS sums)
1939+
UNION ALL
1940+
SELECT m + (SELECT SUM(n) FROM (SELECT n FROM t1 LIMIT 3) AS sums) FROM t2
1941+
)
1942+
SELECT * FROM t2 LIMIT 20
1943+
-- !query analysis
1944+
WithCTE
1945+
:- CTERelationDef xxxx, false
1946+
: +- SubqueryAlias t1
1947+
: +- Project [1#x AS n#x]
1948+
: +- UnionLoop xxxx
1949+
: :- Project [1 AS 1#x]
1950+
: : +- OneRowRelation
1951+
: +- Project [(n#x + 1) AS (n + 1)#x]
1952+
: +- SubqueryAlias t1
1953+
: +- Project [1#x AS n#x]
1954+
: +- UnionLoopRef xxxx, [1#x], false
1955+
:- CTERelationDef xxxx, false
1956+
: +- SubqueryAlias t2
1957+
: +- Project [scalarsubquery()#xL AS m#xL]
1958+
: +- UnionLoop xxxx
1959+
: :- Project [scalar-subquery#x [] AS scalarsubquery()#xL]
1960+
: : : +- Aggregate [sum(n#x) AS sum(n)#xL]
1961+
: : : +- SubqueryAlias sums
1962+
: : : +- GlobalLimit 10
1963+
: : : +- LocalLimit 10
1964+
: : : +- Project [n#x]
1965+
: : : +- SubqueryAlias t1
1966+
: : : +- CTERelationRef xxxx, true, [n#x], false, false
1967+
: : +- OneRowRelation
1968+
: +- Project [(m#xL + scalar-subquery#x []) AS (m + scalarsubquery())#xL]
1969+
: : +- Aggregate [sum(n#x) AS sum(n)#xL]
1970+
: : +- SubqueryAlias sums
1971+
: : +- GlobalLimit 3
1972+
: : +- LocalLimit 3
1973+
: : +- Project [n#x]
1974+
: : +- SubqueryAlias t1
1975+
: : +- CTERelationRef xxxx, true, [n#x], false, false
1976+
: +- SubqueryAlias t2
1977+
: +- Project [scalarsubquery()#xL AS m#xL]
1978+
: +- UnionLoopRef xxxx, [scalarsubquery()#xL], false
1979+
+- GlobalLimit 20
1980+
+- LocalLimit 20
1981+
+- Project [m#xL]
1982+
+- SubqueryAlias t2
1983+
+- CTERelationRef xxxx, true, [m#xL], false, false
1984+
1985+
1986+
-- !query
1987+
WITH RECURSIVE t1(n) AS (
1988+
SELECT 1
1989+
UNION ALL
1990+
SELECT n + 1 FROM t1
1991+
)
1992+
((SELECT n FROM t1) UNION ALL (SELECT n FROM t1)) LIMIT 20
1993+
-- !query analysis
1994+
WithCTE
1995+
:- CTERelationDef xxxx, false
1996+
: +- SubqueryAlias t1
1997+
: +- Project [1#x AS n#x]
1998+
: +- UnionLoop xxxx
1999+
: :- Project [1 AS 1#x]
2000+
: : +- OneRowRelation
2001+
: +- Project [(n#x + 1) AS (n + 1)#x]
2002+
: +- SubqueryAlias t1
2003+
: +- Project [1#x AS n#x]
2004+
: +- UnionLoopRef xxxx, [1#x], false
2005+
+- GlobalLimit 20
2006+
+- LocalLimit 20
2007+
+- Union false, false
2008+
:- Project [n#x]
2009+
: +- SubqueryAlias t1
2010+
: +- CTERelationRef xxxx, true, [n#x], false, false
2011+
+- Project [n#x]
2012+
+- SubqueryAlias t1
2013+
+- CTERelationRef xxxx, true, [n#x], false, false

sql/core/src/test/resources/sql-tests/inputs/cte-recursion.sql

Lines changed: 37 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -689,4 +689,40 @@ WITH RECURSIVE t1(n) AS (
689689
UNION ALL
690690
SELECT CASE WHEN n < 5 THEN n + 1 ELSE NULL END FROM t1
691691
)
692-
SELECT * FROM t1 LIMIT 25;
692+
SELECT * FROM t1 LIMIT 25;
693+
694+
-- Two calls to same rCTE with and without limit
695+
WITH RECURSIVE t1(n) AS (
696+
SELECT 1
697+
UNION ALL
698+
SELECT n + 1 FROM t1 WHERE n < 5
699+
)
700+
SELECT (SELECT SUM(n) FROM (SELECT * FROM t1)), (SELECT SUM(n) FROM (SELECT * FROM t1 LIMIT 3));
701+
702+
-- Two calls to same infinite rCTE with different limits
703+
WITH RECURSIVE t1(n) AS (
704+
SELECT 1
705+
UNION ALL
706+
SELECT n + 1 FROM t1
707+
)
708+
SELECT (SELECT SUM(n) FROM (SELECT * FROM t1 LIMIT 5)), (SELECT SUM(n) FROM (SELECT * FROM t1 LIMIT 3));
709+
710+
-- Two calls to same infinite rCTE from another rCTE
711+
WITH RECURSIVE t1(n) AS (
712+
SELECT 1
713+
UNION ALL
714+
SELECT n + 1 FROM t1
715+
), t2(m) AS (
716+
SELECT (SELECT SUM(n) FROM (SELECT n FROM t1 LIMIT 10) AS sums)
717+
UNION ALL
718+
SELECT m + (SELECT SUM(n) FROM (SELECT n FROM t1 LIMIT 3) AS sums) FROM t2
719+
)
720+
SELECT * FROM t2 LIMIT 20;
721+
722+
-- Two calls to recursive CTE with single limit pushed to both
723+
WITH RECURSIVE t1(n) AS (
724+
SELECT 1
725+
UNION ALL
726+
SELECT n + 1 FROM t1
727+
)
728+
((SELECT n FROM t1) UNION ALL (SELECT n FROM t1)) LIMIT 20

sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out

Lines changed: 94 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1710,3 +1710,97 @@ NULL
17101710
NULL
17111711
NULL
17121712
NULL
1713+
1714+
1715+
-- !query
1716+
WITH RECURSIVE t1(n) AS (
1717+
SELECT 1
1718+
UNION ALL
1719+
SELECT n + 1 FROM t1 WHERE n < 5
1720+
)
1721+
SELECT (SELECT SUM(n) FROM (SELECT * FROM t1)), (SELECT SUM(n) FROM (SELECT * FROM t1 LIMIT 3))
1722+
-- !query schema
1723+
struct<scalarsubquery():bigint,scalarsubquery():bigint>
1724+
-- !query output
1725+
15 6
1726+
1727+
1728+
-- !query
1729+
WITH RECURSIVE t1(n) AS (
1730+
SELECT 1
1731+
UNION ALL
1732+
SELECT n + 1 FROM t1
1733+
)
1734+
SELECT (SELECT SUM(n) FROM (SELECT * FROM t1 LIMIT 5)), (SELECT SUM(n) FROM (SELECT * FROM t1 LIMIT 3))
1735+
-- !query schema
1736+
struct<scalarsubquery():bigint,scalarsubquery():bigint>
1737+
-- !query output
1738+
15 6
1739+
1740+
1741+
-- !query
1742+
WITH RECURSIVE t1(n) AS (
1743+
SELECT 1
1744+
UNION ALL
1745+
SELECT n + 1 FROM t1
1746+
), t2(m) AS (
1747+
SELECT (SELECT SUM(n) FROM (SELECT n FROM t1 LIMIT 10) AS sums)
1748+
UNION ALL
1749+
SELECT m + (SELECT SUM(n) FROM (SELECT n FROM t1 LIMIT 3) AS sums) FROM t2
1750+
)
1751+
SELECT * FROM t2 LIMIT 20
1752+
-- !query schema
1753+
struct<m:bigint>
1754+
-- !query output
1755+
103
1756+
109
1757+
115
1758+
121
1759+
127
1760+
133
1761+
139
1762+
145
1763+
151
1764+
157
1765+
163
1766+
169
1767+
55
1768+
61
1769+
67
1770+
73
1771+
79
1772+
85
1773+
91
1774+
97
1775+
1776+
1777+
-- !query
1778+
WITH RECURSIVE t1(n) AS (
1779+
SELECT 1
1780+
UNION ALL
1781+
SELECT n + 1 FROM t1
1782+
)
1783+
((SELECT n FROM t1) UNION ALL (SELECT n FROM t1)) LIMIT 20
1784+
-- !query schema
1785+
struct<n:int>
1786+
-- !query output
1787+
1
1788+
10
1789+
11
1790+
12
1791+
13
1792+
14
1793+
15
1794+
16
1795+
17
1796+
18
1797+
19
1798+
2
1799+
20
1800+
3
1801+
4
1802+
5
1803+
6
1804+
7
1805+
8
1806+
9

0 commit comments

Comments
 (0)