
Commit 87cd48f

Committed Apr 25, 2025
[SPARK-51909][INFRA] Add a scheduled workflow for PySpark Classic-only

### What changes were proposed in this pull request?

Add a scheduled (every 3 days) workflow for PySpark Classic-only.

### Why are the changes needed?

pyspark-classic should work without connect-related dependencies; all tests should pass.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

PR builder with

```
default: '{"PYSPARK_IMAGE_TO_TEST": "python-311-classic-only", "PYTHON_TO_TEST": "python3.11"}'
```

https://github.com/zhengruifeng/spark/actions/runs/14640857368/job/41082669149

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #50703 from zhengruifeng/infra_py_classic_only.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent 2c0fc70 commit 87cd48f
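
The `default:` snippet quoted under "How was this patch tested?" refers to temporarily changing the default value of the reusable build workflow's `envs` input so the PR builder runs against the classic-only image. A minimal sketch of where such a `default:` lives (the surrounding keys are illustrative, not copied from build_and_test.yml):

```yaml
# Illustrative only: where a default override like the one quoted above would sit.
# The exact input declaration in build_and_test.yml may differ.
on:
  workflow_call:
    inputs:
      envs:
        description: Additional environment variables to set when running the tests.
        required: false
        type: string
        default: '{"PYSPARK_IMAGE_TO_TEST": "python-311-classic-only", "PYTHON_TO_TEST": "python3.11"}'
```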

5 files changed: +152 -15 lines

.github/workflows/build_infra_images_cache.yml

Lines changed: 14 additions & 0 deletions
@@ -36,6 +36,7 @@ on:
       - 'dev/spark-test-image/python-309/Dockerfile'
       - 'dev/spark-test-image/python-310/Dockerfile'
       - 'dev/spark-test-image/python-311/Dockerfile'
+      - 'dev/spark-test-image/python-311-classic-only/Dockerfile'
       - 'dev/spark-test-image/python-312/Dockerfile'
       - 'dev/spark-test-image/python-313/Dockerfile'
       - 'dev/spark-test-image/python-313-nogil/Dockerfile'
@@ -191,6 +192,19 @@ jobs:
       - name: Image digest (PySpark with Python 3.11)
         if: hashFiles('dev/spark-test-image/python-311/Dockerfile') != ''
         run: echo ${{ steps.docker_build_pyspark_python_311.outputs.digest }}
+      - name: Build and push (PySpark Classic Only with Python 3.11)
+        if: hashFiles('dev/spark-test-image/python-311-classic-only/Dockerfile') != ''
+        id: docker_build_pyspark_python_311_classic_only
+        uses: docker/build-push-action@v6
+        with:
+          context: ./dev/spark-test-image/python-311-classic-only/
+          push: true
+          tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-classic-only-cache:${{ github.ref_name }}-static
+          cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-classic-only-cache:${{ github.ref_name }}
+          cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-classic-only-cache:${{ github.ref_name }},mode=max
+      - name: Image digest (PySpark Classic Only with Python 3.11)
+        if: hashFiles('dev/spark-test-image/python-311-classic-only/Dockerfile') != ''
+        run: echo ${{ steps.docker_build_pyspark_python_311_classic_only.outputs.digest }}
       - name: Build and push (PySpark with Python 3.12)
         if: hashFiles('dev/spark-test-image/python-312/Dockerfile') != ''
         id: docker_build_pyspark_python_312

New workflow file: scheduled PySpark Classic-only build

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: "Build / Python-only Classic-only (master, Python 3.11)"
+
+on:
+  schedule:
+    - cron: '0 0 */3 * *'
+  workflow_dispatch:
+
+jobs:
+  run-build:
+    permissions:
+      packages: write
+    name: Run
+    uses: ./.github/workflows/build_and_test.yml
+    if: github.repository == 'apache/spark'
+    with:
+      java: 17
+      branch: master
+      hadoop: hadoop3
+      envs: >-
+        {
+          "PYSPARK_IMAGE_TO_TEST": "python-311-classic-only",
+          "PYTHON_TO_TEST": "python3.11"
+        }
+      jobs: >-
+        {
+          "pyspark": "true",
+          "pyspark-pandas": "true"
+        }
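
The `envs` and `jobs` values above are JSON strings handed to the reusable `build_and_test.yml` workflow. A hedged sketch of how a consuming workflow can unpack such an input with GitHub Actions' `fromJSON()` expression function (not taken from this commit; the real build_and_test.yml wiring is more involved):

```yaml
# Hypothetical consumer-side sketch: reading the JSON `envs` input passed above.
jobs:
  pyspark-tests:
    runs-on: ubuntu-latest
    env:
      PYSPARK_IMAGE_TO_TEST: ${{ fromJSON(inputs.envs).PYSPARK_IMAGE_TO_TEST }}
      PYTHON_TO_TEST: ${{ fromJSON(inputs.envs).PYTHON_TO_TEST }}
    steps:
      - run: echo "Testing with $PYTHON_TO_TEST on image $PYSPARK_IMAGE_TO_TEST"
```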

dev/spark-test-image/python-311-classic-only/Dockerfile

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Image for building and testing Spark branches. Based on Ubuntu 22.04.
+# See also in https://hub.docker.com/_/ubuntu
+FROM ubuntu:jammy-20240911.1
+LABEL org.opencontainers.image.authors="Apache Spark project <dev@spark.apache.org>"
+LABEL org.opencontainers.image.licenses="Apache-2.0"
+LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark Classic with Python 3.11"
+# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
+LABEL org.opencontainers.image.version=""
+
+ENV FULL_REFRESH_DATE=20250424
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    ca-certificates \
+    curl \
+    gfortran \
+    git \
+    gnupg \
+    libcurl4-openssl-dev \
+    libfontconfig1-dev \
+    libfreetype6-dev \
+    libfribidi-dev \
+    libgit2-dev \
+    libharfbuzz-dev \
+    libjpeg-dev \
+    liblapack-dev \
+    libopenblas-dev \
+    libpng-dev \
+    libpython3-dev \
+    libssl-dev \
+    libtiff5-dev \
+    libxml2-dev \
+    openjdk-17-jdk-headless \
+    pkg-config \
+    qpdf \
+    tzdata \
+    software-properties-common \
+    wget \
+    zlib1g-dev
+
+# Install Python 3.11
+RUN add-apt-repository ppa:deadsnakes/ppa
+RUN apt-get update && apt-get install -y \
+    python3.11 \
+    && apt-get autoremove --purge -y \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+
+ARG BASIC_PIP_PKGS="numpy pyarrow>=19.0.0 pandas==2.2.3 plotly<6.0.0 matplotlib openpyxl memory-profiler>=0.61.0 mlflow>=2.8.1 scipy scikit-learn>=1.3.2"
+ARG TEST_PIP_PKGS="coverage unittest-xml-reporting"
+
+# Install Python 3.11 packages
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
+RUN python3.11 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs this
+RUN python3.11 -m pip install $BASIC_PIP_PKGS $TEST_PIP_PKGS && \
+    python3.11 -m pip install 'torch<2.6.0' torchvision --index-url https://download.pytorch.org/whl/cpu && \
+    python3.11 -m pip install deepspeed torcheval && \
+    python3.11 -m pip cache purge
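
One observation, inferred from the PR description rather than stated in the diff itself: the pip package list above contains no Spark Connect dependencies (no grpcio, grpcio-status, or protobuf), so jobs running on this image exercise PySpark Classic with connect-related packages genuinely absent, which is exactly what the new schedule is meant to verify.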

python/pyspark/ml/connect/functions.py

Lines changed: 6 additions & 15 deletions
@@ -58,25 +58,16 @@ def _test() -> None:
         print("Not supported in no-GIL mode", file=sys.stderr)
         sys.exit(0)
 
+    from pyspark.testing import should_test_connect
+
+    if not should_test_connect:
+        print(f"Skipping pyspark.ml.connect.functions doctests", file=sys.stderr)
+        sys.exit(0)
+
     import doctest
     from pyspark.sql import SparkSession as PySparkSession
     import pyspark.ml.connect.functions
 
-    from pyspark.sql.pandas.utils import (
-        require_minimum_pandas_version,
-        require_minimum_pyarrow_version,
-    )
-
-    try:
-        require_minimum_pandas_version()
-        require_minimum_pyarrow_version()
-    except Exception as e:
-        print(
-            f"Skipping pyspark.ml.functions doctests: {e}",
-            file=sys.stderr,
-        )
-        sys.exit(0)
-
     globs = pyspark.ml.connect.functions.__dict__.copy()
 
     globs["spark"] = (
python/pyspark/sql/connect/tvf.py

Lines changed: 6 additions & 0 deletions
@@ -127,6 +127,12 @@ def _test() -> None:
         print("Not supported in no-GIL mode", file=sys.stderr)
         sys.exit(0)
 
+    from pyspark.testing import should_test_connect
+
+    if not should_test_connect:
+        print(f"Skipping pyspark.sql.connect.tvf doctests", file=sys.stderr)
+        sys.exit(0)
+
     import doctest
     from pyspark.sql import SparkSession as PySparkSession
     import pyspark.sql.connect.tvf