Skip to content

[Mullapudi2016 GPU] Replace or emplace outer dimensions for GPU schedules #8685

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 12 additions & 10 deletions apps/bgu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,23 @@ find_package(Halide REQUIRED)
# Generator
add_halide_generator(bgu.generator SOURCES bgu_generator.cpp)

# Common Mullapudi2016 autoscheduler parameters: enable the experimental
# GPU schedule for every target.
set(_bgu_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)

if(Halide_TARGET MATCHES "cuda|metal|opencl")
    # Set last_level_cache per GPU block to an extremely small value. This
    # eliminates all `.compute_at` in the generated schedules, which in turn
    # eliminates all GPU shared memory allocations.
    list(APPEND _bgu_autoscheduler_params
        autoscheduler.last_level_cache_size=2000
    )
endif()

# Filters
add_halide_library(bgu FROM bgu.generator)
add_halide_library(bgu_auto_schedule FROM bgu.generator
                   GENERATOR bgu
                   AUTOSCHEDULER Halide::Mullapudi2016
                   # BUG FIX: the list variable is `_bgu_autoscheduler_params`;
                   # the previous `${_bgu_autoscheduler}` was undefined and
                   # expanded to nothing, silently dropping every PARAMS value.
                   PARAMS ${_bgu_autoscheduler_params})

# Main executable
add_executable(bgu_filter filter.cpp)
Expand Down
20 changes: 19 additions & 1 deletion apps/camera_pipe/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,29 @@ add_halide_generator(camera_pipe.generator
SOURCES camera_pipe_generator.cpp
LINK_LIBRARIES Halide::Tools)

# Common Mullapudi2016 autoscheduler parameters: enable the experimental
# GPU schedule for every target.
set(_camera_pipe_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)

if(Halide_TARGET MATCHES "cuda|metal")
    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
    # tuned to pass the Buildbot tests.
    list(APPEND _camera_pipe_autoscheduler_params
        autoscheduler.last_level_cache_size=10000
    )
elseif(Halide_TARGET MATCHES "opencl|vulkan")
    # Set last_level_cache per GPU block to an extremely small value. This
    # eliminates all `.compute_at` in the generated schedules, which in turn
    # eliminates all GPU shared memory allocations.
    list(APPEND _camera_pipe_autoscheduler_params
        autoscheduler.last_level_cache_size=1000
    )
endif()

# Filters
add_halide_library(camera_pipe FROM camera_pipe.generator)
add_halide_library(camera_pipe_auto_schedule FROM camera_pipe.generator
                   GENERATOR camera_pipe
                   AUTOSCHEDULER Halide::Mullapudi2016
                   PARAMS ${_camera_pipe_autoscheduler_params})

# Main executable
add_executable(camera_pipe_process process.cpp)
Expand Down
13 changes: 12 additions & 1 deletion apps/harris/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,23 @@ find_package(Halide REQUIRED)
# Generator
add_halide_generator(harris.generator SOURCES harris_generator.cpp)

# Common Mullapudi2016 autoscheduler parameters: enable the experimental
# GPU schedule for every target.
set(_harris_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)

if(Halide_TARGET MATCHES "opencl|metal|cuda|vulkan")
    # Set last_level_cache per GPU block to an extremely small value. This
    # eliminates all `.compute_at` in the generated schedules, which in turn
    # eliminates all GPU shared memory allocations.
    list(APPEND _harris_autoscheduler_params
        autoscheduler.last_level_cache_size=1000
    )
endif()

# Filters
add_halide_library(harris FROM harris.generator)
add_halide_library(harris_auto_schedule FROM harris.generator
                   GENERATOR harris
                   AUTOSCHEDULER Halide::Mullapudi2016
                   PARAMS ${_harris_autoscheduler_params})

# Main executable
add_executable(harris_filter filter.cpp)
Expand Down
2 changes: 1 addition & 1 deletion apps/harris/harris_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class Harris : public Halide::Generator<Harris> {
const int kHeight = 2560;
input.dim(0).set_estimate(0, kWidth);
input.dim(1).set_estimate(0, kHeight);
input.dim(2).set_estimate(0, 3);
input.dim(2).set_estimate(0, 4);
output.dim(0).set_estimate(3, kWidth - 6);
output.dim(1).set_estimate(3, kHeight - 6);
}
Expand Down
21 changes: 7 additions & 14 deletions apps/iir_blur/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,11 @@ target_link_libraries(iir_blur_filter PRIVATE
# Test that the app actually works!
set(IMAGE ${CMAKE_CURRENT_LIST_DIR}/../images/rgb.png)
if (EXISTS ${IMAGE})
    # The runtime skip for problematic GPU targets now lives in filter.cpp
    # (it prints "[SKIP]", matched below), so the test is registered
    # unconditionally here.
    configure_file(${IMAGE} rgb.png COPYONLY)
    add_test(NAME iir_blur_filter
             COMMAND iir_blur_filter rgb.png out.png)
    set_tests_properties(iir_blur_filter PROPERTIES
                         LABELS iir_blur
                         PASS_REGULAR_EXPRESSION "Success!"
                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
endif ()
44 changes: 43 additions & 1 deletion apps/iir_blur/filter.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <regex>

#include "HalideBuffer.h"
#include "HalideRuntime.h"
Expand All @@ -13,12 +13,54 @@

using namespace Halide::Tools;

namespace {

// Tri-state result of the runtime device screen below.
enum DeviceState {
    USING_METAL_OR_OPENCL,  // target is Metal or OpenCL: test must be skipped
    NOT_METAL_OR_OPENCL,    // any other target (e.g. CUDA, CPU): safe to run
    METADATA_ABSENT,        // generated metadata carries no target string
};

// Inspects the Halide target string embedded in the generated pipeline
// metadata and decides whether the test should be skipped.
//
// NOTE(review): despite the name, this does not verify that a CUDA device is
// present — it only screens OUT Metal/OpenCL targets via a regex on the
// metadata target string. Consider renaming (e.g. `screen_gpu_target()`) in a
// follow-up; the name is kept here to avoid touching the caller.
DeviceState ensure_cuda_device() {
    const auto hl_target = iir_blur_auto_schedule_metadata()->target;
    if (hl_target == nullptr) {
        // No target metadata compiled in; proceed optimistically.
        printf("Warning: variable *_metadata()->target not specified. "
               "Proceeding to the tests...\n");
        return METADATA_ABSENT;
    }

    if (std::regex_search(hl_target, std::regex{"metal|opencl"})) {
        // note(antonysigma): Error messages if we don't skip the test:
        //
        // OpenCL error: clFinish timeout.
        //
        // Metal: copy_to_host() failed. Error
        // Domain=MTLCommandBufferErrorDomain Code=2 "Caused GPU Timeout Error
        // (00000002:kIOAccelCommandBufferCallbackErrorTimeout)"
        // UserInfo={NSLocalizedDescription=Caused GPU Timeout Error
        // (00000002:kIOAccelCommandBufferCallbackErrorTimeout)}
        //
        // The "[SKIP]" prefix is matched by the CTest
        // SKIP_REGULAR_EXPRESSION property so CTest reports the test as
        // skipped rather than failed.
        printf("[SKIP] Mullapudi2016 experimental GPU schedule "
               "generates copy_to_host() function calls that timeout. "
               "Target = %s. Skipping...\n",
               hl_target);

        return USING_METAL_OR_OPENCL;
    }

    return NOT_METAL_OR_OPENCL;
}

}  // namespace

int main(int argc, char **argv) {
if (argc != 3) {
printf("Usage: %s in out\n", argv[0]);
return 1;
}

if (ensure_cuda_device() == USING_METAL_OR_OPENCL) {
return 0;
}

Halide::Runtime::Buffer<float, 3> input = load_and_convert_image(argv[1]);
Halide::Runtime::Buffer<float, 3> output(input.width(), input.height(), input.channels());

Expand Down
48 changes: 25 additions & 23 deletions apps/lens_blur/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,29 @@ find_package(Halide REQUIRED)
# Generator
add_halide_generator(lens_blur.generator SOURCES lens_blur_generator.cpp)

# Common Mullapudi2016 autoscheduler parameters: enable the experimental
# GPU schedule for every target.
set(_lens_blur_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)

if(Halide_TARGET MATCHES "cuda|metal")
    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
    # tuned to pass the Buildbot tests.
    list(APPEND _lens_blur_autoscheduler_params
        autoscheduler.last_level_cache_size=10000
    )
elseif(Halide_TARGET MATCHES "opencl")
    # Set last_level_cache per GPU block to an extremely small value. This
    # eliminates all `.compute_at` in the generated schedules, which in turn
    # eliminates all GPU shared memory allocations.
    list(APPEND _lens_blur_autoscheduler_params
        autoscheduler.last_level_cache_size=1000
    )
endif()

# Filters
add_halide_library(lens_blur FROM lens_blur.generator)
add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                   GENERATOR lens_blur
                   AUTOSCHEDULER Halide::Mullapudi2016
                   PARAMS ${_lens_blur_autoscheduler_params})

# Main executable
add_executable(lens_blur_filter process.cpp)
Expand All @@ -32,26 +49,11 @@ target_link_libraries(lens_blur_filter
# Test that the app actually works!
set(IMAGE ${CMAKE_CURRENT_LIST_DIR}/../images/rgb_small.png)
if (EXISTS ${IMAGE})
    # The Metal-specific skip was removed: the test is registered for every
    # target, and runtime skips are reported via the "[SKIP]" marker matched
    # by SKIP_REGULAR_EXPRESSION below.
    configure_file(${IMAGE} rgb_small.png COPYONLY)
    add_test(NAME lens_blur_filter
             COMMAND lens_blur_filter rgb_small.png 32 13 0.5 32 3 out.png)
    set_tests_properties(lens_blur_filter PROPERTIES
                         LABELS lens_blur
                         PASS_REGULAR_EXPRESSION "Success!"
                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
endif ()
21 changes: 17 additions & 4 deletions apps/local_laplacian/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,23 @@ add_halide_library(local_laplacian FROM local_laplacian.generator)
add_halide_library(local_laplacian_auto_schedule FROM local_laplacian.generator
                   GENERATOR local_laplacian
                   AUTOSCHEDULER Halide::Mullapudi2016
                   # note(antonysigma): Works on CUDA and CPU targets, but not
                   # others. Error messages if we don't skip the test:
                   #
                   # OpenCL error: CL_INVALID_WORK_GROUP_SIZE
                   # clEnqueueNDRangeKernel failed
                   #
                   # 2025-07-17 17:24:32.170
                   # local_laplacian_process[63513:6587844] Metal API Validation
                   # Enabled -[MTLDebugComputeCommandEncoder
                   # _validateThreadsPerThreadgroup:]:1266: failed assertion
                   # `(threadsPerThreadgroup.width(62) *
                   # threadsPerThreadgroup.height(32)
                   # * threadsPerThreadgroup.depth(1))(1984) must be <= 1024.
                   # (device threadgroup size limit)'
                   #
                   # Vulkan: vkQueueWaitIdle returned VK_ERROR_DEVICE_LOST
                   PARAMS autoscheduler.experimental_gpu_schedule=0)

# Main executable
add_executable(local_laplacian_process process.cpp)
Expand Down
15 changes: 14 additions & 1 deletion apps/stencil_chain/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,24 @@ find_package(Halide REQUIRED)
# Generator
add_halide_generator(stencil_chain.generator SOURCES stencil_chain_generator.cpp)

# Common Mullapudi2016 autoscheduler parameters: enable the experimental
# GPU schedule for every target.
set(_stencil_chain_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)

if(Halide_TARGET MATCHES "cuda|metal|opencl")
    # Set last_level_cache per GPU block to an extremely small value. This
    # eliminates all `.compute_at` in the generated schedules, which in turn
    # eliminates all GPU shared memory allocations.
    list(APPEND _stencil_chain_autoscheduler_params
        autoscheduler.last_level_cache_size=2000
    )
endif()

# Filters
add_halide_library(stencil_chain FROM stencil_chain.generator)
add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                   GENERATOR stencil_chain
                   AUTOSCHEDULER Halide::Mullapudi2016
                   # BUG FIX: was `${_stenctil_chain_autoscheduler_params}`
                   # ("stenctil" misspelling) — an undefined variable that
                   # expanded to nothing, silently dropping every PARAMS value.
                   PARAMS ${_stencil_chain_autoscheduler_params}
)

# Main executable
add_executable(stencil_chain_process process.cpp)
Expand Down
22 changes: 21 additions & 1 deletion apps/unsharp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,31 @@ find_package(Halide REQUIRED)
# Generator
add_halide_generator(unsharp.generator SOURCES unsharp_generator.cpp)

# Common Mullapudi2016 autoscheduler parameters: enable the experimental
# GPU schedule for every target.
set(_unsharp_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)

if(Halide_TARGET MATCHES "cuda|opencl|vulkan")
    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
    # tuned to pass the Buildbot tests.
    list(APPEND _unsharp_autoscheduler_params
        autoscheduler.last_level_cache_size=20000
    )
elseif(Halide_TARGET MATCHES "metal")
    # Resolving Metal error regarding the threads per GPU block limit:
    #
    # -[MTLDebugComputeCommandEncoder _validateThreadsPerThreadgroup:]:1267:
    # failed assertion `(threadsPerThreadgroup.width(70) *
    # threadsPerThreadgroup.height(8) * threadsPerThreadgroup.depth(1))(560)
    # must be <= 448. (kernel threadgroup size limit)`
    list(APPEND _unsharp_autoscheduler_params
        autoscheduler.last_level_cache_size=1000)
endif()

# Filters
add_halide_library(unsharp FROM unsharp.generator)
add_halide_library(unsharp_auto_schedule FROM unsharp.generator
                   GENERATOR unsharp
                   AUTOSCHEDULER Halide::Mullapudi2016
                   PARAMS ${_unsharp_autoscheduler_params})

# Main executable
add_executable(unsharp_filter filter.cpp)
Expand Down
Loading
Loading