Skip to content

Commit 596cf76

Browse files
authored
build(intel): bundle intel variants in single-binary (#2494)
* wip: try to build also intel variants Signed-off-by: Ettore Di Giacinto <[email protected]> * Add dependencies * Select automatically intel backend --------- Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent a293aa1 commit 596cf76

File tree

3 files changed

+55
-5
lines changed

3 files changed

+55
-5
lines changed

.github/workflows/release.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,12 @@ jobs:
7070
run: |
7171
sudo apt-get update
7272
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
73+
- name: Intel Dependencies
74+
run: |
75+
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
76+
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
77+
sudo apt update
78+
sudo apt install -y intel-basekit
7379
- name: Install CUDA Dependencies
7480
run: |
7581
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -127,6 +133,7 @@ jobs:
127133
export PATH=$PATH:$GOPATH/bin
128134
export PATH=/usr/local/cuda/bin:$PATH
129135
export PATH=/opt/rocm/bin:$PATH
136+
source /opt/intel/oneapi/setvars.sh
130137
GO_TAGS=p2p make dist
131138
- uses: actions/upload-artifact@v4
132139
with:

Makefile

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,8 @@ ifeq ($(OS),Darwin)
328328
else
329329
$(MAKE) backend-assets/grpc/llama-cpp-cuda
330330
$(MAKE) backend-assets/grpc/llama-cpp-hipblas
331+
$(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
332+
$(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
331333
endif
332334
$(MAKE) build
333335
mkdir -p release
@@ -720,6 +722,20 @@ backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
720722
BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
721723
cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
722724

725+
backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc
726+
cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
727+
$(MAKE) -C backend/cpp/llama-sycl_f16 purge
728+
$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
729+
BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
730+
cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16
731+
732+
backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc
733+
cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
734+
$(MAKE) -C backend/cpp/llama-sycl_f32 purge
735+
$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
736+
BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
737+
cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32
738+
723739
backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
724740
cp -rf backend/cpp/llama backend/cpp/llama-grpc
725741
$(MAKE) -C backend/cpp/llama-grpc purge

pkg/model/initializers.go

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,10 @@ const (
3838
LLamaCPPFallback = "llama-cpp-fallback"
3939
LLamaCPPCUDA = "llama-cpp-cuda"
4040
LLamaCPPHipblas = "llama-cpp-hipblas"
41-
LLamaCPPGRPC = "llama-cpp-grpc"
41+
LLamaCPPSycl16 = "llama-cpp-sycl_f16"
42+
LLamaCPPSycl32 = "llama-cpp-sycl_f32"
43+
44+
LLamaCPPGRPC = "llama-cpp-grpc"
4245

4346
Gpt4AllLlamaBackend = "gpt4all-llama"
4447
Gpt4AllMptBackend = "gpt4all-mpt"
@@ -94,7 +97,7 @@ ENTRY:
9497
if autoDetect {
9598
// if we find the llama.cpp variants, show them off as a single backend (llama-cpp) as later we are going to pick that up
9699
// when starting the service
97-
foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas := false, false, false, false, false, false
100+
foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
98101
if _, ok := backends[LLamaCPP]; !ok {
99102
for _, e := range entry {
100103
if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
@@ -121,6 +124,14 @@ ENTRY:
121124
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPHipblas)
122125
foundLCPPHipblas = true
123126
}
127+
if strings.Contains(e.Name(), LLamaCPPSycl16) && !foundSycl16 {
128+
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl16)
129+
foundSycl16 = true
130+
}
131+
if strings.Contains(e.Name(), LLamaCPPSycl32) && !foundSycl32 {
132+
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl32)
133+
foundSycl32 = true
134+
}
124135
}
125136
}
126137
}
@@ -172,9 +183,10 @@ ENTRY:
172183
}
173184

174185
// selectGRPCProcess selects the GRPC process to start based on system capabilities
175-
func selectGRPCProcess(backend, assetDir string) string {
186+
func selectGRPCProcess(backend, assetDir string, f16 bool) string {
176187
foundCUDA := false
177188
foundAMDGPU := false
189+
foundIntelGPU := false
178190
var grpcProcess string
179191

180192
// Select backend now just for llama.cpp
@@ -211,10 +223,24 @@ func selectGRPCProcess(backend, assetDir string) string {
211223
log.Info().Msgf("GPU device found but no HIPBLAS backend present")
212224
}
213225
}
226+
if strings.Contains(gpu.String(), "intel") {
227+
backend := LLamaCPPSycl16
228+
if !f16 {
229+
backend = LLamaCPPSycl32
230+
}
231+
p := backendPath(assetDir, backend)
232+
if _, err := os.Stat(p); err == nil {
233+
log.Info().Msgf("[%s] attempting to load with Intel variant", backend)
234+
grpcProcess = p
235+
foundIntelGPU = true
236+
} else {
237+
log.Info().Msgf("GPU device found but no Intel backend present")
238+
}
239+
}
214240
}
215241
}
216242

217-
if foundCUDA || foundAMDGPU {
243+
if foundCUDA || foundAMDGPU || foundIntelGPU {
218244
return grpcProcess
219245
}
220246

@@ -236,6 +262,7 @@ func selectGRPCProcess(backend, assetDir string) string {
236262
// It also loads the model
237263
func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) {
238264
return func(modelName, modelFile string) (ModelAddress, error) {
265+
239266
log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelName, modelFile, backend, *o)
240267

241268
var client ModelAddress
@@ -284,7 +311,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
284311

285312
if autoDetect {
286313
// autoDetect GRPC process to start based on system capabilities
287-
if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" {
314+
if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
288315
grpcProcess = selectedProcess
289316
}
290317
}

0 commit comments

Comments
 (0)