Commit 6cfd883: initial commit (0 parents)

39 files changed: 12,445 lines added, 0 removed

.gitignore (46 additions, 0 deletions)

# Extensions

*.a
*.bat
*.bin
*.dll
*.dot
*.etag
*.exe
*.gcda
*.gcno
*.gcov
*.gguf
*.gguf.json
*.lastModified
*.log
*.metallib
*.o
*.so
*.tmp

# IDE / OS

.cache/
.ccls-cache/
.direnv/
.DS_Store
.envrc
.idea/
.swiftpm
.vs/
.vscode/
nppBackup

# Models
models/*

# Python

/.venv
__pycache__/
*/poetry.lock
poetry.toml

build/
logs/

.gitmodules (4 additions, 0 deletions)

[submodule "3rdparty/llama.cpp"]
    path = 3rdparty/llama.cpp
    url = https://github.com/Eddie-Wang1120/llama.cpp.git
    branch = merge-dev
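
Because llama.cpp is vendored as a git submodule, its sources only appear once the submodule is initialized. Cloning with `--recursive` (as the README below shows) or initializing after the fact both work; this is a generic git sketch, not a project-specific script:

```bash
# Either clone with submodules in one step...
git clone --recursive https://github.com/microsoft/BitNet.git

# ...or fetch the 3rdparty/llama.cpp submodule in an existing checkout.
git submodule update --init --recursive
```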

3rdparty/llama.cpp

Submodule llama.cpp added at 5371710

CMakeLists.txt (73 additions, 0 deletions)

cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("bitnet.cpp" C CXX)
include(CheckIncludeFileCXX)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

# option list
option(BITNET_ARM_TL1 "bitnet.cpp: use tl1 on arm platform" OFF)
option(BITNET_X86_TL2 "bitnet.cpp: use tl2 on x86 platform" OFF)

set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)

# override ggml options
set(GGML_BITNET_ARM_TL1 ${BITNET_ARM_TL1})
set(GGML_BITNET_X86_TL2 ${BITNET_X86_TL2})

if (GGML_BITNET_ARM_TL1)
    add_compile_definitions(GGML_BITNET_ARM_TL1)
endif()
if (GGML_BITNET_X86_TL2)
    add_compile_definitions(GGML_BITNET_X86_TL2)
endif()

find_package(Threads REQUIRED)

add_subdirectory(src)
add_subdirectory(3rdparty/llama.cpp)

# install

include(GNUInstallDirs)
include(CMakePackageConfigHelpers)

set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}
    CACHE PATH "Location of header files")
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}
    CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
    CACHE PATH "Location of binary files")
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})

get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)

write_basic_package_version_file(
    ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
    VERSION ${LLAMA_INSTALL_VERSION}
    COMPATIBILITY SameMajorVersion)

install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
    ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)

set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
install(TARGETS llama LIBRARY PUBLIC_HEADER)
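
For illustration only: the supported build path is the `setup_env.py` script described in the README below, which downloads a model and builds the project. A manual configure/build using the kernel options declared in this CMakeLists.txt might look like the following sketch, assuming clang/clang++ are on PATH:

```bash
# Hypothetical manual build sketch; setup_env.py is the supported path.
# BITNET_X86_TL2 / BITNET_ARM_TL1 are the options declared above; pick the
# one that matches your CPU architecture.
cmake -B build \
      -DCMAKE_C_COMPILER=clang \
      -DCMAKE_CXX_COMPILER=clang++ \
      -DBITNET_X86_TL2=ON   # or -DBITNET_ARM_TL1=ON on ARM
cmake --build build --config Release
```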

CODE_OF_CONDUCT.md (9 additions, 0 deletions)

# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [[email protected]](mailto:[email protected]) with questions or concerns

LICENSE (21 additions, 0 deletions)

MIT License

Copyright (c) Microsoft Corporation.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md (228 additions, 0 deletions)

# bitnet.cpp
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
![version](https://img.shields.io/badge/version-1.0-blue)

bitnet.cpp is the official inference framework for BitNet models (e.g., BitNet b1.58), optimized for CPU devices. It offers a suite of optimized kernels that support lossless inference of 1.58-bit models on both x86 and ARM architectures.

## Demo

A demo of bitnet.cpp running a BitNet b1.58 3B model on Apple M2:

https://github.com/user-attachments/assets/7f46b736-edec-4828-b809-4be780a3e5b1

## Timeline

- 10/17/2024 bitnet.cpp 1.0 released.
- 02/27/2024 [The Era of 1-bit LLMs: All Large Language Models are in 1.58 Bits](https://arxiv.org/abs/2402.17764)
- 10/17/2023 [BitNet: Scaling 1-bit Transformers for Large Language Models](https://arxiv.org/abs/2310.11453)

## Supported Models

bitnet.cpp supports the following 1-bit models available on [Hugging Face](https://huggingface.co/):

<table>
  <tr>
    <th rowspan="2">Model</th>
    <th rowspan="2">Parameters</th>
    <th rowspan="2">CPU</th>
    <th colspan="3">Kernel</th>
  </tr>
  <tr>
    <th>I2_S</th>
    <th>TL1</th>
    <th>TL2</th>
  </tr>
  <tr>
    <td rowspan="2"><a href="https://huggingface.co/1bitLLM/bitnet_b1_58-large">bitnet_b1_58-large</a></td>
    <td rowspan="2">0.7B</td>
    <td>x86</td>
    <td>&#10004;</td>
    <td>&#10008;</td>
    <td>&#10004;</td>
  </tr>
  <tr>
    <td>ARM</td>
    <td>&#10004;</td>
    <td>&#10004;</td>
    <td>&#10008;</td>
  </tr>
  <tr>
    <td rowspan="2"><a href="https://huggingface.co/1bitLLM/bitnet_b1_58-3B">bitnet_b1_58-3B</a></td>
    <td rowspan="2">3.3B</td>
    <td>x86</td>
    <td>&#10008;</td>
    <td>&#10008;</td>
    <td>&#10004;</td>
  </tr>
  <tr>
    <td>ARM</td>
    <td>&#10008;</td>
    <td>&#10004;</td>
    <td>&#10008;</td>
  </tr>
  <tr>
    <td rowspan="2"><a href="https://huggingface.co/HF1BitLLM/Llama3-8B-1.58-100B-tokens">Llama3-8B-1.58-100B-tokens</a></td>
    <td rowspan="2">8.0B</td>
    <td>x86</td>
    <td>&#10004;</td>
    <td>&#10008;</td>
    <td>&#10004;</td>
  </tr>
  <tr>
    <td>ARM</td>
    <td>&#10004;</td>
    <td>&#10004;</td>
    <td>&#10008;</td>
  </tr>
</table>

## Installation

### Requirements
- python>=3.9
- cmake>=3.22
- clang>=18
- For Windows users, install [Visual Studio 2022](https://visualstudio.microsoft.com/downloads/). In the installer, enable at least the following options (this also automatically installs the required additional tools like CMake):
  - Desktop development with C++
  - C++ CMake Tools for Windows
  - Git for Windows
  - C++ Clang Compiler for Windows
  - MS-Build Support for LLVM-Toolset (clang)
- For Debian/Ubuntu users, you can install clang with the [automatic installation script](https://apt.llvm.org/):

  `bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)"`
- conda (highly recommended)

### Build from source

> [!IMPORTANT]
> If you are using Windows, please remember to always use a Developer Command Prompt / PowerShell for VS2022 for the following commands.

1. Clone the repo
   ```bash
   git clone --recursive https://github.com/microsoft/BitNet.git
   cd BitNet
   ```
2. Install the dependencies
   ```bash
   # (Recommended) Create a new conda environment
   conda create -n bitnet-cpp python=3.9
   conda activate bitnet-cpp

   pip install -r requirements.txt
   ```
3. Build the project
   ```bash
   # Download the model from Hugging Face, convert it to quantized gguf format, and build the project
   python setup_env.py --hf-repo HF1BitLLM/Llama3-8B-1.58-100B-tokens -q i2_s

   # Or you can manually download the model and run with local path
   huggingface-cli download HF1BitLLM/Llama3-8B-1.58-100B-tokens --local-dir models/Llama3-8B-1.58-100B-tokens
   python setup_env.py -md models/Llama3-8B-1.58-100B-tokens -q i2_s
   ```
<pre>
usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
                    [--use-pretuned]

Setup the environment for running inference

optional arguments:
  -h, --help            show this help message and exit
  --hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}, -hr {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens}
                        Model used for inference
  --model-dir MODEL_DIR, -md MODEL_DIR
                        Directory to save/load the model
  --log-dir LOG_DIR, -ld LOG_DIR
                        Directory to save the logging info
  --quant-type {i2_s,tl1}, -q {i2_s,tl1}
                        Quantization type
  --quant-embd          Quantize the embeddings to f16
  --use-pretuned, -p    Use the pretuned kernel parameters
</pre>
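
For instance, assembling the flags listed above (a sketch, not an officially documented recipe), the TL1 kernel path for an ARM CPU could be prepared with the smaller model and the pretuned kernel parameters:

```bash
# Example only: build the TL1 kernels and quantize a smaller model for an ARM CPU
python setup_env.py --hf-repo 1bitLLM/bitnet_b1_58-large -q tl1 --use-pretuned
```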

## Usage
### Basic usage
```bash
# Run inference with the quantized model
python run_inference.py -m models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf -p "Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:" -n 6 -temp 0

# Output:
# Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?
# Answer: Mary is in the garden.

```
<pre>
usage: run_inference.py [-h] [-m MODEL] [-n N_PREDICT] -p PROMPT [-t THREADS] [-c CTX_SIZE] [-temp TEMPERATURE]

Run inference

optional arguments:
  -h, --help            show this help message and exit
  -m MODEL, --model MODEL
                        Path to model file
  -n N_PREDICT, --n-predict N_PREDICT
                        Number of tokens to predict when generating text
  -p PROMPT, --prompt PROMPT
                        Prompt to generate text from
  -t THREADS, --threads THREADS
                        Number of threads to use
  -c CTX_SIZE, --ctx-size CTX_SIZE
                        Size of the prompt context
  -temp TEMPERATURE, --temperature TEMPERATURE
                        Temperature, a hyperparameter that controls the randomness of the generated text
</pre>
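
The remaining flags combine in the same way; the values below are chosen arbitrarily for illustration and the prompt is a placeholder:

```bash
# Illustrative only: more threads, a larger context window, and non-zero temperature
python run_inference.py -m models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf \
    -p "What is 1-bit LLM inference?\nAnswer:" -n 64 -t 8 -c 2048 -temp 0.7
```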

### Benchmark
We provide scripts to run the inference benchmark with a given model.

```
usage: e2e_benchmark.py -m MODEL [-n N_TOKEN] [-p N_PROMPT] [-t THREADS]

Setup the environment for running the inference

required arguments:
  -m MODEL, --model MODEL
                        Path to the model file.

optional arguments:
  -h, --help
                        Show this help message and exit.
  -n N_TOKEN, --n-token N_TOKEN
                        Number of generated tokens.
  -p N_PROMPT, --n-prompt N_PROMPT
                        Prompt to generate text from.
  -t THREADS, --threads THREADS
                        Number of threads to use.
```

Here's a brief explanation of each argument:

- `-m`, `--model`: The path to the model file. This is a required argument that must be provided when running the script.
- `-n`, `--n-token`: The number of tokens to generate during the inference. It is an optional argument with a default value of 128.
- `-p`, `--n-prompt`: The number of prompt tokens to use for generating text. This is an optional argument with a default value of 512.
- `-t`, `--threads`: The number of threads to use for running the inference. It is an optional argument with a default value of 2.
- `-h`, `--help`: Show the help message and exit. Use this argument to display usage information.

For example:

```sh
python utils/e2e_benchmark.py -m /path/to/model -n 200 -p 256 -t 4
```

This command would run the inference benchmark using the model located at `/path/to/model`, generating 200 tokens from a 256-token prompt, using 4 threads.

For model layouts that are not supported by any public model, we provide scripts to generate a dummy model with the given layout and run the benchmark on your machine:

```bash
python utils/generate-dummy-bitnet-model.py models/bitnet_b1_58-large --outfile models/dummy-bitnet-125m.tl1.gguf --outtype tl1 --model-size 125M

# Run benchmark with the generated model, use -m to specify the model path, -p to specify the prompt processed, -n to specify the number of tokens to generate
python utils/e2e_benchmark.py -m models/dummy-bitnet-125m.tl1.gguf -p 512 -n 128
```

## Acknowledgements

This project is based on the [llama.cpp](https://github.com/ggerganov/llama.cpp) framework. We would like to thank all the authors for their contributions to the open-source community. We also thank the [T-MAC](https://github.com/microsoft/T-MAC/) team for the helpful discussion on the LUT method for low-bit LLM inference.
