Commit db4ffe0

Release of MoE related code.

Co-authored-by: Shulai Zhang <[email protected]>
Co-authored-by: Ningxin Zheng <[email protected]>
Co-authored-by: Chengquan Jiang <[email protected]>
Co-authored-by: Wenlei Bao <[email protected]>
Co-authored-by: Qi Hou <[email protected]>
Co-authored-by: Ziheng Jiang <[email protected]>
Co-authored-by: Xin Liu <[email protected]>
Co-authored-by: Liwen Chang <[email protected]>
Co-authored-by: Haibin Lin <[email protected]>

1 parent 7f98c8a commit db4ffe0

File tree: 354 files changed, +62845 / -9804 lines changed


.clang-format

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+---
+Language: Cpp
+BasedOnStyle: Google
+IndentWidth: 2
+TabWidth: 2
+ColumnLimit: 99
+ContinuationIndentWidth: 4
+AccessModifierOffset: -1 # The private/protected/public has no indent in class
+Standard: c++17
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: true
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AllowAllParametersOfDeclarationOnNextLine: true
+BinPackParameters: false
+BinPackArguments: false
+AlignAfterOpenBracket: AlwaysBreak
+AlwaysBreakTemplateDeclarations: true
+AlwaysBreakAfterDefinitionReturnType: All
+DerivePointerAlignment: false
+PointerAlignment: Right
+
+# clang-format 3.9+
+SortIncludes: false
+ReflowComments: true
+...
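For reference, the new style can be applied locally with clang-format picking this file up via `--style=file`; the sketch below assumes clang-format is installed and that C++/CUDA sources live under `src/` and `include/` (paths are illustrative, not dictated by this commit).

```bash
# Format C++/CUDA sources in place using the repository's .clang-format;
# --style=file makes clang-format search upward from each file for this config.
find src include \( -name '*.cc' -o -name '*.h' -o -name '*.cu' \) -print0 \
  | xargs -0 clang-format -i --style=file
```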

.gitignore

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+# PyCache files
+build/
+.cache/
+tmp/
+report*.sqlite
+report*.nsys-rep
+
+# run files
+log/
+prof/
+workspace/
+
+# general things to ignore
+dist/
+*.egg-info/
+.eggs/
+*.egg
+*.py[cod]
+__pycache__/
+*.so
+*.so.*
+*~
+python/flux/version.py
+
+# due to using tox and pytest and clangd
+.tox
+
+# 3rdparty
+/3rdparty/nvshmem/

.gitmodules

Lines changed: 3 additions & 3 deletions

@@ -1,6 +1,6 @@
-[submodule "3rdparty/cutlass"]
-  path = 3rdparty/cutlass
-  url = https://github.com/NVIDIA/cutlass
 [submodule "3rdparty/nccl"]
   path = 3rdparty/nccl
   url = https://github.com/NVIDIA/nccl
+[submodule "3rdparty/cutlass"]
+  path = 3rdparty/cutlass
+  url = https://github.com/NVIDIA/cutlass
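With this reordering, NCCL and CUTLASS remain ordinary submodules, while NVSHMEM (ignored via `.gitignore` above) is supplied separately. A standard checkout, matching the commands in the updated README below, looks like:

```bash
# Clone with submodules, or initialize them inside an existing checkout.
# Note that 3rdparty/nvshmem is not a submodule and must be provided
# separately (see install_deps.sh and the README's dependency notes).
git clone --recursive https://github.com/bytedance/flux.git && cd flux
# or, in an existing clone:
git submodule update --init --recursive
```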

3rdparty/cutlass3.7.patch

Lines changed: 2089 additions & 0 deletions

Large diff not rendered.

CMakeLists.txt

Lines changed: 15 additions & 1 deletion

@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
-project(FLUX LANGUAGES CXX CUDA)
+project(FLUX LANGUAGES C CXX CUDA)
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/")
 
@@ -15,6 +15,8 @@ message("PYTHONPATH: ${PYTHONPATH}")
 message("NVShmem Support: ${ENABLE_NVSHMEM}")
 
 # find cuda
+# specify cuda path if other than default
+# set(CUDA_TOOLKIT_ROOT_DIR /path/to/installed/cuda)
 find_package(CUDAToolkit REQUIRED)
 
 message(STATUS "CUDAToolkit_VERSION: ${CUDAToolkit_VERSION}")
@@ -102,6 +104,10 @@ print(os.path.dirname(torch.__file__),end='');"
 find_package(Torch REQUIRED)
 find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_DIR}/lib")
 
+if(TORCH_CXX_FLAGS)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
+endif()
+
 execute_process(COMMAND ${PYTHON_EXECUTABLE} "-c" "from __future__ import print_function; from distutils import sysconfig;
 print(sysconfig.get_python_inc());
 print(sysconfig.get_config_var('EXT_SUFFIX'));"
@@ -172,4 +178,12 @@ link_directories(
   ${COMMON_LIB_DIRS}
 )
 
+if (WITH_PROTOBUF)
+  FIND_PACKAGE(Protobuf REQUIRED)
+  add_subdirectory(proto)
+endif()
+
 add_subdirectory(src)
+if (BUILD_TEST)
+  add_subdirectory(test)
+endif()
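The new `WITH_PROTOBUF` and `BUILD_TEST` guards, the optional `CUDA_TOOLKIT_ROOT_DIR` hint, and the existing `ENABLE_NVSHMEM` message suggest a direct configure step roughly like the sketch below. `build.sh` is the supported entry point, so treat these cache variables and their values as illustrative assumptions rather than the documented invocation.

```bash
# Hypothetical direct CMake configure exercising the options seen in this diff.
cmake -S . -B build \
  -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda \
  -DENABLE_NVSHMEM=ON \
  -DWITH_PROTOBUF=OFF \
  -DBUILD_TEST=ON
cmake --build build -j
```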

MANIFEST.in

Lines changed: 3 additions & 5 deletions

@@ -1,5 +1,3 @@
-include src/ths_op/*.cc.inc
-exclude pynvshmem/
-recursive-include src *
-recursive-include include *
-recursive-include python/flux_ths_pybind
+global-exclude *.so*
+recursive-include python/flux/include *
+recursive-include python/flux/share *
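A quick way to sanity-check these packaging rules is to build a source distribution and inspect it; the `build` package used here is standard PyPA tooling, not something added by this commit.

```bash
# Build an sdist and list its contents to confirm that python/flux/include
# and python/flux/share are included while compiled *.so* files are excluded.
pip install build
python -m build --sdist
tar tzf dist/*.tar.gz
```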

NOTICE

Lines changed: 48 additions & 48 deletions

Large diff not rendered.

README.md

Lines changed: 77 additions & 55 deletions

@@ -1,85 +1,96 @@
 # Flux
 
-Flux is a fast communication-overlapping library for tensor parallelism on GPUs.
+Flux is a communication-overlapping library for dense/MoE models on GPUs, providing high-performance and pluggable kernels to support various parallelisms in model training/inference.
 
+Flux's efficient kernels are compatible with PyTorch and can be integrated into existing frameworks easily, supporting various Nvidia GPU architectures and data types.
 
-## Why Flux
+You are welcome to join the [WeChat](https://github.com/bytedance/flux/blob/main/docs/assets/comet_wechat_group.JPG) group and stay tuned!
 
-Flux can significantly reduce latency and increase throughput for tensor parallelism for both inference and training.
+## Getting started
+Install Flux either from source or from PyPI.
 
-## Install from pip
-```
-# Make sure that PyTorch is installed.
-pip install packaging
-pip install byte-flux
+### Install from Source
+```bash
+git clone --recursive https://github.com/bytedance/flux.git && cd flux
+
+# Install dependencies
+bash ./install_deps.sh
+
+# For Ampere(sm80) GPU
+./build.sh --arch 80 --nvshmem
+# For Ada Lovelace(sm89) GPU
+./build.sh --arch 89 --nvshmem
+# For Hopper(sm90) GPU
+./build.sh --arch 90 --nvshmem
 ```
 
-## Build from source
+#### Install in a virtual environment
+Here is a snippet to install Flux in a virtual environment with CUDA 12.4, torch 2.6.0, and Python 3.11.
+
 ```bash
-git clone https://github.com/bytedance/flux.git
-git submodule update --init --recursive
-# Ampere
-./build.sh --arch 80
-# Hopper
-./build.sh --arch 90
+conda create -n flux python=3.11
+conda activate flux
+pip3 install packaging
+pip3 install ninja
+pip3 install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+
+./build.sh --clean-all
+./build.sh --arch "80;89;90" --nvshmem --package
 ```
-## Build for cross-machine TP
-FLUX relies on NVSHMEM for communication across nodes. Therefore, if you need support for cross-machine tensor parallelism (TP), you must manually download the NVSHMEM source code and enable the nvshmem option during compilation.
+
+You should then find a wheel package under the `dist/` folder that matches your virtual environment.
+
+### Install from PyPI
+We also provide pre-built wheels for Flux, which can be installed directly with pip if the desired version is available. Currently we provide wheels for the following configurations: torch (2.4.0, 2.5.0, 2.6.0), python (3.10, 3.11), cuda (12.4).
 
 ```bash
-git clone https://github.com/bytedance/flux.git
-# Download nvshmem-2.11(https://developer.nvidia.com/nvshmem) and place it to flux/3rdparty/nvshmem
-# Flux is temporarily dependent on a specific version of nvshmem (2.11).
-tar Jxvf nvshmem_src_2.11.0-5.txz
-mv nvshmem_src_2.11.0-5 ${YOUR_PATH}/flux/3rdparty/nvshmem
-git submodule update --init --recursive
-
-# Ampere
-./build.sh --arch 80 --nvshmem
-# Hopper
-./build.sh --arch 90 --nvshmem
+# Make sure that PyTorch is installed.
+pip install byte-flux
 ```
 
-If you are tired of the cmake process, you can set environment variable `FLUX_BUILD_SKIP_CMAKE` to 1 to skip cmake if `build/CMakeCache.txt` already exists.
+### Customized Installation
+#### Build options for source installation
 
-If you want to build a wheel package, add `--package` to the build command. find the output wheel file under dist/
+1. Add `--nvshmem` to build Flux with NVSHMEM support. It is essential for the MoE kernels.
+2. If you are tired of the cmake process, you can set environment variable `FLUX_BUILD_SKIP_CMAKE` to 1 to skip cmake if `build/CMakeCache.txt` already exists.
+3. If you want to build a wheel package, add `--package` to the build command; find the output wheel file under `dist/`.
 
-```bash
-# Ampere
-./build.sh --arch 80 --package
 
-# Hopper
-./build.sh --arch 90 --package
-```
+#### Dependencies
+The core dependencies of Flux are NCCL, CUTLASS, and NVSHMEM, which are located under the 3rdparty folder.
+1. NCCL: Managed by git submodule automatically.
+2. NVSHMEM: Downloaded from https://developer.nvidia.com/nvshmem. The current version is 3.2.5-1.
+3. CUTLASS: Flux leverages CUTLASS to generate high-performance GEMM kernels. We currently use CUTLASS 3.7.0, with a small patch applied to it.
+
 
+## Quick Start
 
-## Run Demo
+Below are commands to run some basic demos once you have installed Flux successfully.
 ```bash
 # gemm only
-PYTHONPATH=./python:$PYTHONPATH python3 test/test_gemm_only.py 4096 12288 6144 --dtype=float16
+python3 test/python/gemm_only/test_gemm_only.py 4096 12288 6144 --dtype=float16
 
-# gemm fused with reduce-scatter
-./scripts/launch.sh test/test_gemm_rs.py 4096 12288 49152 --dtype=float16 --iters=10
+# all-gather fused with gemm (dense MLP layer0)
+./launch.sh test/python/ag_gemm/test_ag_kernel.py 4096 49152 12288 --dtype=float16 --iters=10
 
-# all-gather fused with gemm
-./scripts/launch.sh test/test_ag_kernel.py 4096 49152 12288 --dtype=float16 --iters=10
-```
+# gemm fused with reduce-scatter (dense MLP layer1)
+./launch.sh test/python/gemm_rs/test_gemm_rs.py 4096 12288 49152 --dtype=float16 --iters=10
 
-## Performance
-We measured the examples from the above demo on both A800s and H800s. Each machine has 8 GPUs, with a TP size set to 8. The table below shows the performance comparison between flux and torch+nccl. It can be observed that by overlapping fine-grained computation and communication, Flux is able to effectively hide a significant portion of the communication time
+# all-gather fused with grouped gemm (MoE MLP layer0)
+./launch.sh test/python/moe_ag_scatter/test_moe_ag.py
 
-| | M | K | N | Torch Gemm | Torch NCCL | Torch Total | Flux Gemm | Flux Comm | Flux Total |
-|----------|----------|----------|----------|----------|----------|----------|----------|----------|-----------|
-| AG+Gemm(A800) | 4096 | 12288 | 49152 | 2.438ms | 0.662ms | 3.099ms | 2.378ms | 0.091ms | 2.469ms |
-| Gemm+RS(A800) | 4096 | 49152 | 12288 | 2.453ms | 0.646ms | 3.100ms | 2.429ms | 0.080ms | 2.508ms |
-| AG+Gemm(H800) | 4096 | 12288 | 49152 | 0.846ms | 0.583ms | 1.429ms | 0.814ms | 0.143ms | 0.957ms |
-| Gemm+RS(H800) | 4096 | 49152 | 12288 | 0.818ms | 0.590ms | 1.408ms | 0.822ms | 0.111ms | 0.932ms |
+# grouped gemm fused with reduce-scatter (MoE MLP layer1)
+./launch.sh test/python/moe_gather_rs/test_moe_gather_rs.py
+```
+
+Check out the documentation for more details!
 
-AG refers to AllGather.
-RS refers to ReduceScatter.
+* For more detailed usage of the MoE kernels, please refer to [Flux MoE Usage](https://github.com/bytedance/flux/blob/main/docs/moe_usage.md). Try some [examples](https://github.com/bytedance/flux/blob/main/examples) as a quick start. A [minimal MoE layer](https://github.com/bytedance/flux/blob/main/examples/moe_flux_only.py) can be implemented within only a few tens of lines of code using Flux!
+* For some performance numbers, please refer to [Performance Doc](https://github.com/bytedance/flux/blob/main/docs/performance.md).
+* To learn more about the design principles of Flux, please refer to [Design Doc](https://github.com/bytedance/flux/blob/main/docs/design.md).
 
 
-## Citing
+## Citations
 
 If you use Flux in a scientific publication, we encourage you to add the following reference
 to the related papers:
@@ -92,11 +103,22 @@ to the related papers:
   archivePrefix={arXiv},
   primaryClass={cs.LG}
 }
+
+@misc{zhang2025comet,
+  title={Comet: Fine-grained Computation-communication Overlapping for Mixture-of-Experts},
+  author={Shulai Zhang, Ningxin Zheng, Haibin Lin, Ziheng Jiang, Wenlei Bao, Chengquan Jiang, Qi Hou, Weihao Cui, Size Zheng, Li-Wen Chang, Quan Chen and Xin Liu},
+  year={2025},
+  eprint={2502.19811},
+  archivePrefix={arXiv},
+  primaryClass={cs.DC}
+}
+
 ```
 
 ## Reference
 
-* [ArXiv Paper](http://arxiv.org/abs/2406.06858)
+* [ArXiv Paper (Flux)](http://arxiv.org/abs/2406.06858)
+* [ArXiv Paper (Comet)](https://arxiv.org/abs/2502.19811)
 
 ## [License](./LICENSE)
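To close the loop on the packaging changes above, here is a hedged sketch of consuming the built wheel; the wheel filename pattern is assumed from the `byte-flux` PyPI name and will vary with your Python/torch/CUDA combination.

```bash
# Install the wheel produced by `./build.sh ... --package` and run one of
# the Quick Start demos; adjust the glob to the actual filename under dist/.
pip install dist/byte_flux-*.whl
python3 test/python/gemm_only/test_gemm_only.py 4096 12288 6144 --dtype=float16
```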
