Release v0.0.9

kerthcet · kerthcet · commit 598c196d4d99 · 2025-01-06T19:33:42.000+08:00
Signed-off-by: kerthcet <kerthcet@gmail.com>
diff --git a/.github/ISSUE_TEMPLATE/new-release.md b/.github/ISSUE_TEMPLATE/new-release.md
@@ -15,8 +15,8 @@ Please do not remove items from the checklist
 - [ ] Prepare the image and files
   - [ ] Run `PLATFORMS=linux/amd64 make image-push GIT_TAG=$VERSION`  to build and push an image.
   - [ ] Run `make artifacts GIT_TAG=$VERSION` to generate the artifact.
-  - [ ] Run `make helm-package` to package the helm chart and update the index.yaml.
 - [ ] Update `chart/Chart.yaml` and `docs/installation.md`, the helm version is different with the app version.
+  - [ ] Run `make helm-package` to package the helm chart and update the index.yaml.
   - [ ] Submit a PR and merge it.
 - [ ] An OWNER [prepares a draft release](https://github.com/inftyai/llmaz/releases)
   - [ ] Create a new tag
diff --git a/chart/Chart.yaml b/chart/Chart.yaml
@@ -13,9 +13,9 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.0.4
+version: 0.0.5
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: 0.0.8
+appVersion: 0.0.9
diff --git a/chart/crds/openmodel-crd.yaml b/chart/crds/openmodel-crd.yaml
@@ -95,28 +95,20 @@ spec:
                         pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                         x-kubernetes-int-or-string: true
                       description: |-
-                        Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
-                        When GPU number is greater than 8, like 32, then multi-host inference is enabled and
-                        32/8=4 hosts will be grouped as an unit, each host will have a resource request as
-                        nvidia.com/gpu: 8. The may change in the future if the GPU number limit is broken.
-                        Not recommended to set the cpu and memory usage here.
-                        If using playground, you can define the cpu/mem usage at backendConfig.
-                        If using service, you can define the cpu/mem at the container resources.
-                        Note: if you define the same accelerator requests at playground/service as well,
+                        Requests defines the required accelerators to serve the model for each replica,
+                        like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
+                        the resource requirements for each replica. This may change in the future.
+                        Not recommended to set the cpu and memory usage here:
+                        - if using playground, you can define the cpu/mem usage at backendConfig.
+                        - if using inference service, you can define the cpu/mem at the container resources.
+                        However, if you define the same accelerator requests at playground/service as well,
                         the requests here will be covered.
                       type: object
                   required:
                   - name
                   type: object
                 maxItems: 8
                 type: array
-              preheat:
-                default: false
-                description: |-
-                  Preheat represents whether we should preload the model, by default will use Manta(https://github.com/InftyAI/Manta)
-                  to preload the model, so you should enable the Manta in prior.
-                  Note: right now, we only support preloading models from Huggingface.
-                type: boolean
               source:
                 description: |-
                   Source represents the source of the model, there're several ways to load
diff --git a/chart/crds/playground-crd.yaml b/chart/crds/playground-crd.yaml
@@ -45,13 +45,21 @@ spec:
                   BackendRuntimeConfig represents the inference backendRuntime configuration
                   under the hood, e.g. vLLM, which is the default backendRuntime.
                 properties:
-                  args:
+                  argFlags:
                     description: |-
-                      Args represents the arguments appended to the backend.
-                      You can add new args or overwrite the default args.
+                      ArgFlags represents the argument flags appended to the backend.
+                      You can add new flags or overwrite the default flags.
                     items:
                       type: string
                     type: array
+                  argName:
+                    description: |-
+                      ArgName represents the argument name set in the backendRuntimeArg.
+                      If not set, will be derived by the model role, e.g. if one model's role
+                      is <draft>, the argName will be set to <speculative-decoding>. Better to
+                      set the argName explicitly.
+                      By default, the argName will be treated as <default> in runtime.
+                    type: string
                   envs:
                     description: Envs represents the environments set to the container.
                     items:
@@ -214,6 +222,27 @@ spec:
                       from the default version.
                     type: string
                 type: object
+              elasticConfig:
+                description: |-
+                  ElasticConfig defines the configuration for elastic usage,
+                  e.g. the max/min replicas. Default to 0 ~ Inf+.
+                  This requires to install the HPA first or will not work.
+                properties:
+                  maxReplicas:
+                    description: |-
+                      MaxReplicas indicates the maximum number of inference workloads based on the traffic.
+                      Default to nil means there's no limit for the instance number.
+                    format: int32
+                    type: integer
+                  minReplicas:
+                    default: 1
+                    description: |-
+                      MinReplicas indicates the minimum number of inference workloads based on the traffic.
+                      Default to nil means we can scale down the instances to 1.
+                      If minReplicas set to 0, it requires to install serverless component at first.
+                    format: int32
+                    type: integer
+                type: object
               modelClaim:
                 description: |-
                   ModelClaim represents claiming for one model, it's a simplified use case
diff --git a/chart/crds/service-crd.yaml b/chart/crds/service-crd.yaml
@@ -43,27 +43,6 @@ spec:
               Service controller will maintain multi-flavor of workloads with
               different accelerators for cost or performance considerations.
             properties:
-              elasticConfig:
-                description: |-
-                  ElasticConfig defines the configuration for elastic usage,
-                  e.g. the max/min replicas. Default to 0 ~ Inf+.
-                  This requires to install the HPA first or will not work.
-                properties:
-                  maxReplicas:
-                    description: |-
-                      MaxReplicas indicates the maximum number of inference workloads based on the traffic.
-                      Default to nil means there's no limit for the instance number.
-                    format: int32
-                    type: integer
-                  minReplicas:
-                    default: 1
-                    description: |-
-                      MinReplicas indicates the minimum number of inference workloads based on the traffic.
-                      Default to nil means we can scale down the instances to 1.
-                      If minReplicas set to 0, it requires to install serverless component at first.
-                    format: int32
-                    type: integer
-                type: object
               modelClaims:
                 description: ModelClaims represents multiple claims for different
                   models.
diff --git a/chart/templates/serviceaccount.yaml b/chart/templates/serviceaccount.yaml
@@ -7,7 +7,5 @@ metadata:
     app.kubernetes.io/created-by: llmaz
     app.kubernetes.io/part-of: llmaz
   {{- include "chart.labels" . | nindent 4 }}
-  {{- if .Values.controllerManager.serviceAccount.annotations }}
   annotations:
-    {{- toYaml .Values.controllerManager.serviceAccount.annotations | nindent 4 }}
-  {{- end }}
+    {{- toYaml .Values.controllerManager.serviceAccount.annotations | nindent 4 }}
diff --git a/chart/values.yaml b/chart/values.yaml
@@ -33,7 +33,7 @@ controllerManager:
         - ALL
     image:
       repository: inftyai/llmaz
-      tag: v0.0.8
+      tag: v0.0.9
     resources:
       limits:
         cpu: 500m
diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml
@@ -5,4 +5,4 @@ kind: Kustomization
 images:
 - name: controller
   newName: inftyai/llmaz
-  newTag: v0.0.8
+  newTag: v0.0.9
diff --git a/docs/installation.md b/docs/installation.md
@@ -12,7 +12,7 @@
 ```cmd
 helm repo add inftyai https://inftyai.github.io/llmaz
 helm repo update
-helm install llmaz inftyai/llmaz --namespace llmaz-system --create-namespace --version 0.0.4
+helm install llmaz inftyai/llmaz --namespace llmaz-system --create-namespace --version 0.0.5
 ```
 
 ### Uninstall
diff --git a/index.yaml b/index.yaml
@@ -1,6 +1,16 @@
 apiVersion: v1
 entries:
   llmaz:
+  - apiVersion: v2
+    appVersion: 0.0.9
+    created: "2025-01-06T19:30:25.471004+08:00"
+    description: A Helm chart for llmaz
+    digest: 4a36c5c0da481828e9682afb2932a96d74c7eb1dc9e4b9ceac42789520602d01
+    name: llmaz
+    type: application
+    urls:
+    - https://inftyai.github.io/llmaz/llmaz-0.0.5.tgz
+    version: 0.0.5
   - apiVersion: v2
     appVersion: 0.0.8
     created: "2024-10-23T16:25:18.126844+08:00"
@@ -41,4 +51,4 @@ entries:
     urls:
     - https://inftyai.github.io/llmaz/llmaz-0.0.1.tgz
     version: 0.0.1
-generated: "2024-10-23T16:25:18.101337+08:00"
+generated: "2025-01-06T19:30:25.435128+08:00"
diff --git a/pkg/defaults.go b/pkg/defaults.go
@@ -17,5 +17,5 @@ limitations under the License.
 package pkg
 
 const (
-	LOADER_IMAGE = "inftyai/model-loader:v0.0.9"
+	LOADER_IMAGE = "inftyai/model-loader:v0.0.10"
 )