Commit 77a04d1

Merge pull request #230 from kerthcet/release/0.0.9
Release/0.0.9
2 parents 6bf48cf + 598c196 commit 77a04d1

File tree: 13 files changed, +1254 -435 lines

.github/ISSUE_TEMPLATE/new-release.md

Lines changed: 1 addition & 1 deletion
@@ -15,8 +15,8 @@ Please do not remove items from the checklist
 - [ ] Prepare the image and files
 - [ ] Run `PLATFORMS=linux/amd64 make image-push GIT_TAG=$VERSION` to build and push an image.
 - [ ] Run `make artifacts GIT_TAG=$VERSION` to generate the artifact.
-- [ ] Run `make helm-package` to package the helm chart and update the index.yaml.
 - [ ] Update `chart/Chart.yaml` and `docs/installation.md`, the helm version is different with the app version.
+- [ ] Run `make helm-package` to package the helm chart and update the index.yaml.
 - [ ] Submit a PR and merge it.
 - [ ] An OWNER [prepares a draft release](https://github.com/inftyai/llmaz/releases)
 - [ ] Create a new tag

chart/Chart.yaml

Lines changed: 2 additions & 2 deletions
@@ -13,9 +13,9 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.0.4
+version: 0.0.5
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: 0.0.8
+appVersion: 0.0.9
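
After this bump, the version stanza of chart/Chart.yaml reads as sketched below (comments elided); note the chart version (0.0.5) is deliberately decoupled from the app version (0.0.9), matching the checklist reminder above:

# chart/Chart.yaml (excerpt)
version: 0.0.5     # helm chart version, bumped independently
appVersion: 0.0.9  # version of llmaz being deployed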

chart/crds/openmodel-crd.yaml

Lines changed: 7 additions & 15 deletions
@@ -95,28 +95,20 @@ spec:
               pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
               x-kubernetes-int-or-string: true
             description: |-
-              Requests defines the required accelerators to serve the model, like nvidia.com/gpu: 8.
-              When GPU number is greater than 8, like 32, then multi-host inference is enabled and
-              32/8=4 hosts will be grouped as an unit, each host will have a resource request as
-              nvidia.com/gpu: 8. The may change in the future if the GPU number limit is broken.
-              Not recommended to set the cpu and memory usage here.
-              If using playground, you can define the cpu/mem usage at backendConfig.
-              If using service, you can define the cpu/mem at the container resources.
-              Note: if you define the same accelerator requests at playground/service as well,
+              Requests defines the required accelerators to serve the model for each replica,
+              like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
+              the resource requirements for each replica. This may change in the future.
+              Not recommended to set the cpu and memory usage here:
+              - if using playground, you can define the cpu/mem usage at backendConfig.
+              - if using inference service, you can define the cpu/mem at the container resources.
+              However, if you define the same accelerator requests at playground/service as well,
               the requests here will be covered.
             type: object
           required:
           - name
           type: object
         maxItems: 8
       type: array
-      preheat:
-        default: false
-        description: |-
-          Preheat represents whether we should preload the model, by default will use Manta(https://github.com/InftyAI/Manta)
-          to preload the model, so you should enable the Manta in prior.
-          Note: right now, we only support preloading models from Huggingface.
-        type: boolean
       source:
         description: |-
           Source represents the source of the model, there're several ways to load
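
Under the new wording, each flavor's requests are read per replica. A minimal hypothetical fragment is sketched below; only the name and requests fields and the 8-entry cap are defined by the hunk above, while the surrounding list field and the flavor names are illustrative:

# Hypothetical flavor list fragment (the list's field name is not shown in this hunk)
flavors:
- name: a100              # `name` is required by the CRD
  requests:
    nvidia.com/gpu: 8     # per-replica accelerator request
- name: h100
  requests:
    nvidia.com/gpu: 4
# The CRD caps this list at 8 entries (maxItems: 8).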

chart/crds/playground-crd.yaml

Lines changed: 32 additions & 3 deletions
@@ -45,13 +45,21 @@ spec:
               BackendRuntimeConfig represents the inference backendRuntime configuration
               under the hood, e.g. vLLM, which is the default backendRuntime.
             properties:
-              args:
+              argFlags:
                 description: |-
-                  Args represents the arguments appended to the backend.
-                  You can add new args or overwrite the default args.
+                  ArgFlags represents the argument flags appended to the backend.
+                  You can add new flags or overwrite the default flags.
                 items:
                   type: string
                 type: array
+              argName:
+                description: |-
+                  ArgName represents the argument name set in the backendRuntimeArg.
+                  If not set, will be derived by the model role, e.g. if one model's role
+                  is <draft>, the argName will be set to <speculative-decoding>. Better to
+                  set the argName explicitly.
+                  By default, the argName will be treated as <default> in runtime.
+                type: string
               envs:
                 description: Envs represents the environments set to the container.
                 items:
@@ -214,6 +222,27 @@ spec:
               from the default version.
             type: string
           type: object
+          elasticConfig:
+            description: |-
+              ElasticConfig defines the configuration for elastic usage,
+              e.g. the max/min replicas. Default to 0 ~ Inf+.
+              This requires to install the HPA first or will not work.
+            properties:
+              maxReplicas:
+                description: |-
+                  MaxReplicas indicates the maximum number of inference workloads based on the traffic.
+                  Default to nil means there's no limit for the instance number.
+                format: int32
+                type: integer
+              minReplicas:
+                default: 1
+                description: |-
+                  MinReplicas indicates the minimum number of inference workloads based on the traffic.
+                  Default to nil means we can scale down the instances to 1.
+                  If minReplicas set to 0, it requires to install serverless component at first.
+                format: int32
+                type: integer
+            type: object
           modelClaim:
             description: |-
               ModelClaim represents claiming for one model, it's a simplified use case
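
Putting the two hunks together, a Playground using the renamed argFlags plus the newly relocated elasticConfig might look like the sketch below. The apiVersion, names, flag value, and the modelName field under modelClaim are assumptions for illustration; only argFlags, argName, elasticConfig, minReplicas, and maxReplicas come from the CRD above:

apiVersion: inference.llmaz.io/v1alpha1   # assumed group/version
kind: Playground
metadata:
  name: qwen2-demo                        # illustrative name
spec:
  modelClaim:
    modelName: qwen2-0.5b                 # assumed field; the hunk only shows modelClaim itself
  backendRuntimeConfig:
    argName: default                      # per the CRD, unset argName is treated as <default>
    argFlags:                             # renamed from `args` in this release
    - --max-model-len=8192                # illustrative vLLM flag
  elasticConfig:                          # moved here from the Service CRD (see next file)
    minReplicas: 1                        # CRD default
    maxReplicas: 3                        # nil would mean no upper bound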

chart/crds/service-crd.yaml

Lines changed: 0 additions & 21 deletions
@@ -43,27 +43,6 @@ spec:
               Service controller will maintain multi-flavor of workloads with
               different accelerators for cost or performance considerations.
             properties:
-              elasticConfig:
-                description: |-
-                  ElasticConfig defines the configuration for elastic usage,
-                  e.g. the max/min replicas. Default to 0 ~ Inf+.
-                  This requires to install the HPA first or will not work.
-                properties:
-                  maxReplicas:
-                    description: |-
-                      MaxReplicas indicates the maximum number of inference workloads based on the traffic.
-                      Default to nil means there's no limit for the instance number.
-                    format: int32
-                    type: integer
-                  minReplicas:
-                    default: 1
-                    description: |-
-                      MinReplicas indicates the minimum number of inference workloads based on the traffic.
-                      Default to nil means we can scale down the instances to 1.
-                      If minReplicas set to 0, it requires to install serverless component at first.
-                    format: int32
-                    type: integer
-                type: object
               modelClaims:
                 description: ModelClaims represents multiple claims for different
                   models.
