package ai.koog.prompt.executor.ollama.client

import ai.koog.agents.core.annotation.ExperimentalAgentsApi
import ai.koog.prompt.dsl.Prompt
import ai.koog.prompt.llm.LLModel
import ai.koog.prompt.tokenizer.PromptTokenizer
import io.github.oshai.kotlinlogging.KotlinLogging

private val logger = KotlinLogging.logger { }

/**
 * Represents a strategy for computing the context window length for `OllamaClient`.
 * Different implementations define specific approaches to computing the context window length.
 * Ollama truncates the prompt to fit within the context window length computed by this strategy.
 *
 * To decide the context window length, Ollama proceeds as follows:
 * - If a `num_ctx` parameter is specified in the chat request, the context window length is set to that value.
 * - If the model definition contains a `num_ctx` parameter, the context window length is set to that value.
 * - If an `OLLAMA_CONTEXT_LENGTH` environment variable is set, the context window length is set to that value.
 * - Otherwise, the context window length is set to the default value of 2048.
 *
 * Effectively, this strategy lets you specify the `num_ctx` value sent in chat requests to Ollama
 * for a given prompt and model.
 *
 * Important: prefer a context window length that rarely changes for a given model,
 * because Ollama reloads the model every time the context window length changes.
 *
 * Example implementations:
 * - [ContextWindowStrategy.None]
 * - [ContextWindowStrategy.Fixed]
 * - [ContextWindowStrategy.FitPrompt]
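 *
 * A minimal sketch of a custom implementation (the `MaxContext` name is illustrative; how the strategy is
 * wired into `OllamaClient` depends on that client's API and is not shown here):
 * ```kotlin
 * // Always request the model's full context window, regardless of the prompt length.
 * object MaxContext : ContextWindowStrategy {
 *     override fun computeContextLength(prompt: Prompt, model: LLModel): Long = model.contextLength
 * }
 * ```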
 */
@ExperimentalAgentsApi
public interface ContextWindowStrategy {

    /**
     * Computes the context window length (`num_ctx`) to request for the given [prompt] and [model],
     * or returns null to let the Ollama server decide.
     */
    public fun computeContextLength(prompt: Prompt, model: LLModel): Long?

    public companion object {
        /**
         * A strategy for letting the Ollama server decide the context window length.
         * To decide the context window length, Ollama proceeds as follows:
         * - If the model definition contains a `num_ctx` parameter, the context window length is set to that value.
         * - If an `OLLAMA_CONTEXT_LENGTH` environment variable is set, the context window length is set to that value.
         * - Otherwise, the context window length is set to the default value of 2048.
         */
        public data object None : ContextWindowStrategy {
            override fun computeContextLength(prompt: Prompt, model: LLModel): Long? = null
        }

        /**
         * A strategy for specifying a fixed context window length.
         * If the given [contextLength] exceeds the maximum context window length supported by the model,
         * the model's maximum context window length is used instead.
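         *
         * For example, `Fixed(contextLength = 8192)` requests an 8192-token context window for every prompt,
         * clamped to the model's maximum context length if that is smaller.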
         *
         * @param contextLength The context window length to use.
         */
        public data class Fixed(val contextLength: Long) : ContextWindowStrategy {
            override fun computeContextLength(prompt: Prompt, model: LLModel): Long {
                if (contextLength > model.contextLength) {
                    logger.warn {
                        "Context length $contextLength was more than what is supported by model '${model.id}'," +
                            " falling back to the model's maximum context length ${model.contextLength}"
                    }
                    return model.contextLength
                }
                return contextLength
            }
        }

        /**
         * A strategy for computing the context window length based on the prompt length.
         * The prompt length is rounded up to the next multiple of [granularity], so the requested context window
         * grows in steps rather than changing on every request.
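         * For example, with the default granularity of 2048, a prompt measured at 3000 tokens yields a
         * context window of `(3000 / 2048 + 1) * 2048 = 4096`.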
         *
         * @param promptTokenizer The [PromptTokenizer] used to compute the prompt length,
         * or null to use the token usage reported by the latest response.
         * @param granularity The granularity used for rounding the context window length. Defaults to 2048.
         * @param minimumContextLength The minimum context window length, used when the prompt length
         * is smaller than it or cannot be computed yet.
         * If not null, [minimumContextLength] must be a multiple of [granularity].
         * If null and the prompt length cannot be computed, the Ollama server decides the context window length.
         */
        public data class FitPrompt(
            val promptTokenizer: PromptTokenizer? = null,
            val granularity: Long = 2048,
            val minimumContextLength: Long? = null,
        ) : ContextWindowStrategy {

            init {
                require(granularity > 0) { "Granularity must be greater than 0" }
                require(minimumContextLength == null || minimumContextLength % granularity == 0L) {
                    "Minimum context length must be a multiple of granularity"
                }
            }

            override fun computeContextLength(prompt: Prompt, model: LLModel): Long? {
                // Prefer an explicit tokenizer; otherwise fall back to the token usage reported by the latest response.
                val promptLength = when {
                    promptTokenizer != null -> promptTokenizer.tokenCountFor(prompt)
                    prompt.latestTokenUsage != 0 -> prompt.latestTokenUsage
                    else -> null
                }

                // Nothing to measure yet: fall back to the configured minimum, or let Ollama decide if there is none.
                if (promptLength == null) return minimumContextLength
                if (promptLength > model.contextLength) {
                    logger.warn {
                        "Prompt length $promptLength was more than the maximum context length of model '${model.id}'," +
                            " falling back to the model's maximum context length ${model.contextLength}"
                    }
                    return model.contextLength
                }

                // Round the prompt length up to the next multiple of the granularity, staying at or above the
                // configured minimum and at or below the model's maximum context length.
                return ((promptLength / granularity + 1) * granularity)
                    .coerceAtLeast(minimumContextLength ?: 0L)
                    .coerceAtMost(model.contextLength)
            }
        }
    }
}