package ai.koog.prompt.executor.ollama.client

import ai.koog.agents.core.annotation.ExperimentalAgentsApi
import ai.koog.prompt.dsl.Prompt
import ai.koog.prompt.llm.LLModel
import ai.koog.prompt.tokenizer.PromptTokenizer
import io.github.oshai.kotlinlogging.KotlinLogging

private val logger = KotlinLogging.logger { }

/**
 * Represents a strategy for computing the context window length for `OllamaClient`.
 * Different implementations define specific approaches to computing the context window length.
 * Ollama truncates the prompt to fit within the context window length computed by this strategy.
 *
 * To decide the context window length, Ollama proceeds as follows:
 * - If a `num_ctx` parameter is specified in the chat request, the context window length is set to that value.
 * - If the model definition contains a `num_ctx` parameter, the context window length is set to that value.
 * - If an `OLLAMA_CONTEXT_LENGTH` environment variable is set, the context window length is set to that value.
 * - Otherwise, the context window length is set to the default value of 2048.
 *
 * Effectively, this strategy lets you specify the `num_ctx` value sent in chat requests to Ollama
 * for a given prompt and model.
 *
 * Important: prefer a context window length that rarely changes for a given model,
 * because Ollama reloads the model every time the context window length changes.
 *
 * Example implementations:
 * - [ContextWindowStrategy.None]
 * - [ContextWindowStrategy.Fixed]
 * - [ContextWindowStrategy.FitPrompt]
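 *
 * A minimal sketch of a custom implementation (the `MaxContext` name is illustrative; how the strategy is
 * wired into `OllamaClient` depends on that client's API and is not shown here):
 * ```kotlin
 * // Always request the model's full context window, regardless of the prompt length.
 * object MaxContext : ContextWindowStrategy {
 *     override fun computeContextLength(prompt: Prompt, model: LLModel): Long = model.contextLength
 * }
 * ```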
 */
@ExperimentalAgentsApi
public interface ContextWindowStrategy {

    /**
     * Computes the context window length (`num_ctx`) to request for the given [prompt] and [model],
     * or returns null to let the Ollama server decide.
     */
    public fun computeContextLength(prompt: Prompt, model: LLModel): Long?

    public companion object {
        /**
         * A strategy for letting the Ollama server decide the context window length.
         * To decide the context window length, Ollama proceeds as follows:
         * - If the model definition contains a `num_ctx` parameter, the context window length is set to that value.
         * - If an `OLLAMA_CONTEXT_LENGTH` environment variable is set, the context window length is set to that value.
         * - Otherwise, the context window length is set to the default value of 2048.
         */
        public data object None : ContextWindowStrategy {
            override fun computeContextLength(prompt: Prompt, model: LLModel): Long? = null
        }

        /**
         * A strategy for specifying a fixed context window length.
         * If the given [contextLength] exceeds the maximum context window length supported by the model,
         * the model's maximum context window length is used instead.
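         *
         * For example, `Fixed(contextLength = 8192)` requests an 8192-token context window for every prompt,
         * clamped to the model's maximum context length if that is smaller.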
         *
         * @param contextLength The context window length to use.
         */
        public data class Fixed(val contextLength: Long) : ContextWindowStrategy {
            override fun computeContextLength(prompt: Prompt, model: LLModel): Long {
                if (contextLength > model.contextLength) {
                    logger.warn {
                        "Context length $contextLength was more than what is supported by model '${model.id}'," +
                            " falling back to the model's maximum context length ${model.contextLength}"
                    }
                    return model.contextLength
                }
                return contextLength
            }
        }

        /**
         * A strategy for computing the context window length based on the prompt length.
         * The prompt length is rounded up to the next multiple of [granularity], so the requested context window
         * grows in steps rather than changing on every request.
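         * For example, with the default granularity of 2048, a prompt measured at 3000 tokens yields a
         * context window of `(3000 / 2048 + 1) * 2048 = 4096`.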
         *
         * @param promptTokenizer The [PromptTokenizer] used to compute the prompt length,
         * or null to use the token usage reported by the latest response.
         * @param granularity The granularity used for rounding the context window length. Defaults to 2048.
         * @param minimumContextLength The minimum context window length, used when the prompt length
         * is smaller than it or cannot be computed yet.
         * If not null, [minimumContextLength] must be a multiple of [granularity].
         * If null and the prompt length cannot be computed, the Ollama server decides the context window length.
         */
        public data class FitPrompt(
            val promptTokenizer: PromptTokenizer? = null,
            val granularity: Long = 2048,
            val minimumContextLength: Long? = null,
        ) : ContextWindowStrategy {

            init {
                require(granularity > 0) { "Granularity must be greater than 0" }
                require(minimumContextLength == null || minimumContextLength % granularity == 0L) {
                    "Minimum context length must be a multiple of granularity"
                }
            }

            override fun computeContextLength(prompt: Prompt, model: LLModel): Long? {
                // Prefer an explicit tokenizer; otherwise fall back to the token usage reported by the latest response.
                val promptLength = when {
                    promptTokenizer != null -> promptTokenizer.tokenCountFor(prompt)
                    prompt.latestTokenUsage != 0 -> prompt.latestTokenUsage
                    else -> null
                }

                // Nothing to measure yet: fall back to the configured minimum, or let Ollama decide if there is none.
                if (promptLength == null) return minimumContextLength
                if (promptLength > model.contextLength) {
                    logger.warn {
                        "Prompt length $promptLength was more than the maximum context length of model '${model.id}'," +
                            " falling back to the model's maximum context length ${model.contextLength}"
                    }
                    return model.contextLength
                }

                // Round the prompt length up to the next multiple of the granularity, staying at or above the
                // configured minimum and at or below the model's maximum context length.
                return ((promptLength / granularity + 1) * granularity)
                    .coerceAtLeast(minimumContextLength ?: 0L)
                    .coerceAtMost(model.contextLength)
            }
        }
    }
}