Skip to content

Commit 956e5b8

Browse files
authored
Extract content metadata into a separate method (#550)
Signed-off-by: Sergey Karpov <[email protected]>
1 parent 936111f commit 956e5b8

File tree

9 files changed

+102
-71
lines changed

9 files changed

+102
-71
lines changed

rag/rag-base/src/commonMain/kotlin/ai/koog/rag/base/files/FileMetadata.kt

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,12 @@ import kotlinx.serialization.Serializable
88
*
99
* @property type The type of the file, indicating whether it is a file or a directory.
1010
* @property hidden A flag indicating whether the file or directory is hidden.
11-
* @property content The type of content stored in the file.
1211
*/
1312
@Serializable
1413
public data class FileMetadata(
1514
@SerialName("content_type")
1615
val type: FileType,
1716
val hidden: Boolean,
18-
val content: FileContent,
1917
) {
2018
/**
2119
* Represents the type of a file in the context of file metadata.
@@ -60,7 +58,7 @@ public data class FileMetadata(
6058
*
6159
* @property display A string representation of the content type.
6260
*/
63-
public enum class FileContent(public val display: String) {
61+
public enum class FileContentType(public val display: String) {
6462
/**
6563
* Represents textual file content.
6664
* Associated with the string display value "text".
@@ -73,15 +71,6 @@ public data class FileMetadata(
7371
* This value is used to denote that the content of a file is binary data, as opposed to plain text or other types.
7472
* It provides a display string ("binary") to identify this content type.
7573
*/
76-
Binary("binary"),
77-
78-
/**
79-
* Represents a file content type that is not applicable or relevant.
80-
* Typically used in scenarios where the file content type does not
81-
* conform to valid or expected classifications.
82-
*
83-
* This enumeration is part of the `FileContent` enum.
84-
*/
85-
Inapplicable("inapplicable")
74+
Binary("binary")
8675
}
8776
}

rag/rag-base/src/commonMain/kotlin/ai/koog/rag/base/files/FileSystemProvider.kt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,16 @@ public object FileSystemProvider {
7575
*/
7676
public suspend fun metadata(path: Path): FileMetadata?
7777

78+
/**
79+
* Detects the type of content stored in a file using a [path].
80+
*
81+
* @param path The path to a file whose content type is to be detected.
82+
* @return [FileMetadata.FileContentType.Text] for text files, [FileMetadata.FileContentType.Binary] for binary files.
83+
* @throws IllegalArgumentException if the path doesn't exist or isn't a regular file.
84+
* @throws IOException if an I/O error occurs while detecting the file content type.
85+
*/
86+
public suspend fun getFileContentType(path: Path): FileMetadata.FileContentType
87+
7888
/**
7989
* Lists contents of a [directory].
8090
* Children are sorted by name.

rag/rag-base/src/commonMain/kotlin/ai/koog/rag/base/files/FilteredFileSystemProvider.kt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ internal open class FilteredReadOnly<P>(
6363
}
6464
}
6565

66+
override suspend fun getFileContentType(path: P): FileMetadata.FileContentType {
67+
requireAllowed(path)
68+
return fs.getFileContentType(path)
69+
}
70+
6671
override suspend fun exists(path: P): Boolean {
6772
return if (filter.show(path, fs)) {
6873
fs.exists(path)

rag/rag-base/src/jvmMain/kotlin/ai/koog/rag/base/files/JVMFileSystemProvider.kt

Lines changed: 41 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
package ai.koog.rag.base.files
22

3-
import ai.koog.rag.base.files.FileMetadata.FileContent
3+
import ai.koog.rag.base.files.FileMetadata.FileContentType
44
import ai.koog.rag.base.files.FileMetadata.FileType
55
import kotlinx.coroutines.Dispatchers
66
import kotlinx.coroutines.withContext
@@ -135,9 +135,9 @@ public object JVMFileSystemProvider {
135135
*/
136136
override suspend fun metadata(path: Path): FileMetadata? {
137137
return if (path.isRegularFile()) {
138-
FileMetadata(FileType.File, path.isHidden(), path.contentType())
138+
FileMetadata(FileType.File, path.isHidden())
139139
} else if (path.isDirectory()) {
140-
FileMetadata(FileType.Directory, path.isHidden(), path.contentType())
140+
FileMetadata(FileType.Directory, path.isHidden())
141141
} else {
142142
null
143143
}
@@ -189,6 +189,44 @@ public object JVMFileSystemProvider {
189189
*/
190190
override suspend fun exists(path: Path): Boolean = path.exists()
191191

192+
/**
193+
* Detects the type of content stored in a file using a [path].
194+
*
195+
* @param path The path to the file whose content type is to be detected.
196+
* @return [FileContentType.Text] for text files, [FileContentType.Binary] for binary files.
197+
* @throws IllegalArgumentException if the path doesn't exist or isn't a regular file.
198+
* @throws IOException per the interface contract; note that this implementation catches read failures and reports such files as [FileContentType.Binary] instead.
199+
*/
200+
override suspend fun getFileContentType(path: Path): FileContentType {
201+
require(path.exists()) { "Path must exist" }
202+
require(path.isRegularFile()) { "Path must be a regular file" }
203+
return if (path.isFileHeadTextBased()) FileContentType.Text else FileContentType.Binary
204+
}
205+
206+
/**
207+
* Determines if the beginning of a file's content is text-based, as opposed to binary.
208+
* This method reads a specified amount of data from the start of the file,
209+
* attempts decoding with a list of provided character sets, and checks if any succeed.
210+
*
211+
* @param headMaxSize The maximum number of bytes to read from the start of the file. Defaults to 1024 bytes.
212+
* @param charsetsToTry A list of character sets to attempt decoding the file's content. Defaults to a list containing UTF-8.
213+
* @return True if the file's head data is successfully decoded with one of the given character sets; false otherwise, including when the file cannot be opened or read.
214+
*/
215+
private fun Path.isFileHeadTextBased(
216+
headMaxSize: Int = 1024,
217+
charsetsToTry: List<Charset> = listOf(
218+
Charsets.UTF_8,
219+
)
220+
): Boolean {
221+
return runCatching {
222+
val headData = inputStream().use { stream ->
223+
val buffer = ByteArray(headMaxSize)
224+
stream.read(buffer, 0, headMaxSize).let { ByteBuffer.wrap(buffer.copyOf(it)) }
225+
}
226+
charsetsToTry.any { runCatching { it.newDecoder().decode(headData) }.isSuccess }
227+
}.getOrElse { false }
228+
}
229+
192230
/**
193231
* Reads the contents of the file located at the specified path.
194232
*
@@ -371,45 +409,4 @@ public object JVMFileSystemProvider {
371409
}
372410
}
373411
}
374-
375-
/**
376-
* Determines the type of content for a file represented by the Path.
377-
*
378-
* This function evaluates the file at the given Path to classify its content type as
379-
* either textual, binary, or inapplicable. It checks if the file's head data can be
380-
* classified as text using specific character sets, identifies binary content for regular files,
381-
* and returns inapplicable for all other cases.
382-
*
383-
* @return the file content type as one of the [FileContent] values: [FileContent.Text],
384-
* [FileContent.Binary], or [FileContent.Inapplicable].
385-
*/
386-
private fun Path.contentType(): FileContent = when {
387-
isFileHeadTextBased() -> FileContent.Text
388-
isRegularFile() -> FileContent.Binary
389-
else -> FileContent.Inapplicable
390-
}
391-
392-
/**
393-
* Determines if the beginning of a file's content is text-based, as opposed to binary.
394-
* This method reads a specified amount of data from the start of the file,
395-
* attempts decoding with a list of provided character sets, and checks if any succeed.
396-
*
397-
* @param headMaxSize The maximum number of bytes to read from the start of the file. Defaults to 1024 bytes.
398-
* @param charsetsToTry A list of character sets to attempt decoding the file's content. Defaults to a list containing UTF-8.
399-
* @return True if the file's head data is successfully decoded with one of the given character sets, otherwise false.
400-
*/
401-
private fun Path.isFileHeadTextBased(
402-
headMaxSize: Int = 1024,
403-
charsetsToTry: List<Charset> = listOf(
404-
Charsets.UTF_8,
405-
)
406-
): Boolean {
407-
return runCatching {
408-
val headData = inputStream().use { stream ->
409-
val buffer = ByteArray(headMaxSize)
410-
stream.read(buffer, 0, headMaxSize).let { ByteBuffer.wrap(buffer.copyOf(it)) }
411-
}
412-
charsetsToTry.any { runCatching { it.newDecoder().decode(headData) }.isSuccess }
413-
}.getOrElse { false }
414-
}
415412
}

rag/rag-base/src/jvmTest/kotlin/ai/koog/rag/base/files/JVMFileSystemProviderTest.kt

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,7 @@ class JVMFileSystemProviderTest : KoogTestBase() {
181181

182182
@Test
183183
fun `test ReadOnly metadata`() = runBlocking {
184-
val metadata =
185-
FileMetadata(FileMetadata.FileType.File, hidden = false, content = FileMetadata.FileContent.Text)
184+
val metadata = FileMetadata(FileMetadata.FileType.File, hidden = false)
186185
val testMetadata = readOnly.metadata(file1)
187186
assertEquals(metadata, testMetadata)
188187
}
@@ -344,11 +343,36 @@ class JVMFileSystemProviderTest : KoogTestBase() {
344343
}
345344

346345
@Test
347-
fun `test ReadOnly metadata dir`() = runBlocking {
346+
fun `test metadata file`() = runBlocking {
347+
val expectedMetadata = FileMetadata(FileMetadata.FileType.File, hidden = false)
348+
val actualMetadata = readOnly.metadata(file1)
349+
assertEquals(expectedMetadata, actualMetadata)
350+
}
351+
352+
@Test
353+
fun `test getFileContentType with Text type`() = runBlocking {
354+
val contentType = readOnly.getFileContentType(file1)
355+
assertEquals(FileMetadata.FileContentType.Text, contentType)
356+
}
357+
358+
@Test
359+
fun `test getFileContentType with directory throws IllegalArgumentException`() {
360+
assertThrows<IllegalArgumentException> {
361+
runBlocking { readOnly.getFileContentType(dirEmpty) }
362+
}
363+
}
364+
365+
@Test
366+
fun `test getFileContentType with Binary type`() = runBlocking {
367+
val contentType = readOnly.getFileContentType(zip1)
368+
assertEquals(FileMetadata.FileContentType.Binary, contentType)
369+
}
370+
371+
@Test
372+
fun `test metadata dir`() = runBlocking {
348373
val metadata = FileMetadata(
349374
FileMetadata.FileType.Directory,
350375
hidden = false,
351-
content = FileMetadata.FileContent.Inapplicable
352376
)
353377
val testMetadata = readOnly.metadata(dir1)
354378
assertEquals(metadata, testMetadata)
@@ -411,10 +435,9 @@ class JVMFileSystemProviderTest : KoogTestBase() {
411435

412436
@Test
413437
fun `test ReadWrite metadata`() = runBlocking {
414-
val metadata =
415-
FileMetadata(FileMetadata.FileType.File, hidden = false, content = FileMetadata.FileContent.Text)
438+
val expectedMetadata = FileMetadata(FileMetadata.FileType.File, hidden = false)
416439
val testMetadata = readWrite.metadata(file1)
417-
assertEquals(metadata, testMetadata)
440+
assertEquals(expectedMetadata, testMetadata)
418441
}
419442

420443
@Test

rag/vector-storage/src/commonTest/kotlin/ai/koog/rag/vector/FileDocumentEmbeddingStorageTest.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import ai.koog.embeddings.base.Vector
44
import ai.koog.rag.vector.mocks.MockDocument
55
import ai.koog.rag.vector.mocks.MockDocumentProvider
66
import ai.koog.rag.vector.mocks.MockFileSystem
7-
import ai.koog.rag.vector.mocks.MockFileSystemProvicer
7+
import ai.koog.rag.vector.mocks.MockFileSystemProvider
88
import kotlinx.coroutines.flow.toList
99
import kotlinx.coroutines.test.runTest
1010
import kotlin.test.Test
@@ -36,7 +36,7 @@ class FileDocumentEmbeddingStorageTest {
3636
private suspend fun createTestStorage(): FileDocumentEmbeddingStorage<MockDocument, String> {
3737
val mockFileSystem = MockFileSystem()
3838
val mockDocumentProvider = MockDocumentProvider(mockFileSystem)
39-
val mockFileSystemProvicer = MockFileSystemProvicer(mockFileSystem)
39+
val mockFileSystemProvicer = MockFileSystemProvider(mockFileSystem)
4040
val mockEmbedder = MockDocumentEmbedder()
4141

4242
return FileDocumentEmbeddingStorage(mockEmbedder, mockDocumentProvider, mockFileSystemProvicer, "test-root")

rag/vector-storage/src/commonTest/kotlin/ai/koog/rag/vector/FileVectorStorageTest.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import ai.koog.embeddings.base.Vector
44
import ai.koog.rag.vector.mocks.MockDocument
55
import ai.koog.rag.vector.mocks.MockDocumentProvider
66
import ai.koog.rag.vector.mocks.MockFileSystem
7-
import ai.koog.rag.vector.mocks.MockFileSystemProvicer
7+
import ai.koog.rag.vector.mocks.MockFileSystemProvider
88
import kotlinx.coroutines.flow.toList
99
import kotlinx.coroutines.test.runTest
1010
import kotlin.test.Test
@@ -17,7 +17,7 @@ class FileVectorStorageTest {
1717
private fun createTestStorage(): FileVectorStorage<MockDocument, String> {
1818
val mockFileSystem = MockFileSystem()
1919
val mockDocumentProvider = MockDocumentProvider(mockFileSystem)
20-
val mockFileSystemProvider = MockFileSystemProvicer(mockFileSystem)
20+
val mockFileSystemProvider = MockFileSystemProvider(mockFileSystem)
2121

2222
val storage = FileVectorStorage(mockDocumentProvider, mockFileSystemProvider, "test-root")
2323
return storage

rag/vector-storage/src/commonTest/kotlin/ai/koog/rag/vector/TextFileDocumentEmbeddingStorageTest.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import ai.koog.embeddings.base.Vector
55
import ai.koog.rag.vector.mocks.MockDocument
66
import ai.koog.rag.vector.mocks.MockDocumentProvider
77
import ai.koog.rag.vector.mocks.MockFileSystem
8-
import ai.koog.rag.vector.mocks.MockFileSystemProvicer
8+
import ai.koog.rag.vector.mocks.MockFileSystemProvider
99
import kotlinx.coroutines.flow.toList
1010
import kotlinx.coroutines.test.runTest
1111
import kotlin.test.Test
@@ -33,7 +33,7 @@ class TextFileDocumentEmbeddingStorageTest {
3333
private suspend fun createTestStorage(): TextFileDocumentEmbeddingStorage<MockDocument, String> {
3434
val mockFileSystem = MockFileSystem()
3535
val mockDocumentProvider = MockDocumentProvider(mockFileSystem)
36-
val mockFileSystemProvicer = MockFileSystemProvicer(mockFileSystem)
36+
val mockFileSystemProvicer = MockFileSystemProvider(mockFileSystem)
3737
val mockEmbedder = MockEmbedder()
3838

3939
return TextFileDocumentEmbeddingStorage(mockEmbedder, mockDocumentProvider, mockFileSystemProvicer, "test-root")

rag/vector-storage/src/commonTest/kotlin/ai/koog/rag/vector/mocks/MockDocumentProviders.kt

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class MockDocumentProvider(val mockFileSystem: MockFileSystem) : DocumentProvide
4242
}
4343
}
4444

45-
class MockFileSystemProvicer(val mockFileSystem: MockFileSystem) : FileSystemProvider.ReadWrite<String> {
45+
class MockFileSystemProvider(val mockFileSystem: MockFileSystem) : FileSystemProvider.ReadWrite<String> {
4646
override fun toAbsolutePathString(path: String): String = path
4747

4848
override fun fromAbsoluteString(path: String): String = path
@@ -70,6 +70,13 @@ class MockFileSystemProvicer(val mockFileSystem: MockFileSystem) : FileSystemPro
7070

7171
override suspend fun exists(path: String): Boolean = path in mockFileSystem.documents
7272

73+
override suspend fun getFileContentType(path: String): FileMetadata.FileContentType =
74+
when (mockFileSystem.documents[path]) {
75+
is MockDocument -> FileMetadata.FileContentType.Text
76+
is MockDirectory -> throw IllegalArgumentException("Path must be a regular file")
77+
else -> throw IllegalArgumentException("Path must exist and be a regular file")
78+
}
79+
7380
override suspend fun read(path: String): ByteArray =
7481
mockFileSystem.documents[path]?.let { it as? MockDocument }?.content?.encodeToByteArray()
7582
?: throw IllegalArgumentException("Document not found: $path")

0 commit comments

Comments
 (0)