Add the ability to exclude files based on substrings in `Filterer`/`FiltererConfig`

In https://github.com/Unstructured-IO/unstructured-ingest/blob/main/unstructured_ingest/processes/filter.py it would be really useful to be able to exclude files based on substrings in their path, e.g. documents in an "Archive" folder.

Proposed change:

```python
import fnmatch
from abc import ABC
from dataclasses import dataclass, field
from typing import Any, Callable, Optional

from pydantic import BaseModel, Field

from unstructured_ingest.data_types.file_data import FileData
from unstructured_ingest.interfaces.process import BaseProcess
from unstructured_ingest.logger import logger


class FiltererConfig(BaseModel):
    """Options that control which files a ``Filterer`` lets through.

    Every option defaults to ``None``.
    """

    # Glob patterns; a file is kept when its full path matches at least one.
    # NOTE(review): ``examples`` here is a flat list of globs, whereas
    # ``exclude_substrings`` below nests a single example list -- confirm which
    # form is intended and align the two.
    file_glob: Optional[list[str]] = Field(
        default=None,
        description="file globs to limit which types of files are accepted",
        examples=["*.pdf", "*.html"],
    )
    # Upper bound, in bytes, on the size of files that are kept.
    max_file_size: Optional[int] = Field(
        default=None, description="Max file size to process in bytes"
    )
    # Substrings that cause a file to be dropped when found in its full path.
    exclude_substrings: Optional[list[str]] = Field(
        default=None,
        description="List of substrings; files with these in their path will be excluded",
        examples=[["Archive", "Obsolete"]],
    )


@dataclass
class Filterer(BaseProcess, ABC):
    """Decide whether a ``FileData`` record should continue to be processed.

    The predicates to apply are selected once, in ``__post_init__``, from the
    options set on ``config``. ``run`` hands the record back unchanged when
    every selected predicate accepts it, and returns ``None`` as soon as one
    of them rejects it.
    """

    config: FiltererConfig = field(default_factory=lambda: FiltererConfig())
    # Predicates chosen from ``config``; each returns True to keep the file.
    filters: list[Callable[[FileData], bool]] = field(init=False, default_factory=list)

    def __post_init__(self):
        """Register one predicate per option that is enabled in ``config``."""
        # ``is not None`` means an empty glob list still installs the filter,
        # and with no pattern to match it rejects every file.
        if self.config.file_glob is not None:
            self.filters.append(self.glob_filter)
        # Truthiness check: both ``None`` and ``0`` leave the size filter off.
        if self.config.max_file_size:
            self.filters.append(self.file_size_filter)
        # Truthiness check: ``None`` and an empty list leave this filter off.
        if self.config.exclude_substrings:
            self.filters.append(self.exclude_substrings_filter)

    def is_async(self) -> bool:
        """Always return False."""
        return False

    def file_size_filter(self, file_data: FileData) -> bool:
        """Return True when the file is no larger than ``config.max_file_size``.

        A file whose ``metadata.filesize_bytes`` is falsy (unset or 0) cannot
        be compared and is kept.
        """
        if filesize_bytes := file_data.metadata.filesize_bytes:
            return filesize_bytes <= self.config.max_file_size
        return True

    def glob_filter(self, file_data: FileData) -> bool:
        """Return True when the file's full path matches any configured glob."""
        path = file_data.source_identifiers.fullpath
        # fnmatch.fnmatch tests a single name directly; no need to build a
        # one-element list and run fnmatch.filter over it.
        if any(fnmatch.fnmatch(path, pattern) for pattern in self.config.file_glob):
            return True
        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
        return False

    def exclude_substrings_filter(self, file_data: FileData) -> bool:
        """Return False when the file's full path contains an excluded substring.

        Empty strings in ``config.exclude_substrings`` are ignored.
        """
        path = file_data.source_identifiers.fullpath
        for substr in self.config.exclude_substrings:
            # "" is contained in every string, so honouring it would silently
            # discard every file; treat it as "no constraint" instead.
            if substr and substr in path:
                logger.debug(
                    f"the file {path!r} is discarded due to exclude_substrings match: {substr!r}"
                )
                return False
        return True

    def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
        """Apply the registered predicates in order.

        Returns ``file_data`` unchanged when all predicates accept it, or
        ``None`` at the first rejection. Extra keyword arguments are accepted
        and ignored.
        """
        # Named ``check`` rather than ``filter`` to avoid shadowing the builtin.
        for check in self.filters:
            if not check(file_data):
                logger.debug(
                    f"filtered out file data due to {check.__name__}: {file_data.identifier}"
                )
                return None
        return file_data
```

Happy to open a PR if people want this

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Add the ability to exclude files based on substrings in `Filterer`/`FilterConfig` #514

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Add the ability to exclude files based on substrings in Filterer/FilterConfig #514

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions

Add the ability to exclude files based on substrings in `Filterer`/`FilterConfig` #514