Skip to content

Add the ability to exclude files based on substrings in Filterer/FilterConfig #514

Open
@Anjum48

Description

@Anjum48

In https://github.com/Unstructured-IO/unstructured-ingest/blob/main/unstructured_ingest/processes/filter.py it would be really useful to be able to exclude files based on substrings in their path, e.g. documents in an "Archive" folder.

Proposed change:

import fnmatch
from abc import ABC
from dataclasses import dataclass, field
from typing import Any, Callable, Optional

from pydantic import BaseModel, Field

from unstructured_ingest.data_types.file_data import FileData
from unstructured_ingest.interfaces.process import BaseProcess
from unstructured_ingest.logger import logger


class FiltererConfig(BaseModel):
    # Options read by Filterer when it decides which filters to enable.
    # Every field defaults to None.

    # Glob patterns; Filterer.glob_filter keeps a file whose full path
    # matches at least one of them.
    file_glob: Optional[list[str]] = Field(
        None,
        examples=["*.pdf", "*.html"],
        description="file globs to limit which data_types of files are accepted",
    )

    # Upper bound, in bytes, compared against the file's reported size by
    # Filterer.file_size_filter.
    max_file_size: Optional[int] = Field(
        None,
        description="Max file size to process in bytes",
    )

    # Path fragments; Filterer.exclude_substrings_filter drops a file whose
    # full path contains any of them.
    exclude_substrings: Optional[list[str]] = Field(
        None,
        examples=[["Archive", "Obsolete"]],
        description="List of substrings; files with these in their path will be excluded",
    )


@dataclass
class Filterer(BaseProcess, ABC):
    """Pass a ``FileData`` through only if it satisfies every enabled filter.

    The set of enabled filters is derived from ``config`` once, in
    ``__post_init__``. Each filter is a callable taking a ``FileData`` and
    returning ``True`` to keep the file or ``False`` to discard it.
    """

    config: FiltererConfig = field(default_factory=lambda: FiltererConfig())
    filters: list[Callable[[FileData], bool]] = field(init=False, default_factory=list)

    def __post_init__(self):
        """Populate ``filters`` based on the values set in ``config``."""
        if self.config.file_glob is not None:
            self.filters.append(self.glob_filter)
        # Truthiness check: a max_file_size of None or 0 leaves the size filter off.
        if self.config.max_file_size:
            self.filters.append(self.file_size_filter)
        # Register only when at least one non-empty substring is configured;
        # empty strings are ignored by the filter, so a list made up solely of
        # them has nothing to exclude.
        if any(self.config.exclude_substrings or ()):
            self.filters.append(self.exclude_substrings_filter)

    def is_async(self) -> bool:
        """Always return ``False``."""
        return False

    def file_size_filter(self, file_data: FileData) -> bool:
        """Return whether the file's reported size is within ``config.max_file_size``.

        A falsy ``filesize_bytes`` (missing or 0) is accepted without comparison.
        """
        filesize_bytes = file_data.metadata.filesize_bytes
        if not filesize_bytes:
            return True
        return filesize_bytes <= self.config.max_file_size

    def glob_filter(self, file_data: FileData) -> bool:
        """Return ``True`` if the file's full path matches any ``config.file_glob`` pattern.

        Matching uses ``fnmatch.fnmatch`` on the whole path. A file that matches
        no pattern is logged at debug level and rejected.
        """
        path = file_data.source_identifiers.fullpath
        if any(fnmatch.fnmatch(path, pattern) for pattern in self.config.file_glob):
            return True
        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
        return False

    def exclude_substrings_filter(self, file_data: FileData) -> bool:
        """Return ``False`` if the file's full path contains any excluded substring.

        Empty substrings are skipped: ``"" in path`` is true for every path, so
        honouring one would discard every file.
        """
        path = file_data.source_identifiers.fullpath
        for substr in self.config.exclude_substrings or ():
            if substr and substr in path:
                logger.debug(
                    f"the file {path!r} is discarded due to exclude_substrings match: {substr!r}"
                )
                return False
        return True

    def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
        """Apply the enabled filters in order and return the file if all accept it.

        Args:
            file_data: The file record to test.
            **kwargs: Accepted but not used by this method.

        Returns:
            ``file_data`` unchanged when every filter returns ``True``;
            ``None`` as soon as one filter rejects it.
        """
        # Named ``check`` rather than ``filter`` to avoid shadowing the builtin.
        for check in self.filters:
            if not check(file_data):
                logger.debug(
                    f"filtered out file data due to {check.__name__}: {file_data.identifier}"
                )
                return None
        return file_data

Happy to open a PR if people want this.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions