Skip to content

Commit 2a15f55

Browse files
test: before_request split pdf hook unit tests
1 parent: 4689fc1 · commit: 2a15f55

File tree

2 files changed: 64 additions and 47 deletions.

_test_unstructured_client/integration/test_integration.py

Lines changed: 0 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
import pytest
99
from deepdiff import DeepDiff
10-
from httpx import RequestError
1110
from unstructured_client import UnstructuredClient
1211
from unstructured_client.models import shared, operations
1312
from unstructured_client.models.errors import SDKError, ServerError, HTTPValidationError
@@ -351,52 +350,6 @@ def test_partition_strategy_vlm_anthropic(split_pdf, vlm_model, vlm_model_provid
351350
assert response.elements[0]["metadata"]["partitioner_type"] == "vlm_partition"
352351

353352

354-
@pytest.mark.parametrize(
355-
("pdf_name", "expected_error_message"),
356-
[
357-
(
358-
"failing-encrypted.pdf",
359-
"File is encrypted. Please decrypt it with password.",
360-
),
361-
(
362-
"failing-missing-root.pdf",
363-
"File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
364-
),
365-
(
366-
"failing-missing-pages.pdf",
367-
"File does not appear to be a valid PDF. Error: Invalid object in /Pages",
368-
),
369-
],
370-
)
371-
def test_returns_request_error_for_invalid_pdf(
372-
caplog: pytest.LogCaptureFixture,
373-
doc_path: Path,
374-
client: UnstructuredClient,
375-
pdf_name: str,
376-
expected_error_message: str,
377-
):
378-
"""Test that we get a RequestError with the correct error message for invalid PDF files."""
379-
with open(doc_path / pdf_name, "rb") as f:
380-
files = shared.Files(
381-
content=f.read(),
382-
file_name=pdf_name,
383-
)
384-
385-
req = operations.PartitionRequest(
386-
partition_parameters=shared.PartitionParameters(
387-
files=files,
388-
strategy="fast",
389-
split_pdf_page=True,
390-
)
391-
)
392-
393-
with pytest.raises(RequestError) as exc_info:
394-
client.general.partition(request=req)
395-
396-
assert exc_info.value.request is not None
397-
assert expected_error_message in caplog.text
398-
399-
400353
def test_returns_422_for_invalid_pdf(
401354
caplog: pytest.LogCaptureFixture,
402355
doc_path: Path,

_test_unstructured_client/unit/test_split_pdf_hook.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88
from unittest.mock import MagicMock, patch
99

1010
import httpx
11+
from httpx import RequestError
1112
import pytest
1213
import requests
1314
from requests_toolbelt import MultipartDecoder
1415

16+
from _test_unstructured_client.unit_utils import sample_docs_path
1517
from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
1618
from unstructured_client._hooks.custom.form_utils import (
1719
FormData,
@@ -29,6 +31,7 @@
2931
SplitPdfHook,
3032
get_optimal_split_size, run_tasks,
3133
)
34+
from unstructured_client._hooks.types import BeforeRequestContext
3235
from unstructured_client.models import shared
3336

3437

@@ -462,3 +465,64 @@ def test_unit_get_split_pdf_cache_tmp_data_dir_uses_dir_from_form_data(mock_path
462465
mock_path.assert_called_once_with(mock_dir)
463466
mock_path_instance.exists.assert_called_once()
464467
assert result == str(Path(mock_dir).resolve())
468+
469+
470+
def test_before_request_raises_request_error_when_pdf_check_fails():
471+
"""Test that before_request raises RequestError when pdf_utils.check_pdf throws PDFValidationError."""
472+
hook = SplitPdfHook()
473+
474+
# Initialize the hook with a mock client
475+
mock_client = MagicMock()
476+
hook.sdk_init(base_url="http://localhost:8888", client=mock_client)
477+
478+
# Create a mock request context
479+
mock_hook_ctx = MagicMock()
480+
mock_hook_ctx.operation_id = "partition"
481+
482+
# Create a mock request with proper headers and content
483+
mock_request = MagicMock()
484+
mock_request.headers = {"Content-Type": "multipart/form-data"}
485+
mock_request.url.host = "localhost"
486+
487+
# Mock the form data to include the necessary fields for PDF splitting
488+
mock_pdf_file = MagicMock()
489+
mock_pdf_file.read.return_value = b"mock_pdf_content"
490+
491+
mock_form_data = {
492+
"split_pdf_page": "true",
493+
"files": {
494+
"filename": "test.pdf",
495+
"content_type": "application/pdf",
496+
"file": mock_pdf_file
497+
}
498+
}
499+
500+
# Mock the PDF reader object
501+
mock_pdf_reader = MagicMock()
502+
503+
# Define the error message that will be raised
504+
error_message = "File does not appear to be a valid PDF."
505+
506+
with patch("unstructured_client._hooks.custom.request_utils.get_multipart_stream_fields") as mock_get_fields, \
507+
patch("unstructured_client._hooks.custom.pdf_utils.read_pdf") as mock_read_pdf, \
508+
patch("unstructured_client._hooks.custom.pdf_utils.check_pdf") as mock_check_pdf, \
509+
patch("unstructured_client._hooks.custom.request_utils.get_base_url") as mock_get_base_url:
510+
511+
# Set up the mocks
512+
mock_get_fields.return_value = mock_form_data
513+
mock_read_pdf.return_value = mock_pdf_reader
514+
mock_check_pdf.side_effect = pdf_utils.PDFValidationError(error_message)
515+
mock_get_base_url.return_value = "http://localhost:8888"
516+
517+
# Call the method under test and verify it raises RequestError
518+
with pytest.raises(RequestError) as exc_info:
519+
hook.before_request(mock_hook_ctx, mock_request)
520+
521+
# Verify the exception has the correct message and request object
522+
assert str(exc_info.value) == error_message
523+
assert exc_info.value.request == mock_request
524+
525+
# Verify that the mocked functions were called as expected
526+
mock_get_fields.assert_called_once_with(mock_request)
527+
mock_read_pdf.assert_called_once_with(mock_pdf_file)
528+
mock_check_pdf.assert_called_once_with(mock_pdf_reader)

0 commit comments

Comments
 (0)