Skip to content

feat: dedupe hash step #68

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ dependencies= [
"mdformat==0.7.17",
"spacy==3.7.5",
"tiktoken==0.7.0",
"openai==1.82.1",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add new optional dependency group:

docs = [
    "mkdocstrings[python]"
]

"fuzzywuzzy==0.18.0",
"joblib>=1.4.0",
"lxml==5.2.*",
"marshmallow<4.0.0",
Expand Down
3 changes: 3 additions & 0 deletions tests/steps/dedupe_hash/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# SPDX-FileCopyrightText: 2025 Deutsche Telekom AG ([email protected])
#
# SPDX-License-Identifier: Apache-2.0
74 changes: 74 additions & 0 deletions tests/steps/dedupe_hash/test_qdrant_compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import pandas as pd

import sys
import os
# Make the repository root importable when the tests run from a source
# checkout. The root is derived from this file's location
# (tests/steps/dedupe_hash/ -> three levels up) rather than from a
# developer-specific absolute path, so it works on any machine.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")))


from wurzel.steps.dedupe_hash.settings import QdrantCompareSettings
from wurzel.steps.dedupe_hash.step import QdrantCompareStep

# Module under test: wurzel/steps/dedupe_hash


def make_step():
    """Build a QdrantCompareStep carrying dummy settings for the unit tests.

    A default QdrantCompareSettings object is created, the values below are
    assigned onto it one by one, and the result is attached to a freshly
    constructed step as its ``settings`` attribute.
    """
    overrides = {
        "QDRANT_URL": "http://localhost:6333",
        "QDRANT_API_KEY": "dummy",
        "OPAI_API_KEY": "dummy",
        "AZURE_ENDPOINT": "https://dummy-endpoint",
        "GPT_MODEL": "dummy",
        "QDRANT_COLLECTION_PREFIX": "test_v",
        "FUZZY_THRESHOLD": 85,
        "TLSH_MAX_DIFF": 10,
    }
    config = QdrantCompareSettings()
    # Plain attribute assignment, in the same order as listed above.
    for field_name, value in overrides.items():
        setattr(config, field_name, value)

    compare_step = QdrantCompareStep()
    compare_step.settings = config
    return compare_step
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.



def test_identical_tlsh_analysis():
    """Two frames that share exactly one TLSH value report one identical hash."""
    # TODO(review): a reviewer asked to obtain the step from a fixture here
    # instead of calling make_step() directly.
    step = make_step()
    # Only the "A" * 70 value appears in both frames.
    df1 = pd.DataFrame([{"tlsh": "A" * 70}, {"tlsh": "B" * 70}])
    df2 = pd.DataFrame([{"tlsh": "A" * 70}, {"tlsh": "C" * 70}])
    identical, count = step._identical_tlsh_analysis(df1, df2, "tlsh")
    assert count == 1
    assert "A" * 70 in identical


def test_fuzzy_tlsh_matches():
    """Fuzzy TLSH matching returns at least one 3-tuple for the sample frame."""
    compare_step = make_step()
    hash_values = ["A" * 70, "A" * 70, "B" * 70]
    frame = pd.DataFrame([{"tlsh": value} for value in hash_values])
    found = compare_step._fuzzy_tlsh_matches(frame, "tlsh", 100)
    # Keep only entries shaped like a 3-tuple; at least one must exist.
    triples = [entry for entry in found if isinstance(entry, tuple) and len(entry) == 3]
    assert triples


def test_diff_snippet():
    """The diff snippet of the two sample texts contains their shared word."""
    compare_step = make_step()
    before, after = "Hallo Welt", "Hallo Erde"
    snippet = compare_step._diff_snippet(before, after)
    assert "Hallo" in snippet


def test_suspicious_cases_analysis():
    """The suspicious-case analysis returns a list whose first entry has fuzz_ratio < 100."""
    compare_step = make_step()
    rows = [
        {"text": "Hallo Welt", "tlsh": "A" * 70},
        {"text": "Hallo Erde", "tlsh": "B" * 70},
    ]
    frame = pd.DataFrame(rows)
    candidate_matches = [(0, 1, 5)]
    flagged = compare_step._suspicious_cases_analysis(frame, candidate_matches, "text")
    assert isinstance(flagged, list)
    first_case = flagged[0]
    assert first_case["fuzz_ratio"] < 100


def test_analyze_extra_docs_detail():
    """The extra-docs analysis returns a list whose first entry has an "is_truly_new" key."""
    compare_step = make_step()
    baseline = pd.DataFrame([{"text": "Hallo Welt"}])
    additional = pd.DataFrame([{"text": "Hallo Mars"}])
    details = compare_step._analyze_extra_docs_detail(baseline, additional, "text", 80)
    assert isinstance(details, list)
    first_entry = details[0]
    assert "is_truly_new" in first_entry


def test_extract_gpt_shortform():
    """Each sample GPT analysis text maps to its expected short-form label."""
    compare_step = make_step()
    # (analysis text, expected short form), checked in this order.
    cases = [
        ("Contradiction found.", "contradiction"),
        ("Keep both", "both"),
        ("Remove document 1", "a remove"),
        ("Remove document 2", "b remove"),
    ]
    for analysis_text, expected in cases:
        assert compare_step._extract_gpt_shortform({"gpt_analysis": analysis_text}) == expected
7 changes: 7 additions & 0 deletions wurzel/steps/dedupe_hash/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: 2025 Deutsche Telekom AG ([email protected])
#
# SPDX-License-Identifier: Apache-2.0


# from .settings import QdrantCompareSettings # as QdrantCompareSettings
# from .step import QdrantCompareStep # as QdrantCompareStep
73 changes: 73 additions & 0 deletions wurzel/steps/dedupe_hash/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# SPDX-FileCopyrightText: 2025 Deutsche Telekom AG ([email protected])
#
# SPDX-License-Identifier: Apache-2.0


from dotenv import load_dotenv
from pydantic import Field

from wurzel.step.settings import Settings  # assuming Settings is a Pydantic base class

# Load the .env file automatically
load_dotenv()


class QdrantCompareSettings(Settings):
    """Configuration settings for comparing two Qdrant collections.

    Holds the connection details for Qdrant, the Azure endpoint, the OpenAI
    key and model name, and the thresholds used when analyzing differences,
    redundancies, and contradictions between two Qdrant collections. The
    nested ``Config`` declares the environment variable prefix and the
    ``.env`` file, so values are intended to be supplied through the
    environment.

    Attributes:
        QDRANT_URL (str): Base URL for Qdrant. Defaults to "".
        QDRANT_API_KEY (str): API key for Qdrant access. Defaults to "".
        AZURE_ENDPOINT (str): Endpoint for Azure access. Defaults to "".
        FUZZY_THRESHOLD (int): Fuzzy match threshold for Qdrant. Defaults to 99.
        TLSH_MAX_DIFF (int): Maximum TLSH difference for deduplication. Defaults to 1.
        OPAI_API_KEY (str): OpenAI API key for deduplication. Defaults to "".
        GPT_MODEL (str): OpenAI model to use for deduplication. Defaults to "GPT4-CH".
        QDRANT_COLLECTION_PREFIX (str): Prefix for Qdrant collection names to
            extract versions. Defaults to "".

    """

    QDRANT_URL: str = Field(
        "",
        description="Base URL for Qdrant.",
    )
    QDRANT_API_KEY: str = Field(
        "",
        description="API key for Qdrant access.",
    )

    AZURE_ENDPOINT: str = Field(
        "",
        description="Endpoint for Azure access.",
    )

    FUZZY_THRESHOLD: int = Field(
        99,
        description="Fuzzy match threshold for Qdrant.",
    )
    TLSH_MAX_DIFF: int = Field(
        1,
        description="Maximum TLSH difference for deduplication.",
    )
    OPAI_API_KEY: str = Field(
        "",
        description="OpenAI API key for deduplication.",
    )
    GPT_MODEL: str = Field(
        "GPT4-CH",
        description="OpenAI model to use for deduplication.",
    )
    QDRANT_COLLECTION_PREFIX: str = Field(
        "",
        description="Prefix for Qdrant collection names to extract versions.",
    )

    class Config:
        """Environment loading options: variable prefix and .env file name."""

        env_prefix = "QDRANTCOMPARESTEP__"
        env_file = ".env"
Loading