telekom · mavaball · May 27, 2025 · May 28, 2025 · May 28, 2025 · May 30, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -45,6 +45,8 @@ dependencies= [
     "mdformat==0.7.17",
     "spacy==3.7.5",
     "tiktoken==0.7.0",
+    "openai==1.82.1",
+    "fuzzywuzzy==0.18.0",
     "joblib>=1.4.0",
     "lxml==5.2.*",
     "marshmallow<4.0.0",

diff --git a/tests/steps/dedupe_hash/__init__.py b/tests/steps/dedupe_hash/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2025 Deutsche Telekom AG ([email protected])
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/steps/dedupe_hash/test_qdrant_compare.py b/tests/steps/dedupe_hash/test_qdrant_compare.py
@@ -0,0 +1,74 @@
+import pandas as pd
+
+import sys
+import os
+sys.path.append(os.path.abspath("/Users/A1167082/Desktop/wurzel"))
+
+
+from wurzel.steps.dedupe_hash.settings import QdrantCompareSettings
+from wurzel.steps.dedupe_hash.step import QdrantCompareStep
+
+#/Users/A1167082/Desktop/wurzel/wurzel/steps/dedupe_hash
+
+
+def make_step():
+    settings = QdrantCompareSettings()
+    settings.QDRANT_URL = "http://localhost:6333"
+    settings.QDRANT_API_KEY = "dummy"
+    settings.OPAI_API_KEY = "dummy"
+    settings.AZURE_ENDPOINT = "https://dummy-endpoint"
+    settings.GPT_MODEL = "dummy"
+    settings.QDRANT_COLLECTION_PREFIX = "test_v"
+    settings.FUZZY_THRESHOLD = 85
+    settings.TLSH_MAX_DIFF = 10
+    step = QdrantCompareStep()
+    step.settings = settings
+    return step
+
+
+def test_identical_tlsh_analysis():
+    step = make_step()
+    df1 = pd.DataFrame([{"tlsh": "A" * 70}, {"tlsh": "B" * 70}])
+    df2 = pd.DataFrame([{"tlsh": "A" * 70}, {"tlsh": "C" * 70}])
+    identical, count = step._identical_tlsh_analysis(df1, df2, "tlsh")
+    assert count == 1
+    assert "A" * 70 in identical
+
+
+def test_fuzzy_tlsh_matches():
+    step = make_step()
+    df = pd.DataFrame([{"tlsh": "A" * 70}, {"tlsh": "A" * 70}, {"tlsh": "B" * 70}])
+    matches = step._fuzzy_tlsh_matches(df, "tlsh", 100)
+    assert any(isinstance(m, tuple) and len(m) == 3 for m in matches)
+
+
+def test_diff_snippet():
+    step = make_step()
+    diff = step._diff_snippet("Hallo Welt", "Hallo Erde")
+    assert "Hallo" in diff
+
+
+def test_suspicious_cases_analysis():
+    step = make_step()
+    df = pd.DataFrame([{"text": "Hallo Welt", "tlsh": "A" * 70}, {"text": "Hallo Erde", "tlsh": "B" * 70}])
+    matches = [(0, 1, 5)]
+    suspicious = step._suspicious_cases_analysis(df, matches, "text")
+    assert isinstance(suspicious, list)
+    assert suspicious[0]["fuzz_ratio"] < 100
+
+
+def test_analyze_extra_docs_detail():
+    step = make_step()
+    df_base = pd.DataFrame([{"text": "Hallo Welt"}])
+    df_extra = pd.DataFrame([{"text": "Hallo Mars"}])
+    result = step._analyze_extra_docs_detail(df_base, df_extra, "text", 80)
+    assert isinstance(result, list)
+    assert "is_truly_new" in result[0]
+
+
+def test_extract_gpt_shortform():
+    step = make_step()
+    assert step._extract_gpt_shortform({"gpt_analysis": "Contradiction found."}) == "contradiction"
+    assert step._extract_gpt_shortform({"gpt_analysis": "Keep both"}) == "both"
+    assert step._extract_gpt_shortform({"gpt_analysis": "Remove document 1"}) == "a remove"
+    assert step._extract_gpt_shortform({"gpt_analysis": "Remove document 2"}) == "b remove"
diff --git a/wurzel/steps/dedupe_hash/__init__.py b/wurzel/steps/dedupe_hash/__init__.py
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: 2025 Deutsche Telekom AG ([email protected])
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+# from .settings import QdrantCompareSettings  # as QdrantCompareSettings
+# from .step import QdrantCompareStep  # as QdrantCompareStep
diff --git a/wurzel/steps/dedupe_hash/settings.py b/wurzel/steps/dedupe_hash/settings.py
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: 2025 Deutsche Telekom AG ([email protected])
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+from dotenv import load_dotenv
+from pydantic import Field
+
+from wurzel.step.settings import Settings  # in case Settings is a Pydantic base class
+
+# Load the .env file automatically (runs once, when this module is imported)
+load_dotenv()
+
+
+class QdrantCompareSettings(Settings):
+    """Configuration settings for comparing two Qdrant collections.
+
+    This class defines all environment-configurable parameters required for
+    analyzing differences, redundancies, and contradictions between two Qdrant
+    collections. It supports integration with Azure and OpenAI for advanced
+    deduplication and fuzzy matching. All settings can be loaded from environment
+    variables or a .env file, making it suitable for flexible deployment and
+    secure configuration management.
+
+    Attributes:
+        QDRANT_URL (str): Base URL for Qdrant.
+        QDRANT_API_KEY (str): API key for Qdrant access.
+        AZURE_ENDPOINT (str): Endpoint for Azure access.
+        FUZZY_THRESHOLD (int): Fuzzy match threshold for Qdrant.
+        TLSH_MAX_DIFF (int): Maximum TLSH difference for deduplication.
+        OPAI_API_KEY (str): OpenAI API key for deduplication.
+        GPT_MODEL (str): OpenAI model to use for deduplication.
+        QDRANT_COLLECTION_PREFIX (str): Prefix for Qdrant collection names to extract versions.
+
+    """
+
+    QDRANT_URL: str = Field(
+        "",
+        description="Base URL for Qdrant.",
+    )
+    QDRANT_API_KEY: str = Field(
+        "",
+        description="API key for Qdrant access.",
+    )
+
+    AZURE_ENDPOINT: str = Field("", description="ENDPOINT for AZURE acces.")
+
+    FUZZY_THRESHOLD: int = Field(
+        99,
+        description="Fuzzy match threshold for Qdrant.",
+    )
+    TLSH_MAX_DIFF: int = Field(
+        1,
+        description="Maximum TLSH difference for deduplication.",
+    )
+    OPAI_API_KEY: str = Field(
+        "",
+        description="OpenAI API key for deduplication.",
+    )
+    GPT_MODEL: str = Field(
+        "GPT4-CH",
+        description="OpenAI model to use for deduplication.",
+    )
+    QDRANT_COLLECTION_PREFIX: str = Field(
+        "",
+        description="Prefix for Qdrant collection names to extract versions.",
+    )
+
+    class Config:
+        """Compares two Qdrant collections and analyzes differences, redundancies, and contradictions."""
+
+        env_prefix = "QDRANTCOMPARESTEP__"
+        env_file = ".env"