chore: run collection schema discovery concurrently

hash-data · hash-data · commit cbe722fccb16 · 2024-10-13T17:40:24.000+05:30
diff --git a/Makefile b/Makefile
@@ -33,4 +33,7 @@ test:
 	pytest tests -v
 
 test_cov:
-	pytest --cov=dz_mongodb tests -v --cov-fail-under=42
+	pytest --cov=dz_mongodb tests -v --cov-fail-under=42
+
+local_build:
+	python setup.py sdist
diff --git a/dz_mongodb/__init__.py b/dz_mongodb/__init__.py
@@ -4,7 +4,7 @@
 import sys
 from typing import List, Dict, Optional
 from urllib import parse
-
+import concurrent.futures
 import singer
 from pymongo import MongoClient
 from singer import metadata, metrics, utils
@@ -13,6 +13,8 @@
 from dz_mongodb.sync_strategies import common
 from dz_mongodb.sync_strategies import full_table
 from dz_mongodb.sync_strategies import incremental
+from pymongo.database import Database
+
 from dz_mongodb.config_utils import validate_config
 from dz_mongodb.db_utils import get_databases, produce_collection_schema
 from dz_mongodb.errors import InvalidReplicationMethodException, NoReadPrivilegeException
@@ -36,6 +38,19 @@
 FULL_TABLE_METHOD = 'FULL_TABLE'
 
 
+def process_collection(database: Database, collection_name: str):
+    """Return produce_collection_schema() output for one collection, or None if it is a view."""
+    target = database[collection_name]
+
+    # A collection whose options carry a 'viewOn' entry is treated as a view and skipped.
+    if target.options().get('viewOn') is not None:
+        LOGGER.info("Skipping view '%s' in database '%s'", collection_name, database.name)
+        return None
+
+    LOGGER.info("Getting collection info for db '%s', collection '%s'", database.name, collection_name)
+    return produce_collection_schema(target)
+
+
 def do_discover(client: MongoClient, config: Dict):
     """
     Run discovery mode where the mongodb cluster is scanned and
@@ -55,17 +70,34 @@ def do_discover(client: MongoClient, config: Dict):
 
     collection_names = database.list_collection_names()
 
-    for collection_name in [c for c in collection_names if not c.startswith("system.")]:
-
-        collection = database[collection_name]
-        is_view = collection.options().get('viewOn') is not None
-
-        # Add support for views if needed here
-        if is_view:
-            continue
-
-        LOGGER.info("Getting collection info for db '%s', collection '%s'", database.name, collection_name)
-        streams.append(produce_collection_schema(collection))
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # List of futures for each collection that is not a system collection
+        futures = {
+            executor.submit(process_collection, database, collection_name): collection_name
+            for collection_name in collection_names
+            if not collection_name.startswith("system.")
+        }
+        
+        for future in concurrent.futures.as_completed(futures):
+            collection_name = futures[future]
+            try:
+                result = future.result()
+                if result:
+                    streams.append(result)  # Store produced schema
+            except Exception as exc:
+                LOGGER.error("Error processing collection '%s': %s", collection_name, exc)
+
+    # for collection_name in [c for c in collection_names if not c.startswith("system.")]:
+
+    #     collection = database[collection_name]
+    #     is_view = collection.options().get('viewOn') is not None
+
+    #     # Add support for views if needed here
+    #     if is_view:
+    #         continue
+
+    #     LOGGER.info("Getting collection info for db '%s', collection '%s'", database.name, collection_name)
+    #     streams.append(produce_collection_schema(collection))
 
     json.dump({'streams': streams}, sys.stdout, indent=2)
 
diff --git a/dz_mongodb/sync_strategies/change_streams.py b/dz_mongodb/sync_strategies/change_streams.py
@@ -80,7 +80,7 @@ def check_resume_token_existance(client: MongoClient, resume_token_ts: datetime)
     """
     oplogRS = client["local"]["oplog.rs"]
     oplog_obj = oplogRS.find_one(sort = [("$natural", pymongo.ASCENDING)])
-    first_oplog_ts =  oplog_obj.get("ts")
+    first_oplog_ts =  oplog_obj.get("ts") if oplog_obj else None
     if not first_oplog_ts:
         raise Exception("unable to read first oplog for resume token verification")
     if resume_token_ts < first_oplog_ts.as_datetime():
@@ -105,6 +105,8 @@ def get_current_resume_token(client: MongoClient, database: Database) -> Timesta
             # no active transactions get current timestamp
             oplogRS = client["local"]["oplog.rs"]
             oplog_obj = oplogRS.find_one(sort = [("$natural", pymongo.DESCENDING)])
+            if not oplog_obj:
+                return None
             return oplog_obj.get("ts")
         
         raw_ts = result.get("startOpTime", {}).get("ts")
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
     long_desc = fh.read()
 
 setup(name='dz-mongodb',
-      version='1.4.5',
+      version='1.4.6',
       description='Singer.io tap for extracting data from MongoDB - Datazip compatible',
       long_description=long_desc,
       long_description_content_type='text/markdown',