4
4
import sys
5
5
from typing import List , Dict , Optional
6
6
from urllib import parse
7
-
7
+ import concurrent . futures
8
8
import singer
9
9
from pymongo import MongoClient
10
10
from singer import metadata , metrics , utils
13
13
from dz_mongodb .sync_strategies import common
14
14
from dz_mongodb .sync_strategies import full_table
15
15
from dz_mongodb .sync_strategies import incremental
16
+ from pymongo .database import Database
17
+
16
18
from dz_mongodb .config_utils import validate_config
17
19
from dz_mongodb .db_utils import get_databases , produce_collection_schema
18
20
from dz_mongodb .errors import InvalidReplicationMethodException , NoReadPrivilegeException
36
38
FULL_TABLE_METHOD = 'FULL_TABLE'
37
39
38
40
41
+ def process_collection (database : Database , collection_name : str ):
42
+ collection = database [collection_name ]
43
+ is_view = collection .options ().get ('viewOn' ) is not None
44
+
45
+ if is_view :
46
+ LOGGER .info ("Skipping view '%s' in database '%s'" , collection_name , database .name )
47
+ return None # Skip views
48
+
49
+ LOGGER .info ("Getting collection info for db '%s', collection '%s'" , database .name , collection_name )
50
+ schema = produce_collection_schema (collection ) # Produce the schema
51
+ return schema
52
+
53
+
39
54
def do_discover (client : MongoClient , config : Dict ):
40
55
"""
41
56
Run discovery mode where the mongodb cluster is scanned and
@@ -55,17 +70,34 @@ def do_discover(client: MongoClient, config: Dict):
55
70
56
71
collection_names = database .list_collection_names ()
57
72
58
- for collection_name in [c for c in collection_names if not c .startswith ("system." )]:
59
-
60
- collection = database [collection_name ]
61
- is_view = collection .options ().get ('viewOn' ) is not None
62
-
63
- # Add support for views if needed here
64
- if is_view :
65
- continue
66
-
67
- LOGGER .info ("Getting collection info for db '%s', collection '%s'" , database .name , collection_name )
68
- streams .append (produce_collection_schema (collection ))
73
+ with concurrent .futures .ThreadPoolExecutor () as executor :
74
+ # List of futures for each collection that is not a system collection
75
+ futures = {
76
+ executor .submit (process_collection , database , collection_name ): collection_name
77
+ for collection_name in collection_names
78
+ if not collection_name .startswith ("system." )
79
+ }
80
+
81
+ for future in concurrent .futures .as_completed (futures ):
82
+ collection_name = futures [future ]
83
+ try :
84
+ result = future .result ()
85
+ if result :
86
+ streams .append (result ) # Store produced schema
87
+ except Exception as exc :
88
+ LOGGER .error ("Error processing collection '%s': %s" , collection_name , exc )
89
+
90
+ # for collection_name in [c for c in collection_names if not c.startswith("system.")]:
91
+
92
+ # collection = database[collection_name]
93
+ # is_view = collection.options().get('viewOn') is not None
94
+
95
+ # # Add support for views if needed here
96
+ # if is_view:
97
+ # continue
98
+
99
+ # LOGGER.info("Getting collection info for db '%s', collection '%s'", database.name, collection_name)
100
+ # streams.append(produce_collection_schema(collection))
69
101
70
102
json .dump ({'streams' : streams }, sys .stdout , indent = 2 )
71
103
0 commit comments