@@ -215,6 +215,53 @@ def _get_col_pubget(collections_df, data_df, pubget_nv_df, pubget_metadata_df):
215
215
return pubget_nv_df
216
216
217
217
218
def _get_col_neurosynth(
    neurosynth_nv_df,
    neurosynth_metadata_df,
    collections_with_pmid,
    collections_df,
):
    """Identify NeuroVault collections found only by the Neurosynth search.

    Parameters
    ----------
    neurosynth_nv_df : pandas.DataFrame
        Collections found via the Neurosynth text search. Must contain
        "pmid" and "collection_id" columns; "collection_id" may hold
        private tokens that need conversion to public IDs.
    neurosynth_metadata_df : pandas.DataFrame
        Neurosynth metadata with at least "pmid" and "doi" columns.
    collections_with_pmid : pandas.DataFrame
        Collections already matched to a PMID in earlier steps; any
        collection present here is dropped from the result.
    collections_df : pandas.DataFrame
        Full NeuroVault collections table with "id" and "name" columns.

    Returns
    -------
    pandas.DataFrame
        One row per newly found collection, with columns: "pmid",
        "secondary_doi", "collection_id", "collection_name", "pmcid",
        and "source" (always "neurosynth").
    """
    # Convert private_token to collection_id
    collection_ids = neurosynth_nv_df["collection_id"].to_list()
    neurosynth_nv_df["collection_id"] = [
        _convert_collection_id(id_, collections_df) for id_ in collection_ids
    ]

    # Attach DOIs from the Neurosynth metadata. (Only DOIs come from the
    # metadata; PMCIDs are looked up from the PMID further below.)
    neurosynth_nv_df = pd.merge(
        neurosynth_nv_df, neurosynth_metadata_df[["pmid", "doi"]], on="pmid"
    )
    neurosynth_nv_df = neurosynth_nv_df.reindex(columns=["pmid", "doi", "collection_id"])
    neurosynth_nv_df = neurosynth_nv_df.rename(columns={"doi": "secondary_doi"})
    neurosynth_nv_df["pmid"] = neurosynth_nv_df["pmid"].astype("Int64")
    neurosynth_nv_df = neurosynth_nv_df.dropna(
        subset=["collection_id"]
    )  # Some private collections couldn't be mapped to public ones

    # Keep only collections not already found by the previous steps.
    # A plain isin() against the known IDs is equivalent to the original
    # intersect1d-then-isin detour: intersecting first can only remove IDs
    # that never occur in this frame anyway.
    nv_coll = collections_with_pmid["collection_id"].to_list()
    neurosynth_mask = ~neurosynth_nv_df["collection_id"].isin(nv_coll)
    neurosynth_nv_df = neurosynth_nv_df[neurosynth_mask]

    # Select unique collections, keeping the lowest PMID per collection
    neurosynth_nv_df = neurosynth_nv_df.sort_values("pmid")
    neurosynth_nv_df = neurosynth_nv_df.drop_duplicates("collection_id", keep="first")

    # Get collection names
    neurosynth_nv_df = pd.merge(
        neurosynth_nv_df, collections_df[["id", "name"]], left_on="collection_id", right_on="id"
    )
    neurosynth_nv_df = neurosynth_nv_df.rename(columns={"name": "collection_name"})
    neurosynth_nv_df = neurosynth_nv_df.drop(columns="id")
    neurosynth_nv_df["pmcid"] = neurosynth_nv_df.pmid.apply(get_pmcid_from_pmid)
    neurosynth_nv_df["source"] = "neurosynth"

    return neurosynth_nv_df
263
+
264
+
218
265
def main (project_dir , neurovault_version , pg_query_id ):
219
266
data_dir = op .join (project_dir , "data" )
220
267
nv_data_dir = op .join (data_dir , "neurovault" , neurovault_version )
@@ -235,6 +282,12 @@ def main(project_dir, neurovault_version, pg_query_id):
235
282
pubget_nv_df = pd .read_csv (pubget_nv_fn )
236
283
pubget_metadata_df = pd .read_csv (pubget_metadata_fn )
237
284
285
+ # Load Neurosynth data
286
+ neurosynth_nv_fn = op .join (data_dir , "neurosynth" , "neurovault_collections.csv" )
287
+ neurosynth_metadata_fn = op .join (data_dir , "neurosynth" , "metadata.csv" )
288
+ neurosynth_nv_df = pd .read_csv (neurosynth_nv_fn )
289
+ neurosynth_metadata_df = pd .read_csv (neurosynth_metadata_fn )
290
+
238
291
# 0. Remove Neuroscout collections
239
292
collections_df = collections_df [collections_df .owner_id != NEUROSCOUT_OWNER_ID ]
240
293
print (f"Found { collections_df .shape [0 ]} collections after removing Neuroscout collections" )
@@ -286,6 +339,21 @@ def main(project_dir, neurovault_version, pg_query_id):
286
339
[collections_with_pmid , pubget_nv_df ], ignore_index = True , sort = False
287
340
)
288
341
342
+ # 5. Find NeuroVault collections using pubget search on Neurosynth text
343
+ # =====================================================================
344
+ neurosynth_nv_df = _get_col_neurosynth (
345
+ neurosynth_nv_df ,
346
+ neurosynth_metadata_df ,
347
+ collections_with_pmid ,
348
+ collections_df ,
349
+ )
350
+ print (f"Found { neurosynth_nv_df .shape [0 ]} new collections with using the Neurosynth search" )
351
+
352
+ # Concatenate the collections
353
+ collections_with_pmid = pd .concat (
354
+ [collections_with_pmid , neurosynth_nv_df ], ignore_index = True , sort = False
355
+ )
356
+
289
357
# Add missing collections
290
358
collections_missing = collections_df [
291
359
~ collections_df ["id" ].isin (collections_with_pmid ["collection_id" ])
0 commit comments