Skip to content

Commit 06e1491

Browse files
committed
Update collection to publication matching
1 parent 9221b21 commit 06e1491

File tree

1 file changed

+68
-0
lines changed

1 file changed

+68
-0
lines changed

get_nv_collections.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,53 @@ def _get_col_pubget(collections_df, data_df, pubget_nv_df, pubget_metadata_df):
215215
return pubget_nv_df
216216

217217

218+
def _get_col_neurosynth(
    neurosynth_nv_df,
    neurosynth_metadata_df,
    collections_with_pmid,
    collections_df,
):
    """Map Neurosynth-derived NeuroVault collections to publications.

    Parameters
    ----------
    neurosynth_nv_df : pandas.DataFrame
        Collections found via the Neurosynth search, with a ``collection_id``
        column that may contain private tokens, and a ``pmid`` column.
    neurosynth_metadata_df : pandas.DataFrame
        Neurosynth metadata providing ``pmid`` and ``doi`` columns.
    collections_with_pmid : pandas.DataFrame
        Collections already matched to a publication by earlier steps;
        used to exclude collections that were already found.
    collections_df : pandas.DataFrame
        Full NeuroVault collections table with ``id`` and ``name`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per newly matched collection with columns ``pmid``,
        ``secondary_doi``, ``collection_id``, ``collection_name``, ``pmcid``,
        and ``source`` (set to ``"neurosynth"``).
    """
    # Work on a copy: the original code assigned into the caller's DataFrame
    # before the first merge, mutating the input as a hidden side effect.
    neurosynth_nv_df = neurosynth_nv_df.copy()

    # Convert private_token to collection_id
    collection_ids = neurosynth_nv_df["collection_id"].to_list()
    neurosynth_nv_df["collection_id"] = [
        _convert_collection_id(id_, collections_df) for id_ in collection_ids
    ]

    # Get PMIDs and DOIs from metadata (inner merge drops rows with no
    # matching pmid in the metadata table)
    neurosynth_nv_df = pd.merge(
        neurosynth_nv_df, neurosynth_metadata_df[["pmid", "doi"]], on="pmid"
    )
    neurosynth_nv_df = neurosynth_nv_df.reindex(columns=["pmid", "doi", "collection_id"])
    neurosynth_nv_df = neurosynth_nv_df.rename(columns={"doi": "secondary_doi"})
    # Nullable integer dtype so missing PMIDs stay NA instead of becoming floats
    neurosynth_nv_df["pmid"] = neurosynth_nv_df["pmid"].astype("Int64")
    neurosynth_nv_df = neurosynth_nv_df.dropna(
        subset=["collection_id"]
    )  # Some private collections couldn't be mapped to public ones

    # Drop collections already found by earlier matching steps
    nv_coll = collections_with_pmid["collection_id"].to_list()
    neurosynth_nv_coll = neurosynth_nv_df["collection_id"].to_list()
    matching_ids = np.intersect1d(nv_coll, neurosynth_nv_coll)

    neurosynth_mask = ~neurosynth_nv_df["collection_id"].isin(matching_ids)
    neurosynth_nv_df = neurosynth_nv_df[neurosynth_mask]

    # Select unique collections, keeping the lowest pmid for each
    neurosynth_nv_df = neurosynth_nv_df.sort_values("pmid")
    neurosynth_nv_df = neurosynth_nv_df.drop_duplicates("collection_id", keep="first")

    # Attach collection names from the full collections table
    neurosynth_nv_df = pd.merge(
        neurosynth_nv_df, collections_df[["id", "name"]], left_on="collection_id", right_on="id"
    )
    neurosynth_nv_df = neurosynth_nv_df.rename(columns={"name": "collection_name"})
    neurosynth_nv_df = neurosynth_nv_df.drop(columns="id")
    # NOTE(review): per-row lookup — presumably a network/API call; could be
    # slow for large tables. Confirm whether get_pmcid_from_pmid caches.
    neurosynth_nv_df["pmcid"] = neurosynth_nv_df.pmid.apply(get_pmcid_from_pmid)
    neurosynth_nv_df["source"] = "neurosynth"

    return neurosynth_nv_df
263+
264+
218265
def main(project_dir, neurovault_version, pg_query_id):
219266
data_dir = op.join(project_dir, "data")
220267
nv_data_dir = op.join(data_dir, "neurovault", neurovault_version)
@@ -235,6 +282,12 @@ def main(project_dir, neurovault_version, pg_query_id):
235282
pubget_nv_df = pd.read_csv(pubget_nv_fn)
236283
pubget_metadata_df = pd.read_csv(pubget_metadata_fn)
237284

285+
# Load Neurosynth data
286+
neurosynth_nv_fn = op.join(data_dir, "neurosynth", "neurovault_collections.csv")
287+
neurosynth_metadata_fn = op.join(data_dir, "neurosynth", "metadata.csv")
288+
neurosynth_nv_df = pd.read_csv(neurosynth_nv_fn)
289+
neurosynth_metadata_df = pd.read_csv(neurosynth_metadata_fn)
290+
238291
# 0. Remove Neuroscout collections
239292
collections_df = collections_df[collections_df.owner_id != NEUROSCOUT_OWNER_ID]
240293
print(f"Found {collections_df.shape[0]} collections after removing Neuroscout collections")
@@ -286,6 +339,21 @@ def main(project_dir, neurovault_version, pg_query_id):
286339
[collections_with_pmid, pubget_nv_df], ignore_index=True, sort=False
287340
)
288341

342+
# 5. Find NeuroVault collections using pubget search on Neurosynth text
343+
# =====================================================================
344+
neurosynth_nv_df = _get_col_neurosynth(
345+
neurosynth_nv_df,
346+
neurosynth_metadata_df,
347+
collections_with_pmid,
348+
collections_df,
349+
)
350+
print(f"Found {neurosynth_nv_df.shape[0]} new collections with using the Neurosynth search")
351+
352+
# Concatenate the collections
353+
collections_with_pmid = pd.concat(
354+
[collections_with_pmid, neurosynth_nv_df], ignore_index=True, sort=False
355+
)
356+
289357
# Add missing collections
290358
collections_missing = collections_df[
291359
~collections_df["id"].isin(collections_with_pmid["collection_id"])

0 commit comments

Comments
 (0)