NBCLab
diff --git a/‎gen_nimare_dset.py
Lines changed: 15 additions & 8 deletions b/‎gen_nimare_dset.py
Lines changed: 15 additions & 8 deletions
diff --git a/‎gen_nimare_lda_dset.py
Lines changed: 124 additions & 0 deletions b/‎gen_nimare_lda_dset.py
Lines changed: 124 additions & 0 deletions
diff --git a/‎get_nv_images.py
Lines changed: 3 additions & 3 deletions b/‎get_nv_images.py
Lines changed: 3 additions & 3 deletions
diff --git a/‎ibma.py
Lines changed: 8 additions & 3 deletions b/‎ibma.py
Lines changed: 8 additions & 3 deletions
@@ -3,6 +3,7 @@
 
 import pandas as pd
 from nimare.dataset import Dataset
+from nimare.extract import download_abstracts
 from nimare.io import DEFAULT_MAP_TYPE_CONVERSION
 from nimare.transforms import ImageTransformer
 
@@ -84,14 +85,20 @@ def main(project_dir):
     nv_text_df = pd.read_csv(op.join(data_dir, "pmid_text.csv"))
     dset_nv_fn = op.join(data_dir, "neurovault_all_dataset.pkl")
 
-    print(f"Creating full dataset {nv_collections_images_df.shape[0]}", flush=True)
-    dset_nv = convert_to_nimare_dataset(
-        nv_collections_images_df,
-        nv_text_df,
-        image_dir,
-    )
-    dset_nv = ImageTransformer("z").transform(dset_nv)
-    dset_nv = ImageTransformer("t").transform(dset_nv)
+    if not op.isfile(dset_nv_fn):
+        print(f"Creating full dataset {nv_collections_images_df.shape[0]}", flush=True)
+        dset_nv = convert_to_nimare_dataset(
+            nv_collections_images_df,
+            nv_text_df,
+            image_dir,
+        )
+        dset_nv = ImageTransformer("z").transform(dset_nv)
+        dset_nv = ImageTransformer("t").transform(dset_nv)
+    else:
+        dset_nv = Dataset.load(dset_nv_fn)
+
+    # Download abstracts
+    dset_nv = download_abstracts(dset_nv, "[email protected]")
     dset_nv.save(dset_nv_fn)
 
 
 
@@ -0,0 +1,124 @@
+import argparse
+import gzip
+import os.path as op
+import pickle
+
+from nimare.dataset import Dataset
+
+from lda import _annotate_dset, annotate_lda
+from utils import (
+    _add_texts,
+    _cogat_vocabulary,
+    _fetch_neuroquery_dset,
+    _generate_counts,
+)
+
+
+def _get_parser():
+    """Build the command-line parser for the LDA dataset workflow.
+
+    Returns
+    -------
+    argparse.ArgumentParser
+        Parser exposing ``--project_dir`` (required) and ``--n_cores``
+        (optional, defaults to 4).
+    """
+    parser = argparse.ArgumentParser(description="Run LDA workflow")
+    parser.add_argument(
+        "--project_dir",
+        dest="project_dir",
+        required=True,
+        help="Path to project directory",
+    )
+    parser.add_argument(
+        "--n_cores",
+        dest="n_cores",
+        # Convert at the CLI boundary so a non-numeric value produces a usage
+        # error instead of a traceback later in the workflow.
+        type=int,
+        default=4,
+        required=False,
+        help="Number of CPU cores to use (default: 4)",
+    )
+    return parser
+
+
+def main(project_dir, n_cores):
+    """Annotate the NeuroVault dataset with topics from a NeuroQuery LDA model.
+
+    The LDA model is loaded from ``neuroquery_lda_model.pkl.gz`` when that file
+    exists; otherwise it is trained on the NeuroQuery dataset and saved there.
+    The model is then applied to word counts of the NeuroVault dataset, and the
+    annotated dataset is saved as ``neurovault_all_lda_dataset.pkl``.
+
+    Parameters
+    ----------
+    project_dir : str
+        Path to the project directory; inputs and outputs live in its ``data``
+        subdirectory.
+    n_cores : int or str
+        Number of cores forwarded to ``annotate_lda``; coerced with ``int()``.
+    """
+    project_dir = op.abspath(project_dir)
+    n_cores = int(n_cores)
+
+    data_dir = op.join(project_dir, "data")
+    nq_dir = op.join(data_dir, "neuroquery")
+    cogat_dir = op.join(data_dir, "cogat")
+
+    # At least dset_fn must exist. It is generated by gen_nimare_dset.py
+    dset_fn = op.join(data_dir, "neurovault_all_dataset.pkl")
+    dset_lda_fn = op.join(data_dir, "neurovault_all_lda_dataset.pkl")
+
+    nq_lda_fn = op.join(nq_dir, "neuroquery_lda_model.pkl.gz")
+    nq_lda_dset_fn = op.join(nq_dir, "neuroquery_lda_dataset.pkl.gz")
+    nq_dset_text_fn = op.join(nq_dir, "neuroquery_with-texts_dataset.pkl.gz")
+
+    if not op.isfile(nq_lda_fn):
+        # Load NeuroQuery dataset with texts
+        if not op.isfile(nq_dset_text_fn):
+            nq_dset = _fetch_neuroquery_dset()
+
+            # Add texts to NeuroQuery dataset
+            nq_corpus_fn = op.join(nq_dir, "neuroquery_corpus_small.csv")
+
+            nq_dset = _add_texts(nq_dset, nq_corpus_fn)
+            nq_dset.save(nq_dset_text_fn)
+        else:
+            nq_dset = Dataset.load(nq_dset_text_fn)
+
+        # Get vocabulary from cognitive atlas concepts
+        vocabulary = _cogat_vocabulary(cogat_dir)
+
+        # Generate counts for Neuroquery dataset using the vocabulary from cogat concepts
+        nq_counts_df = _generate_counts(
+            nq_dset.texts,
+            vocabulary=vocabulary,
+            text_column="body",
+            tfidf=False,
+            max_df=len(nq_dset.ids) - 2,
+            min_df=2,
+        )
+
+        nq_lda_dset, model = annotate_lda(
+            nq_dset,
+            nq_counts_df,
+            n_topics=100,
+            max_iter=1000,
+            n_cores=n_cores,
+        )
+        # Persist the fitted model as a gzip-compressed pickle
+        with gzip.GzipFile(nq_lda_fn, "wb") as file_object:
+            pickle.dump(model, file_object)
+
+        nq_lda_dset.save(nq_lda_dset_fn)
+    else:
+        # NOTE: pickle.load can execute arbitrary code; only load model files
+        # written by this workflow or obtained from a trusted source.
+        # The context manager closes the gzip handle once the model is read.
+        with gzip.open(nq_lda_fn, "rb") as model_file:
+            model = pickle.load(model_file)
+
+        # Reuse the vocabulary stored with the saved model
+        vocabulary = model.distributions_["p_topic_g_word_df"].columns.values
+
+    # LDA model on NeuroVault dataset
+    # Load NeuroVault dataset with Pubmed IDs, images and texts
+    dset = Dataset.load(dset_fn)
+
+    # Generate counts for NeuroVault dataset using the same vocabulary as the NQ LDA model
+    nv_counts_df = _generate_counts(
+        dset.texts,
+        vocabulary=vocabulary,
+        text_column="abstract_y",
+        tfidf=False,
+        max_df=len(dset.ids) - 2,
+        min_df=2,
+    )
+
+    # Transform NeuroVault dataset counts using NQ LDA model
+    doc_topic_weights = model.model.transform(nv_counts_df.values)
+
+    # Annotate NeuroVault dataset with LDA model transformed weights
+    dset_lda = _annotate_dset(dset, model.model, nv_counts_df, doc_topic_weights)
+    dset_lda.save(dset_lda_fn)
+
+
+def _main(argv=None):
+    """Parse command-line arguments and forward them to ``main``.
+
+    Parameters
+    ----------
+    argv : list of str, optional
+        Argument list handed to the parser's ``parse_args``; ``None`` is passed
+        through unchanged.
+    """
+    namespace = _get_parser().parse_args(argv)
+    main(**vars(namespace))
+
+
+# Run the workflow only when this file is executed as a script.
+if __name__ == "__main__":
+    _main()
@@ -156,13 +156,13 @@ def main(project_dir):
     statisticmap_filtered = statisticmap_merged.query(
         'modality == "fMRI-BOLD"'
         ' & analysis_level == "G"'
-        ' & is_thresholded == "f"'
+        " & number_of_subjects > 10"
         ' & (map_type == "Z" | map_type == "Other" | map_type == "T")'
+        ' & is_thresholded == "f"'
         " & brain_coverage > 40"
-        " & number_of_subjects > 10"
+        ' & not_mni == "f"'
         ' & cognitive_paradigm_cogatlas_id != "trm_4c8a834779883"'  # rest eyes open
         ' & cognitive_paradigm_cogatlas_id != "trm_54e69c642d89b"'  # rest eyes closed
-        ' & not_mni == "f"'
     )
 
     # Relabel the "Other" map type into "Z" or "T" based on the file name and description
 
@@ -34,18 +34,23 @@ def calculate_means(estimates, n_maps, gamma=0.3, method="mean"):
 
     elif (method == "trimmed") or (method == "winsorized"):
         K_gamma = int(gamma * K / 2)
+        # print(K, K_gamma)
 
         # Sort the estimates along each voxel
         estimates_sorted = np.sort(estimates, axis=0)
 
         # Trimmed mean calculation
-        estimates_trimmed = estimates_sorted[K_gamma:-K_gamma, :]
+        if K_gamma == 0:
+            estimates_trimmed = estimates_sorted  # Use all values
+        else:
+            estimates_trimmed = estimates_sorted[K_gamma:-K_gamma, :]
         trimmed_mean = np.mean(estimates_trimmed, axis=0)
 
         # Winsorized mean calculation
         estimates_winsorized = estimates_sorted.copy()
-        estimates_winsorized[:K_gamma, :] = estimates_sorted[K_gamma, :]
-        estimates_winsorized[-K_gamma:, :] = estimates_sorted[-K_gamma - 1, :]
+        if K_gamma > 0:  # Only modify the array if K_gamma is greater than 0
+            estimates_winsorized[:K_gamma, :] = estimates_sorted[K_gamma, :]
+            estimates_winsorized[-K_gamma:, :] = estimates_sorted[-K_gamma - 1, :]
         winsorized_mean = np.mean(estimates_winsorized, axis=0)
 
         # Winsorized estimate of data variance