Commit 553b847

Major update
1 parent 06e1491 commit 553b847

11 files changed: +1384, -68 lines

gen_nimare_dset.py

Lines changed: 15 additions & 8 deletions
@@ -3,6 +3,7 @@
 
 import pandas as pd
 from nimare.dataset import Dataset
+from nimare.extract import download_abstracts
 from nimare.io import DEFAULT_MAP_TYPE_CONVERSION
 from nimare.transforms import ImageTransformer
 
@@ -84,14 +85,20 @@ def main(project_dir):
     nv_text_df = pd.read_csv(op.join(data_dir, "pmid_text.csv"))
     dset_nv_fn = op.join(data_dir, "neurovault_all_dataset.pkl")
 
-    print(f"Creating full dataset {nv_collections_images_df.shape[0]}", flush=True)
-    dset_nv = convert_to_nimare_dataset(
-        nv_collections_images_df,
-        nv_text_df,
-        image_dir,
-    )
-    dset_nv = ImageTransformer("z").transform(dset_nv)
-    dset_nv = ImageTransformer("t").transform(dset_nv)
+    if not op.isfile(dset_nv_fn):
+        print(f"Creating full dataset {nv_collections_images_df.shape[0]}", flush=True)
+        dset_nv = convert_to_nimare_dataset(
+            nv_collections_images_df,
+            nv_text_df,
+            image_dir,
+        )
+        dset_nv = ImageTransformer("z").transform(dset_nv)
+        dset_nv = ImageTransformer("t").transform(dset_nv)
+    else:
+        dset_nv = Dataset.load(dset_nv_fn)
+
+    # Download abstracts
+    dset_nv = download_abstracts(dset_nv, "[email protected]")
     dset_nv.save(dset_nv_fn)
 
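Taken together, this change makes gen_nimare_dset.py idempotent: the expensive conversion and image transforms run only when the pickled dataset is missing, otherwise the pickle is loaded, and in either case NiMARE's download_abstracts fetches PubMed abstracts (it requires a contact email for the Entrez API) before saving. A minimal sketch of the same cache-or-build pattern, with a hypothetical builder callable and a placeholder email:

import os.path as op

from nimare.dataset import Dataset
from nimare.extract import download_abstracts


def build_or_load(dset_fn, builder, email="user@example.org"):
    """Build an expensive Dataset once, then reuse the pickled copy.

    `builder` is a hypothetical zero-argument callable that performs the
    costly conversion and image transforms.
    """
    if not op.isfile(dset_fn):
        dset = builder()
    else:
        dset = Dataset.load(dset_fn)

    # download_abstracts queries the PubMed/Entrez API and needs an email
    dset = download_abstracts(dset, email)
    dset.save(dset_fn)
    return dset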
gen_nimare_lda_dset.py

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
+import argparse
+import gzip
+import os.path as op
+import pickle
+
+from nimare.dataset import Dataset
+
+from lda import _annotate_dset, annotate_lda
+from utils import (
+    _add_texts,
+    _cogat_vocabulary,
+    _fetch_neuroquery_dset,
+    _generate_counts,
+)
+
+
+def _get_parser():
+    parser = argparse.ArgumentParser(description="Run LDA workflow")
+    parser.add_argument(
+        "--project_dir",
+        dest="project_dir",
+        required=True,
+        help="Path to project directory",
+    )
+    parser.add_argument(
+        "--n_cores",
+        dest="n_cores",
+        default=4,
+        required=False,
+        help="CPUs",
+    )
+    return parser
+
+
+def main(project_dir, n_cores):
+    project_dir = op.abspath(project_dir)
+    n_cores = int(n_cores)
+
+    data_dir = op.join(project_dir, "data")
+    nq_dir = op.join(data_dir, "neuroquery")
+    cogat_dir = op.join(data_dir, "cogat")
+
+    # At least dset_fn must exist. It is generated by gen_nimare_dset.py
+    dset_fn = op.join(data_dir, "neurovault_all_dataset.pkl")
+    dset_lda_fn = op.join(data_dir, "neurovault_all_lda_dataset.pkl")
+
+    nq_lda_fn = op.join(nq_dir, "neuroquery_lda_model.pkl.gz")
+    nq_lda_dset_fn = op.join(nq_dir, "neuroquery_lda_dataset.pkl.gz")
+    nq_dset_text_fn = op.join(nq_dir, "neuroquery_with-texts_dataset.pkl.gz")
+
+    if not op.isfile(nq_lda_fn):
+        # Load NeuroQuery dataset with texts
+        if not op.isfile(nq_dset_text_fn):
+            nq_dset = _fetch_neuroquery_dset()
+
+            # Add texts to NeuroQuery dataset
+            nq_corpus_fn = op.join(nq_dir, "neuroquery_corpus_small.csv")
+
+            nq_dset = _add_texts(nq_dset, nq_corpus_fn)
+            nq_dset.save(nq_dset_text_fn)
+        else:
+            nq_dset = Dataset.load(nq_dset_text_fn)
+
+        # Get vocabulary from Cognitive Atlas concepts
+        vocabulary = _cogat_vocabulary(cogat_dir)
+
+        # Generate counts for the NeuroQuery dataset using the vocabulary from CogAt concepts
+        nq_counts_df = _generate_counts(
+            nq_dset.texts,
+            vocabulary=vocabulary,
+            text_column="body",
+            tfidf=False,
+            max_df=len(nq_dset.ids) - 2,
+            min_df=2,
+        )
+
+        nq_lda_dset, model = annotate_lda(
+            nq_dset,
+            nq_counts_df,
+            n_topics=100,
+            max_iter=1000,
+            n_cores=n_cores,
+        )
+        # model.save(nq_lda_fn)
+        with gzip.GzipFile(nq_lda_fn, "wb") as file_object:
+            pickle.dump(model, file_object)
+
+        nq_lda_dset.save(nq_lda_dset_fn)
+    else:
+        model_file = gzip.open(nq_lda_fn, "rb")
+        model = pickle.load(model_file)
+
+        vocabulary = model.distributions_["p_topic_g_word_df"].columns.values
+
+    # LDA model on NeuroVault dataset
+    # Load NeuroVault dataset with PubMed IDs, images, and texts
+    dset = Dataset.load(dset_fn)
+
+    # Generate counts for the NeuroVault dataset using the vocabulary from CogAt concepts
+    nv_counts_df = _generate_counts(
+        dset.texts,
+        vocabulary=vocabulary,
+        text_column="abstract_y",
+        tfidf=False,
+        max_df=len(dset.ids) - 2,
+        min_df=2,
+    )
+
+    # Transform NeuroVault dataset counts using the NQ LDA model
+    doc_topic_weights = model.model.transform(nv_counts_df.values)
+
+    # Annotate NeuroVault dataset with LDA model transformed weights
+    dset_lda = _annotate_dset(dset, model.model, nv_counts_df, doc_topic_weights)
+    dset_lda.save(dset_lda_fn)
+
+
+def _main(argv=None):
+    option = _get_parser().parse_args(argv)
+    kwargs = vars(option)
+    main(**kwargs)
+
+
+if __name__ == "__main__":
+    _main()
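The new script is meant to be run from the command line; a typical invocation (the path is illustrative) would be:

python gen_nimare_lda_dset.py --project_dir /path/to/project --n_cores 8

The crux of the second half is vocabulary alignment: the NeuroVault texts are counted over the same vocabulary the NeuroQuery LDA model was trained on, so the fitted model can project the new count matrix onto its topics. A toy sketch of that transform step with scikit-learn's LatentDirichletAllocation, which the wrapped model.model appears to be (the counts below are invented):

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

# Toy word counts: 4 training documents x 6 vocabulary terms
train_counts = np.array(
    [
        [2, 0, 1, 0, 0, 3],
        [0, 4, 0, 1, 0, 0],
        [1, 1, 0, 0, 2, 1],
        [0, 0, 3, 2, 0, 0],
    ]
)

lda = LatentDirichletAllocation(n_components=2, max_iter=50, random_state=0)
lda.fit(train_counts)  # fit on one corpus...

# ...then transform a new corpus counted over the SAME 6-term vocabulary
new_counts = np.array([[1, 0, 2, 0, 1, 1]])
doc_topic_weights = lda.transform(new_counts)
print(doc_topic_weights.sum(axis=1))  # each row is a topic distribution summing to 1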

get_nv_images.py

Lines changed: 3 additions & 3 deletions
@@ -156,13 +156,13 @@ def main(project_dir):
     statisticmap_filtered = statisticmap_merged.query(
         'modality == "fMRI-BOLD"'
         ' & analysis_level == "G"'
-        ' & is_thresholded == "f"'
+        " & number_of_subjects > 10"
         ' & (map_type == "Z" | map_type == "Other" | map_type == "T")'
+        ' & is_thresholded == "f"'
         " & brain_coverage > 40"
-        " & number_of_subjects > 10"
+        ' & not_mni == "f"'
         ' & cognitive_paradigm_cogatlas_id != "trm_4c8a834779883"'  # rest eyes open
         ' & cognitive_paradigm_cogatlas_id != "trm_54e69c642d89b"'  # rest eyes closed
-        ' & not_mni == "f"'
     )
 
     # Relabel the "Other" map type into "Z" or "T" based on the file name and description
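This hunk only reorders the filter clauses: because Python concatenates adjacent string literals at compile time, the whole argument to query() is a single boolean expression, and the order of &-joined clauses does not change which rows survive. A toy illustration of the same pandas.query pattern, with invented columns:

import pandas as pd

df = pd.DataFrame(
    {
        "modality": ["fMRI-BOLD", "fMRI-BOLD", "MEG"],
        "number_of_subjects": [25, 5, 40],
        "is_thresholded": ["f", "f", "t"],
    }
)

# Adjacent string literals join into one query expression
filtered = df.query(
    'modality == "fMRI-BOLD"'
    " & number_of_subjects > 10"
    ' & is_thresholded == "f"'
)
print(filtered)  # keeps only the first row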

ibma.py

Lines changed: 8 additions & 3 deletions
@@ -34,18 +34,23 @@ def calculate_means(estimates, n_maps, gamma=0.3, method="mean"):
 
     elif (method == "trimmed") or (method == "winsorized"):
         K_gamma = int(gamma * K / 2)
+        # print(K, K_gamma)
 
         # Sort the estimates along each voxel
         estimates_sorted = np.sort(estimates, axis=0)
 
         # Trimmed mean calculation
-        estimates_trimmed = estimates_sorted[K_gamma:-K_gamma, :]
+        if K_gamma == 0:
+            estimates_trimmed = estimates_sorted  # Use all values
+        else:
+            estimates_trimmed = estimates_sorted[K_gamma:-K_gamma, :]
         trimmed_mean = np.mean(estimates_trimmed, axis=0)
 
         # Winsorized mean calculation
         estimates_winsorized = estimates_sorted.copy()
-        estimates_winsorized[:K_gamma, :] = estimates_sorted[K_gamma, :]
-        estimates_winsorized[-K_gamma:, :] = estimates_sorted[-K_gamma - 1, :]
+        if K_gamma > 0:  # Only modify the array if K_gamma is greater than 0
+            estimates_winsorized[:K_gamma, :] = estimates_sorted[K_gamma, :]
+            estimates_winsorized[-K_gamma:, :] = estimates_sorted[-K_gamma - 1, :]
         winsorized_mean = np.mean(estimates_winsorized, axis=0)
 
         # Winsorized estimate of data variance
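The new guards fix an edge case with few maps: Python evaluates -0 as 0, so when K_gamma == 0 the old estimates_sorted[K_gamma:-K_gamma, :] is the empty slice [0:0, :] (its mean is NaN), and the old estimates_winsorized[-K_gamma:, :] = ... is really [0:, :] = ..., overwriting every row with the largest sorted value. A small demonstration (toy data; assuming, as the surrounding code suggests, that K is the number of input maps):

import numpy as np

K, V = 5, 3  # 5 maps, 3 voxels
gamma = 0.3
K_gamma = int(gamma * K / 2)  # int(0.75) == 0

rng = np.random.default_rng(0)
estimates_sorted = np.sort(rng.normal(size=(K, V)), axis=0)

# Old trimmed slice: [0:-0] == [0:0] -> empty array, np.mean would give NaN
print(estimates_sorted[K_gamma:-K_gamma, :].shape)  # (0, 3)

# Old winsorized assignment: [-0:] == [0:] -> clobbers ALL rows
bad = estimates_sorted.copy()
bad[-K_gamma:, :] = estimates_sorted[-K_gamma - 1, :]
print(np.allclose(bad, estimates_sorted[-1, :]))  # True: every row overwritten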
