SIMEXP · Sep 15, 2024
diff --git a/‎config/base.yaml
Lines changed: 2 additions & 2 deletions b/‎config/base.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/‎config/data/default.yaml
Lines changed: 4 additions & 1 deletion b/‎config/data/default.yaml
Lines changed: 4 additions & 1 deletion
diff --git a/‎config/extract.yaml
Lines changed: 3 additions & 2 deletions b/‎config/extract.yaml
Lines changed: 3 additions & 2 deletions
diff --git a/‎config/hydra/hyperparameters.yaml
Lines changed: 15 additions & 13 deletions b/‎config/hydra/hyperparameters.yaml
Lines changed: 15 additions & 13 deletions
diff --git a/‎config/hydra/scaling.yaml renamed to ‎config/hydra/make_data.yaml
Lines changed: 4 additions & 4 deletions b/‎config/hydra/scaling.yaml renamed to ‎config/hydra/make_data.yaml
Lines changed: 4 additions & 4 deletions
diff --git a/‎config/hydra/scaling_cpu.yaml
Lines changed: 45 additions & 0 deletions b/‎config/hydra/scaling_cpu.yaml
Lines changed: 45 additions & 0 deletions
diff --git a/‎config/hydra/scaling_gpu.yaml
Lines changed: 46 additions & 0 deletions b/‎config/hydra/scaling_gpu.yaml
Lines changed: 46 additions & 0 deletions
diff --git a/‎config/model/linearchebnet.yaml renamed to ‎config/model/basic_model.yaml
Lines changed: 10 additions & 6 deletions b/‎config/model/linearchebnet.yaml renamed to ‎config/model/basic_model.yaml
Lines changed: 10 additions & 6 deletions
diff --git a/‎config/model/chebnet_detailed.yaml
Lines changed: 15 additions & 0 deletions b/‎config/model/chebnet_detailed.yaml
Lines changed: 15 additions & 0 deletions
diff --git a/‎config/model/experiment.yaml
Lines changed: 11 additions & 10 deletions b/‎config/model/experiment.yaml
Lines changed: 11 additions & 10 deletions
diff --git a/‎config/predict.yaml
Lines changed: 1 addition & 3 deletions b/‎config/predict.yaml
Lines changed: 1 addition & 3 deletions
diff --git a/‎config/train.yaml
Lines changed: 4 additions & 3 deletions b/‎config/train.yaml
Lines changed: 4 additions & 3 deletions
diff --git a/‎env/requirements.txt
Lines changed: 1 addition & 0 deletions b/‎env/requirements.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎fmri_autoreg b/‎fmri_autoreg
diff --git a/‎pyproject.toml
Lines changed: 1 addition & 0 deletions b/‎pyproject.toml
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/create_holdout_sample.py
Lines changed: 147 additions & 4 deletions b/‎src/create_holdout_sample.py
Lines changed: 147 additions & 4 deletions
diff --git a/‎src/data/load_data.py
Lines changed: 1 addition & 3 deletions b/‎src/data/load_data.py
Lines changed: 1 addition & 3 deletions
diff --git a/‎src/extract.py
Lines changed: 20 additions & 17 deletions b/‎src/extract.py
Lines changed: 20 additions & 17 deletions
diff --git a/‎src/model/extract_features.py
Lines changed: 4 additions & 4 deletions b/‎src/model/extract_features.py
Lines changed: 4 additions & 4 deletions
diff --git a/‎src/predict.py
Lines changed: 98 additions & 51 deletions b/‎src/predict.py
Lines changed: 98 additions & 51 deletions
diff --git a/‎src/train.py
Lines changed: 84 additions & 107 deletions b/‎src/train.py
Lines changed: 84 additions & 107 deletions
diff --git a/‎src/utils/explore_hyperparameters.py
Lines changed: 50 additions & 0 deletions b/‎src/utils/explore_hyperparameters.py
Lines changed: 50 additions & 0 deletions
diff --git a/‎src/utils/plot_diagnosis.py
Lines changed: 190 additions & 0 deletions b/‎src/utils/plot_diagnosis.py
Lines changed: 190 additions & 0 deletions
diff --git a/‎src/utils/plot_orion.py
Lines changed: 5 additions & 5 deletions b/‎src/utils/plot_orion.py
Lines changed: 5 additions & 5 deletions
diff --git a/‎src/utils/plot_scaling.py
Lines changed: 0 additions & 210 deletions b/‎src/utils/plot_scaling.py
Lines changed: 0 additions & 210 deletions
diff --git a/‎src/utils/plot_scaling_architecture.py
Lines changed: 380 additions & 0 deletions b/‎src/utils/plot_scaling_architecture.py
Lines changed: 380 additions & 0 deletions
diff --git a/‎src/utils/plot_scaling_downstream.py
Lines changed: 119 additions & 69 deletions b/‎src/utils/plot_scaling_downstream.py
Lines changed: 119 additions & 69 deletions
diff --git a/‎src/utils/plot_scaling_downstream_fewshot.py
Lines changed: 132 additions & 0 deletions b/‎src/utils/plot_scaling_downstream_fewshot.py
Lines changed: 132 additions & 0 deletions
diff --git a/‎src/utils/plot_scaling_sample.py
Lines changed: 211 additions & 0 deletions b/‎src/utils/plot_scaling_sample.py
Lines changed: 211 additions & 0 deletions
diff --git a/‎tools/find_batch_size.py
Lines changed: 99 additions & 0 deletions b/‎tools/find_batch_size.py
Lines changed: 99 additions & 0 deletions
diff --git a/‎tools/number_of_workers.py
Lines changed: 55 additions & 0 deletions b/‎tools/number_of_workers.py
Lines changed: 55 additions & 0 deletions
@@ -1,8 +1,8 @@
 ---
 defaults:
   - _self_
-  - hydra: default
+  - hydra: make_data
 
 verbose: 2
-random_state: 42
+random_state: 1
 return_type: float
@@ -1,10 +1,13 @@
 ---
 standardize: false
 n_embed: 197
+time_stride: 1
+lag: 1
+seq_length: 16
 atlas_desc: atlas-MIST_desc-${data.n_embed}
 hold_out_set: 0.20
 validation_set: 0.25
-n_sample: -1
+proportion_sample: 1.0
 class_balance_confounds:
   - site
   - sex
 
@@ -3,7 +3,8 @@ defaults:
   - _self_
   - hydra: extract
 
-horizon: 1
-convlayer_index: -1
+random_state: 435
+horizon: 6
+convlayer_index: -99
 # passing model path is necessary for evaluation
 model_path: ???
@@ -21,14 +21,15 @@ sweeper:
   # default parametrization of the search space
   params:
     model:
-      nb_epochs: uniform(16, 24, discrete=True)
-      FK: choices(["\'128,32,128,32,128,32,128,32,128,32,128,32\'", "\'8,6,8,6,8,6,8,6,8,6,8,6\'", "\'8,3,8,3,8,3\'"])
-      M: choices(["\'32,16,8,1\'", "\'16,8,1\'"])
-      lr: uniform(1e-4, 0.3)
-      lr_thres: uniform(1e-6, 1)
-      dropout: uniform(1e-4, 0.3)
-      batch_size: loguniform(128, 256, discrete=True)
-      seq_length: uniform(12, 32, discrete=True)
+      lr: loguniform(1e-5, 1e-2)
+      weight_decay: uniform(1e-6, 1e-4)
+      lr_thres: loguniform(1e-5, 1e-3)
+      lr_patience: choices([4, 5, 6])
+      dropout: uniform(0, 0.5)
+      bn_momentum: uniform(0, 0.99)
+      GCL: choices([3,6,12])
+      F: choices([8,16,32,64])
+      K: choices([3,6,9])
 
   experiment:
     name: experiment
@@ -55,7 +56,7 @@ sweeper:
 
   worker:
     n_workers: -1
-    max_broken: 10
+    max_broken: 20
     max_trials: 100
 
   storage:
@@ -67,11 +68,11 @@ sweeper:
 
 launcher:
   submitit_folder: ${hydra.sweep.dir}/.submitit/%j
-  timeout_min: 720
-  cpus_per_task: 4
+  timeout_min: 180
+  cpus_per_task: 5
   gpus_per_node: 1
   tasks_per_node: 1
-  mem_gb: 4
+  mem_gb: 8
   nodes: 1
   name: ${hydra.job.name}
   stderr_to_stdout: false
@@ -91,4 +92,5 @@ launcher:
   max_num_timeout: 0
   additional_parameters: {mail-user: '${oc.env:SLACK_EMAIL_BOT}', mail-type: ALL}
   array_parallelism: 256
-  setup:
+  setup: [export HYDRA_FULL_ERROR=1, export NCCL_DEBUG=INFO, 'rsync -tv --info=progress2 /lustre03/project/6003287/hwang1/rs-autoregression-prediction/outputs/sample_for_pretraining/seed-${random_state}/./sample_seed-${random_state}_data-train.h5
+        $SLURM_TMPDIR/data_$SLURM_JOB_ID.h5']
@@ -8,7 +8,7 @@ run:
   dir: ${oc.env:SCRATCH}/autoreg/${hydra.job.name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
 sweep:
   dir: ${oc.env:SCRATCH}/autoreg/${hydra.job.name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
-  subdir: seed-${random_state}_n-${data.n_sample}
+  subdir: seed-${random_state}
 
 job_logging:
   handlers:
@@ -19,10 +19,10 @@ job_logging:
 launcher:
   submitit_folder: ${hydra.sweep.dir}/.submitit/%j
   timeout_min: 600
-  cpus_per_task: 4
-  gpus_per_node: 1
+  cpus_per_task: 1
+  gpus_per_node:
   tasks_per_node: 1
-  mem_gb: 4
+  mem_gb: 2
   nodes: 1
   name: ${hydra.job.name}
   stderr_to_stdout: false
 
@@ -0,0 +1,45 @@
+---
+defaults:
+  - _self_
+  - override launcher: submitit_slurm
+
+# output directory, generated dynamically on each run
+run:
+  dir: ${oc.env:SCRATCH}/autoreg/${hydra.job.name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
+sweep:
+  dir: ${oc.env:SCRATCH}/autoreg/${hydra.job.name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
+  subdir: ${hydra.job.override_dirname}
+
+job_logging:
+  handlers:
+    file:
+      # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242
+      filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+
+launcher:
+  submitit_folder: ${hydra.sweep.dir}/.submitit/%j
+  timeout_min: 60
+  cpus_per_task: 10
+  gpus_per_node:
+  tasks_per_node: 1
+  mem_gb: 16
+  nodes: 1
+  name: ${hydra.job.name}
+  stderr_to_stdout: false
+  _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher
+  partition:
+  qos:
+  comment:
+  constraint:
+  exclude:
+  gres:
+  cpus_per_gpu:
+  gpus_per_task:
+  mem_per_gpu:
+  mem_per_cpu:
+  account: ${oc.env:SLURM_COMPUTE_ACCOUNT}
+  signal_delay_s: 120
+  max_num_timeout: 0
+  additional_parameters: {mail-user: '${oc.env:SLACK_EMAIL_BOT}', mail-type: ALL}
+  array_parallelism: 256
+  setup: []
@@ -0,0 +1,46 @@
+---
+defaults:
+  - _self_
+  - override launcher: submitit_slurm
+
+# output directory, generated dynamically on each run
+run:
+  dir: ${oc.env:SCRATCH}/autoreg/${hydra.job.name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
+sweep:
+  dir: ${oc.env:SCRATCH}/autoreg/${hydra.job.name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
+  subdir: ${hydra.job.override_dirname}
+
+job_logging:
+  handlers:
+    file:
+      # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242
+      filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+
+launcher:
+  submitit_folder: ${hydra.sweep.dir}/.submitit/%j
+  timeout_min: 180
+  cpus_per_task: 5
+  gpus_per_node: 1
+  tasks_per_node: 1
+  mem_gb: 8
+  nodes: 1
+  name: ${hydra.job.name}
+  stderr_to_stdout: false
+  _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher
+  partition:
+  qos:
+  comment:
+  constraint:
+  exclude:
+  gres:
+  cpus_per_gpu:
+  gpus_per_task:
+  mem_per_gpu:
+  mem_per_cpu:
+  account: ${oc.env:SLURM_COMPUTE_ACCOUNT}
+  signal_delay_s: 120
+  max_num_timeout: 0
+  additional_parameters: {mail-user: '${oc.env:SLACK_EMAIL_BOT}', mail-type: ALL}
+  array_parallelism: 256
+  setup: [export HYDRA_FULL_ERROR=1, export NCCL_DEBUG=INFO, 'rsync -tv --info=progress2 /lustre03/project/6003287/hwang1/rs-autoregression-prediction/outputs/sample_for_pretraining/seed-${random_state}/./sample_seed-${random_state}_data-train.h5
+        $SLURM_TMPDIR/data_$SLURM_JOB_ID.h5']
@@ -2,12 +2,16 @@
 defaults:
   - _self_
   - experiment
-  
-model: LinearChebnet
+
+# these defaults are from FP's paper
+model: Chebnet
 FC_type: nonshared_uni
-FK: 8,3,8,3,8,3
-M: '1'
-use_bn: true
+GCL: 3
+F: 8
+K: 3
+FCL: 1
+M: 8
+aggrs: add
 dropout: 0
+use_bn: true
 bn_momentum: 0.1
-
 
@@ -0,0 +1,15 @@
+---
+defaults:
+  - _self_
+  - experiment
+
+model: Chebnet
+FC_type: nonshared_uni
+use_bn: true
+dropout: 0
+bn_momentum: 0.1
+layers:
+  - {F: 8, K: 3, aggr: add}
+  - {F: 8, K: 3, aggr: add}
+  - {F: 8, K: 3, aggr: add}
+  - {M: 1}
@@ -1,12 +1,13 @@
 ---
-nb_epochs: 20
-lr: 0.01
-lr_patience: 4
-lr_thres: 0.001
-weight_decay: 0
-batch_size: 100
-num_workers: 4
-time_stride: 1
-lag: 1
-seq_length: 16
+# these defaults are from FP's paper
+# https://doi.org/10.1162/imag_a_00228
+nb_epochs: 20  # best was 100 in the paper, use 20 as default for faster iteration for scaling
+batch_size: 512
 edge_index_thres: 0.9
+lr: 1e-2  # default 1e-2, Common ranges include 1e-3 to 1e-1.
+weight_decay: 0  # this has to be really low, like 0 - 0.0001 range
+lr_patience: 4  # default 4
+lr_thres: 1e-3  # default 1e-3
+early_stopping:
+  min_delta: 1e-03
+  tolerance: 3
@@ -1,12 +1,10 @@
 ---
 defaults:
   - _self_
-  - extract
+  - hydra: scaling_cpu
 
 predict_variable: sex
 predict_variable_type: binary
-phenotype_file: inputs/connectomes/sourcedata/ukbb/ukbb_pheno.tsv
 percentage_sample: 100
-random_state: 42
 # passing extracted feature path is necessary for evaluation
 feature_path: ???
@@ -1,11 +1,12 @@
 ---
 defaults:
   - _self_
-  - model: chebnet
+  - model: basic_model
   - data: ukbb
   - hydra: default
 
 verbose: 2
-random_state: 42
+random_state: 1
 return_type: float
-data_split: ???
+# checkpoints: 0,2,4,6,8,10,12,14,16,18,20
+num_workers: 4
@@ -11,3 +11,4 @@ seaborn==0.13.2
 hydra-core==1.3.2
 hpbandster==0.7.4
 configspace==0.7.1
+torchinfo==1.8.0
@@ -22,6 +22,7 @@ exclude = [
     "**/tests/*",
     "*build/",
     "code/fmri-autoreg",
+    "src/utils/plot_*.py"
 ]
 
 ignore = [
 
@@ -1,12 +1,23 @@
+"""
+python src/create_holdout_sample.py --multirun \
+    +data=ukbb ++random_state=1,2,3,5,7,10,42,435,764,9999
+This script creates all the inputs/labels for the full dataset.
+"""
 import json
 import logging
+import os
 from pathlib import Path
 
+import h5py
 import hydra
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
 import seaborn as sns
+from fmri_autoreg.data.load_data import get_edge_index, load_data, make_seq
 from omegaconf import DictConfig
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
 
 log = logging.getLogger(__name__)
 
@@ -17,7 +28,12 @@ def main(params: DictConfig) -> None:
 
     output_dir = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
     output_dir = Path(output_dir)
+    rng = np.random.default_rng(params["random_state"])
+    log.info(f"Current working directory : {os.getcwd()}")
+    log.info(f"Output directory  : {output_dir}")
+    log.info(f"Random seed {params['random_state']}")
 
+    # create hold out sample using the full dataset and save things
     sample = create_hold_out_sample(
         phenotype_path=params["data"]["phenotype_file"],
         phenotype_meta=params["data"]["phenotype_json"],
@@ -35,6 +51,9 @@ def main(params: DictConfig) -> None:
         json.dump(sample, f, indent=2)
 
     # plot the distribution of confounds of downstreams balanced samples
+    log.info("Holdout sample created")
+    report_dir = output_dir / "report"
+    report_dir.mkdir(exist_ok=True)
     demographics = {}
     for d in sample["test_downstreams"].keys():
         d_subjects = sample["test_downstreams"][d]
@@ -50,7 +69,7 @@ def main(params: DictConfig) -> None:
         )
         for ax, c in zip(axes, params["data"]["class_balance_confounds"]):
             sns.histplot(x=c, data=df, hue=d, kde=True, ax=ax)
-        fig.savefig(output_dir / f"{d}.png")
+        fig.savefig(report_dir / f"{d}.png")
         demographics[d] = {
             "patient": {
                 "condition": d,
@@ -81,14 +100,14 @@ def main(params: DictConfig) -> None:
                 "proportion_kept_sd": df[df[d] == 0]["proportion_kept"].std(),
             },
         }
-
+    # save the summary
     demographics_summary = pd.DataFrame()
     for d in demographics.keys():
         df = pd.DataFrame.from_dict(demographics[d], orient="index")
         df.set_index([df.index, "condition"], inplace=True)
         demographics_summary = pd.concat([demographics_summary, df])
     demographics_summary.round(decimals=2).to_csv(
-        output_dir / "demographics_summary.tsv", sep="\t"
+        report_dir / "demographics_summary.tsv", sep="\t"
     )
 
     for key in sample.keys():
@@ -104,7 +123,131 @@ def main(params: DictConfig) -> None:
         fig.suptitle(f"{key} sample (N={len(d_subjects)})")
         for ax, c in zip(axes, params["data"]["class_balance_confounds"]):
             sns.histplot(x=c, data=df, kde=True, ax=ax)
-        fig.savefig(output_dir / f"{key}.png")
+        fig.savefig(report_dir / f"{key}.png")
+
+    log.info("Sample report created")
+
+    full_train_sample = [f"sub-{s}" for s in sample["train"]]
+    test_participant_ids = [f"sub-{s}" for s in sample["hold_out"]]
+    rng.shuffle(full_train_sample)
+
+    # pre generate labels for training samples
+    prefix = f"sample_seed-{params['random_state']}"
+    data_h5 = Path(output_dir) / f"{prefix}_data-train.h5"
+    original_reference = Path(output_dir) / f"{prefix}_split.json"
+    data_reference = {}
+
+    # further split the training sample into training and validation
+
+    log.info(
+        f"Create dataset of {len(full_train_sample)} subjects "
+        "for pretrain model. "
+    )
+    train_participant_ids, val_participant_ids = train_test_split(
+        full_train_sample,
+        test_size=params["data"]["validation_set"],
+        shuffle=False,
+        random_state=params["random_state"],
+    )
+    data_ids = (
+        train_participant_ids,
+        val_participant_ids,
+        test_participant_ids,
+    )
+    # save reference to the h5 path in the original data file
+    data_reference = create_reference(params, data_ids)
+
+    # generate labels for the autoregressive model
+    with h5py.File(data_h5, "a") as f:
+        for n_embed in data_reference.keys():
+            base = f"n_embed-{n_embed}"
+            log.info(f"Creating dataset for n_embed-{n_embed}")
+            f.create_group(base)
+            for split in ["train", "val"]:
+                cur_group = f.create_group(f"{base}/{split}")
+
+                if split == "train":
+                    # use the training set (exclude validation set)
+                    # to create the connectome
+                    edges = get_edge_index(
+                        data_file=params["data"]["data_file"],
+                        dset_paths=data_reference[n_embed]["train"],
+                    )
+                    f[f"n_embed-{n_embed}"]["train"].create_dataset(
+                        "connectome", data=edges
+                    )
+
+                for dset in tqdm(
+                    data_reference[n_embed][split],
+                    desc=f"Creating {split} set",
+                ):
+                    data = load_data(
+                        path=params["data"]["data_file"],
+                        h5dset_path=dset,
+                        standardize=False,
+                        dtype="data",
+                    )
+                    x, y = make_seq(
+                        data,
+                        params["data"]["seq_length"],
+                        params["data"]["time_stride"],
+                        params["data"]["lag"],
+                    )
+                    if x.shape[0] == 0 or x is None:
+                        log.warning(
+                            f"Skipping {dset} as label couldn't be created."
+                        )
+                        continue
+                    if cur_group.get("input") is None:
+                        cur_group.create_dataset(
+                            name="input",
+                            data=x,
+                            dtype=np.float32,
+                            maxshape=(
+                                None,
+                                n_embed,
+                                params["data"]["seq_length"],
+                            ),
+                            chunks=(
+                                x.shape[0],
+                                n_embed,
+                                params["data"]["seq_length"],
+                            ),
+                        )
+                        cur_group.create_dataset(
+                            name="label",
+                            data=y,
+                            dtype=np.float32,
+                            maxshape=(None, n_embed),
+                            chunks=(y.shape[0], n_embed),
+                        )
+                    else:
+                        cur_group["input"].resize(
+                            (cur_group["input"].shape[0] + x.shape[0]), axis=0
+                        )
+                        cur_group["input"][-x.shape[0] :] = x
+
+                        cur_group["label"].resize(
+                            (cur_group["label"].shape[0] + y.shape[0]), axis=0
+                        )
+                        cur_group["label"][-y.shape[0] :] = y
+    with open(original_reference, "a") as f:
+        json.dump(data_reference, f, indent=2)
+
+
+def create_reference(params, data_ids):
+    """Collect dataset paths for every atlas resolution and data split.
+
+    Args:
+        params: configuration mapping; only ``params["data"]["segment"]``
+            is read here.
+        data_ids: participant-id collections, paired positionally with
+            the split names "train", "val" and "test".
+
+    Returns:
+        dict keyed by ``n_embed`` (64, 197, 444); each value maps a split
+        name to whatever ``load_ukbb_dset_path`` returns for that split.
+    """
+    from src.data.load_data import load_ukbb_dset_path
+
+    split_names = ("train", "val", "test")
+    return {
+        n_embed: {
+            split: load_ukbb_dset_path(
+                participant_id=ids,
+                atlas_desc=f"atlas-MIST_desc-{n_embed}",
+                segment=params["data"]["segment"],
+            )
+            for split, ids in zip(split_names, data_ids)
+        }
+        for n_embed in (64, 197, 444)
+    }
 
 
 if __name__ == "__main__":
 
@@ -316,7 +316,6 @@ def get_model_data(
     phenotype_file: Union[Path, str],
     measure: str = "connectome",
     label: str = "sex",
-    pooling_target: str = "max",
     log: logging = logging,
 ) -> Dict[str, np.ndarray]:
     """Get the data from pretrained model for the downstream task.
@@ -374,11 +373,10 @@ def get_model_data(
         if subject in participant_id:
             df_phenotype.loc[subject, "path"] = p
     selected_path = df_phenotype.loc[participant_id, "path"].values.tolist()
-    log.info(len(selected_path))
     data = load_data(data_file, selected_path, dtype="data")
 
     if "r2" in measure:
-        data = np.concatenate(data).squeeze()
+        data = np.array(data)[:, 0, :]
         if measure == "avgr2":
             data = data.mean(axis=1).reshape(-1, 1)
         data = StandardScaler().fit_transform(data)
 
@@ -17,11 +17,12 @@
 import hydra
 import torch
 from fmri_autoreg.models.predict_model import predict_horizon
-from fmri_autoreg.tools import load_model
+from fmri_autoreg.tools import chebnet_argument_resolver, load_model
 from omegaconf import DictConfig, OmegaConf
 from tqdm import tqdm
 
 log = logging.getLogger(__name__)
+LABEL_DIR = Path(__file__).parents[1] / "outputs" / "sample_for_pretraining"
 
 
 @hydra.main(version_base="1.3", config_path="../config", config_name="extract")
@@ -50,16 +51,18 @@ def main(params: DictConfig) -> None:
     log.info(f"predicting horizon: {horizons}")
 
     # load test set subject path from the training
-    with open(model_path.parent / "train_test_split.json", "r") as f:
+    with open(
+        LABEL_DIR
+        / f"seed-{params['random_state']}"
+        / f"sample_seed-{params['random_state']}_split.json",
+        "r",
+    ) as f:
         subj = json.load(f)
 
-    subj_list = subj["test"]
-
-    # save test data path to a text file for easy future reference
-    with open(output_dir / "test_set_connectome.txt", "w") as f:
-        for item in subj_list:
-            f.write("%s\n" % item)
-
+    subj_list = subj[str(params["data"]["n_embed"])]["test"]
+    model_params = chebnet_argument_resolver(
+        OmegaConf.to_container(params["model"])
+    )
     log.info("Load model")
     model = load_model(model_path)
     if isinstance(model, torch.nn.Module):
@@ -79,13 +82,12 @@ def main(params: DictConfig) -> None:
             # get the prediction of t+1
             r2, Z, Y = predict_horizon(
                 model=model,
-                seq_length=params["model"]["seq_length"],
+                seq_length=params["data"]["seq_length"],
                 horizon=horizon,
                 data_file=params["data"]["data_file"],
                 dset_path=h5_dset_path,
-                batch_size=params["model"]["batch_size"],
-                stride=params["model"]["time_stride"],
-                standardize=False,  # the ts is already standardized
+                batch_size=None,
+                stride=params["data"]["time_stride"],
             )
             # save the original output to a h5 file
             with h5py.File(output_horizon_path, "a") as f:
@@ -108,17 +110,17 @@ def main(params: DictConfig) -> None:
             data_file=params["data"]["data_file"],
             h5_dset_path=h5_dset_path,
             model=model,
-            seq_length=params["model"]["seq_length"],
-            time_stride=params["model"]["time_stride"],
-            lag=params["model"]["lag"],
+            seq_length=params["data"]["seq_length"],
+            time_stride=params["data"]["time_stride"],
+            lag=params["data"]["lag"],
         )
         # save the original output to a h5 file
         with h5py.File(output_conv_path, "a") as f:
             new_ds_path = h5_dset_path.replace("timeseries", "convlayers")
             f[new_ds_path] = convlayers.numpy()
         convlayers_F = [
             int(F)
-            for i, F in enumerate(params["model"]["FK"].split(","))
+            for i, F in enumerate(model_params["FK"].split(","))
             if i % 2 == 0
         ]
         # get the pooling features of the assigned layer
@@ -138,6 +140,7 @@ def main(params: DictConfig) -> None:
     # save the original output to a h5 file
     with h5py.File(output_conv_path, "a") as f:
         f.attrs["convolution_layers_F"] = convlayers_F
+    log.info("Extraction completed.")
 
 
 if __name__ == "__main__":
 
@@ -74,15 +74,15 @@ def pooling_convlayers(
     convlayers: torch.tensor,
     pooling_methods: str = "average",
     pooling_target: str = "parcel",
-    layer_index: int = -1,
+    layer_index: int = -99,
     layer_structure: Tuple[int] = None,
 ) -> np.array:
     """Pooling the conv layers.
 
     Args:
         convlayers (torch.tensor) : shape
             (time series, parcel, stack layer feature F)
-        layer_index (int) : the index of the layer to be pooled, -1
+        layer_index (int) : the index of the layer to be pooled, -99
             means pooling all layers.
         pooling_methods (str) : "average", "max", "std"
         pooling_target (str) : keep "parcel" or "timeseries" and parcels
@@ -96,14 +96,14 @@ def pooling_convlayers(
         raise ValueError(f"Pooling method {pooling_methods} is not supported.")
     if pooling_target not in ["parcel", "timeseries"]:
         raise ValueError(f"Pooling target {pooling_target} is not supported.")
-    if layer_index > len(layer_structure):
+    if layer_structure and layer_index > len(layer_structure):
         raise ValueError(
             "The layer index should be smaller than the length of the "
             f"layer structure. layer index is {layer_index} but there "
             f"are {len(layer_structure)} layers."
         )
 
-    if layer_index != -1:  # select the layer to be pooled
+    if layer_index != -99:  # select the layer to be pooled
         if sum(layer_structure) != convlayers.shape[-1]:
             raise ValueError(
                 "The sum of layer structure should be equal to the "
 
@@ -1,17 +1,9 @@
 """
 Execute at the root of the repo, not in the code directory.
 
-To execute the code,
-you need to create a directory structure as follows:
-```
-.
-└── <name of your analysis>/
-    ├── extract  -> symlink to the output of the `extract` script
-    └── model  -> symlink to the output of a fitted model
 ```
 python src/predict.py --multirun \
   feature_path=/path/to/<name of your analysis>/extract \
-  ++phenotype_file=/path/to/phenotype.tsv
 ```
 
 Currently the script is hard-coded to predict sex or age.
@@ -31,7 +23,11 @@
     Ridge,
     RidgeClassifier,
 )
-from sklearn.model_selection import ShuffleSplit, StratifiedKFold
+from sklearn.model_selection import (
+    ShuffleSplit,
+    StratifiedKFold,
+    StratifiedShuffleSplit,
+)
 from sklearn.neural_network import MLPClassifier, MLPRegressor
 from sklearn.svm import LinearSVC, LinearSVR
 
@@ -41,6 +37,16 @@
         "data_file_pattern": None,
         "plot_label": "Connectome",
     },
+    "avgr2": {
+        "data_file": None,
+        "data_file_pattern": "r2map",
+        "plot_label": "t+1\n average R2",
+    },
+    "r2map": {
+        "data_file": None,
+        "data_file_pattern": "r2map",
+        "plot_label": "t+1\nR2 map",
+    },
     "conv_avg": {
         "data_file": None,
         "data_file_pattern": "average",
@@ -61,16 +67,6 @@
         "data_file_pattern": "1dconv",
         "plot_label": "Conv layers \n 1D convolution",
     },
-    "avgr2": {
-        "data_file": None,
-        "data_file_pattern": "r2map",
-        "plot_label": "t+1\n average R2",
-    },
-    "r2map": {
-        "data_file": None,
-        "data_file_pattern": "r2map",
-        "plot_label": "t+1\nR2 map",
-    },
 }
 
 log = logging.getLogger(__name__)
@@ -85,40 +81,98 @@ def train(dataset, tng, tst, clf, clf_name):
     }
 
 
+LABEL_DIR = Path(__file__).parents[1] / "outputs" / "sample_for_pretraining"
+
+
 @hydra.main(version_base="1.3", config_path="../config", config_name="predict")
 def main(params: DictConfig) -> None:
-    from src.data.load_data import get_model_data, load_h5_data_path
+    from src.data.load_data import (
+        get_model_data,
+        load_h5_data_path,
+        load_ukbb_dset_path,
+    )
 
     # parse parameters
     output_dir = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
     output_dir = Path(output_dir)
     log.info(f"Output data {output_dir}")
     feature_path = Path(params["feature_path"])
-    phenotype_file = Path(params["phenotype_file"])
-    convlayers_path = feature_path / "feature_convlayers.h5"
-    feature_t1_file = feature_path / f"feature_horizon-{params['horizon']}.h5"
-    test_subjects = feature_path / "test_set_connectome.txt"
+    extract_config = OmegaConf.load(feature_path / ".hydra/config.yaml")
     model_config = OmegaConf.load(
-        feature_path.parent / "model/.hydra/config.yaml"
+        Path(extract_config["model_path"]).parent / ".hydra/config.yaml"
+    )
+
+    phenotype_file = Path(model_config["data"]["phenotype_file"])
+    convlayers_path = feature_path / "feature_convlayers.h5"
+    feature_t1_file = (
+        feature_path / f"feature_horizon-{extract_config['horizon']}.h5"
     )
     params = OmegaConf.merge(model_config, params)
     log.info(params)
-
-    # load test set subject path from the training
-    with open(test_subjects, "r") as f:
-        hold_outs = f.read().splitlines()
     percentage_sample = params["percentage_sample"]
-    if percentage_sample != 100:
-        proportion = percentage_sample / 100
-        sample_select = ShuffleSplit(
-            n_splits=1,
-            train_size=proportion,
-            random_state=params["random_state"],
-        )
-        sample_index, _ = next(sample_select.split(hold_outs))
-        subj = [hold_outs[i] for i in sample_index]
+
+    if params["predict_variable"] in ["age", "sex"]:
+        sample_file = list(
+            (LABEL_DIR / f"seed-{model_config['random_state']}").glob(
+                "sample*split.json"
+            )
+        )[0]
+
+        # load test set subject path from the training
+        with open(sample_file, "r") as f:
+            hold_outs = json.load(f)[f"{model_config['data']['n_embed']}"][
+                "test"
+            ]
+
+        if percentage_sample != 100:
+            proportion = percentage_sample / 100
+            sample_select = ShuffleSplit(
+                n_splits=1,
+                train_size=proportion,
+                random_state=params["random_state"],
+            )
+            sample_index, _ = next(sample_select.split(hold_outs))
+            subj = [hold_outs[i] for i in sample_index]
+        else:
+            subj = hold_outs.copy()
     else:
-        subj = hold_outs.copy()
+        sample_file = (
+            LABEL_DIR
+            / f"seed-{model_config['random_state']}"
+            / "downstream_sample.json"
+        )
+        with open(sample_file, "r") as f:
+            hold_outs = json.load(f)["test_downstreams"][
+                params["predict_variable"]
+            ]  # these are subject ids
+        diagnosis_data = (
+            pd.read_csv(phenotype_file, sep="\t")
+            .set_index("participant_id")
+            .loc[hold_outs, :]
+        )
+
+        percentage_sample = params["percentage_sample"]
+        if percentage_sample != 100:
+            proportion = percentage_sample / 100
+            sample_select = StratifiedShuffleSplit(
+                n_splits=1,
+                train_size=proportion,
+                random_state=params["random_state"],
+            )
+            sample_index, _ = next(
+                sample_select.split(
+                    diagnosis_data.index,
+                    diagnosis_data[params["predict_variable"]],
+                )
+            )
+            subj = [diagnosis_data.index[i] for i in sample_index]
+
+        else:
+            subj = hold_outs.copy()
+        subj = [f"sub-{s}" for s in subj]
+        subj = load_ukbb_dset_path(
+            subj, params["data"]["atlas_desc"], params["data"]["segment"]
+        )
 
     log.info(
         f"Downstream prediction on {len(subj)}, "
@@ -143,31 +197,29 @@ def main(params: DictConfig) -> None:
                 C=100,
                 penalty="l2",
                 class_weight="balanced",
-                max_iter=1000000,
+                max_iter=10000,
                 random_state=params["random_state"],
             ),
             "LogisticR": LogisticRegression(
                 penalty="l2",
                 class_weight="balanced",
-                max_iter=100000,
+                max_iter=1000,
                 random_state=params["random_state"],
                 n_jobs=-1,
             ),
             "Ridge": RidgeClassifier(
                 class_weight="balanced",
                 random_state=params["random_state"],
-                max_iter=100000,
+                max_iter=1000,
             ),
         }
     elif params["predict_variable_type"] == "numerical":  # need to fix this
         clf_options = {
             "SVM": LinearSVR(
-                C=100, max_iter=1000000, random_state=params["random_state"]
+                C=100, max_iter=10000, random_state=params["random_state"]
             ),
             "LinearR": LinearRegression(n_jobs=-1),
-            "Ridge": Ridge(
-                random_state=params["random_state"], max_iter=100000
-            ),
+            "Ridge": Ridge(random_state=params["random_state"], max_iter=1000),
         }
     else:
         raise ValueError(
@@ -180,11 +232,6 @@ def main(params: DictConfig) -> None:
         log.info(f"Load data {baseline_details[measure]['data_file']}")
         if measure == "connectome":
             dset_path = baseline_details[measure]["data_file_pattern"]
-        elif percentage_sample == 100:
-            dset_path = load_h5_data_path(
-                baseline_details[measure]["data_file"],
-                baseline_details[measure]["data_file_pattern"],
-            )
         else:
             dset_path = []
             for connectome_path in subj:
 
@@ -8,10 +8,9 @@
 model training
 ```
 python src/train.py --multirun hydra=scaling \
-  ++data.n_sample=100,200,300,-1
+  ++data.proportion_sample=1,0.5,0.25,0.1
 ```
 """
-import json
 import logging
 import os
 import pickle as pk
@@ -23,50 +22,41 @@
 import numpy as np
 import pandas as pd
 import torch
-from fmri_autoreg.data.load_data import make_input_labels
+from fmri_autoreg.data.load_data import get_edge_index_threshold
 from fmri_autoreg.models.train_model import train
-from omegaconf import DictConfig
+from fmri_autoreg.tools import chebnet_argument_resolver
+from omegaconf import DictConfig, OmegaConf
 from seaborn import lineplot
-from sklearn.model_selection import train_test_split
+from torchinfo import summary
 
-
-def convert_bytes(num):
-    for x in ["bytes", "KB", "MB", "GB", "TB"]:
-        if num < 1024.0:
-            return f"{num:.1f} {x}"
-        num /= 1024.0
-
-
-log = logging.getLogger(__name__)
+LABEL_DIR = Path(__file__).parents[1] / "outputs" / "sample_for_pretraining"
 
 
 @hydra.main(version_base="1.3", config_path="../config", config_name="train")
 def main(params: DictConfig) -> None:
     """Train model using parameters dict and save results."""
     # import local library here because sumbitit and hydra being weird
     # if not interactive session of slurm, import submit it
-    from src.data.load_data import load_ukbb_dset_path
-
-    rng = np.random.default_rng(params["random_state"])
-
     if (
         "SLURM_JOB_ID" in os.environ
         and os.environ["SLURM_JOB_NAME"] != "interactive"
     ):
-        # import submitit
-        # env = submitit.JobEnvironment()
         pid = os.getpid()
         # A logger for this file
         log = logging.getLogger(f"Process ID {pid}")
         log.info(f"Process ID {pid}")
-        # use SLURM_TMPDIR for data_dir
-        data_dir = Path(os.environ["SLURM_TMPDIR"]) / f"pid_{pid}"
-        data_dir.mkdir()
+        tng_data_h5 = (
+            Path(os.environ["SLURM_TMPDIR"])
+            / f"data_{os.environ['SLURM_JOB_ID']}.h5"
+        )
     else:
         log = logging.getLogger(__name__)
-        data_dir = Path(
-            hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
-        )
+        tng_data_h5 = list(
+            (LABEL_DIR / f"seed-{params['random_state']}").glob("*train.h5")
+        )[
+            0
+        ]  # will be shuffled after loading
+
     output_dir = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
     log.info(f"Current working directory : {os.getcwd()}")
     log.info(f"Output directory  : {output_dir}")
@@ -80,111 +70,98 @@ def main(params: DictConfig) -> None:
     # flatten the parameters
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     train_param = {**params["model"], **params["data"]}
+    train_param["num_workers"] = params["num_workers"]
     train_param["torch_device"] = device
     train_param["random_state"] = params["random_state"]
+    if "checkpoints" in params:
+        train_param["checkpoints"] = params["checkpoints"]
     log.info(f"Working on {device}.")
 
-    # load data path
-    n_sample = params["data"]["n_sample"]
-
-    data_split_json = params["data_split"]
-
-    with open(data_split_json, "r") as f:
-        train_subject = json.load(f)["train"]
-        test_subject = json.load(f)["holdout"]
-
-    rng.shuffle(train_subject)
-
-    if n_sample > 0:
-        train_subject = train_subject[:n_sample]
-
-    train_subject = [f"sub-{s}" for s in train_subject]
-    test_subject = [f"sub-{s}" for s in test_subject]
-
-    train_participant_ids, val_participant_ids = train_test_split(
-        train_subject,
-        test_size=params["data"]["validation_set"],
-        shuffle=True,
-        random_state=params["random_state"],
-    )
-
-    data_reference = {}
-    data_reference["train"] = load_ukbb_dset_path(
-        participant_id=train_participant_ids,
-        atlas_desc=params["data"]["atlas_desc"],
-        segment=params["data"]["segment"],
+    # get path data
+    try:
+        with h5py.File(tng_data_h5, "r") as h5file:
+            connectome = h5file[f"n_embed-{train_param['n_embed']}"]["train"][
+                "connectome"
+            ][:]
+    except OSError:
+        log.error(f"File {tng_data_h5} corrupted.")
+        return 1
+
+    # get edge index
+    edge_index = get_edge_index_threshold(
+        connectome, train_param["edge_index_thres"]
     )
-    data_reference["val"] = load_ukbb_dset_path(
-        participant_id=val_participant_ids,
-        atlas_desc=params["data"]["atlas_desc"],
-        segment=params["data"]["segment"],
-    )
-    data_reference["test"] = load_ukbb_dset_path(
-        participant_id=test_subject,
-        atlas_desc=params["data"]["atlas_desc"],
-        segment=params["data"]["segment"],
-    )
-    with open(Path(output_dir) / "train_test_split.json", "w") as f:
-        json.dump(data_reference, f, indent=2)
-    n_sample_pretrain = len(data_reference["train"]) + len(
-        data_reference["val"]
-    )
-    log.info(
-        f"Experiment on {n_sample_pretrain} subjects for pretrain model. "
-    )
-
-    tng_data_h5 = data_dir / "data_train.h5"
-    val_data_h5 = data_dir / "data_val.h5"
-    tng_data_h5, edge_index = make_input_labels(
-        data_file=params["data"]["data_file"],
-        dset_paths=data_reference["train"],
-        params=train_param,
-        output_file_path=tng_data_h5,
-        compute_edge_index=compute_edge_index,
-        log=log,
-    )
-    val_data_h5, _ = make_input_labels(
-        data_file=params["data"]["data_file"],
-        dset_paths=data_reference["val"],
-        params=train_param,
-        output_file_path=val_data_h5,
-        compute_edge_index=False,
-        log=log,
-    )
-    if params["verbose"] > 1:
-        log.info(
-            f"Training data: {convert_bytes(os.path.getsize(tng_data_h5))}"
-        )
-        log.info(
-            f"Validation data: {convert_bytes(os.path.getsize(val_data_h5))}"
-        )
-
-    train_data = (tng_data_h5, val_data_h5, edge_index)
+    log.info("Loaded connectome.")
+    train_data = (tng_data_h5, edge_index)
     del edge_index
 
     with h5py.File(tng_data_h5, "r") as h5file:
-        n_seq = h5file["input"].shape[0]
-    if n_seq < train_param["batch_size"]:
+        n_tng_inputs = h5file[f"n_embed-{train_param['n_embed']}"]["train"][
+            "input"
+        ].shape[0]
+        n_tng_inputs *= train_param["proportion_sample"]
+
+    if n_tng_inputs < train_param["batch_size"]:
         log.info(
             "Batch size is greater than the number of sequences. "
             "Setting batch size to number of sequences. "
-            f"New batch size: {n_seq}. "
+            f"New batch size: {n_tng_inputs}. "
             f"Old batch size: {train_param['batch_size']}."
         )
-        train_param["batch_size"] = n_seq
+        train_param["batch_size"] = n_tng_inputs
+    if compute_edge_index:  # chebnet
+        train_param = chebnet_argument_resolver(train_param)
+    # save train_param
+    with open(os.path.join(output_dir, "train_param.yaml"), "w") as f:
+        OmegaConf.save(config=train_param, f=f)
 
     log.info("Start training.")
     (
         model,
         mean_r2_tng,
         mean_r2_val,
         losses,
-        _,
+        checkpoints,
     ) = train(train_param, train_data, verbose=params["verbose"])
+
     # save training results
     np.save(os.path.join(output_dir, "mean_r2_tng.npy"), mean_r2_tng)
     np.save(os.path.join(output_dir, "mean_r2_val.npy"), mean_r2_val)
     np.save(os.path.join(output_dir, "training_losses.npy"), losses)
+    if "checkpoints" in params:
+        # save a list of dictionaries as pd dataframe
+        checkpoints = pd.DataFrame(checkpoints)
+        checkpoints.to_csv(
+            os.path.join(output_dir, "checkpoints.tsv"), sep="\t"
+        )
+    if params["verbose"] > 3:
+        # get model info
+        with open(os.path.join(output_dir, "model_info.txt"), "w") as f:
+            model_stats = summary(model)
+            summary_str = str(model_stats)
+            f.write(summary_str)
+
+        # get model info
+        with open(
+            os.path.join(output_dir, "model_info_with_input.txt"), "w"
+        ) as f:
+            model_stats = summary(
+                model,
+                input_size=(
+                    train_param["batch_size"],
+                    train_param["n_embed"],
+                    train_param["seq_length"],
+                ),
+                col_names=[
+                    "input_size",
+                    "output_size",
+                    "num_params",
+                    "kernel_size",
+                ],
+            )
+            summary_str = str(model_stats)
+            f.write(summary_str)
+
     log.info(f"Mean r2 tng: {mean_r2_tng}")
     log.info(f"Mean r2 val: {mean_r2_val}")
 
@@ -196,7 +173,7 @@ def main(params: DictConfig) -> None:
     training_losses = pd.DataFrame(losses)
     plt.figure()
     g = lineplot(data=training_losses)
-    g.set_title(f"Training Losses (N={n_sample})")
+    g.set_title(f"Training Losses (number of inputs={n_tng_inputs})")
     g.set_xlabel("Epoc")
     g.set_ylabel("Loss (MSE)")
     plt.savefig(Path(output_dir) / "training_losses.png")
 
@@ -0,0 +1,50 @@
+import itertools
+import re
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+# Lazily locate every train.log below the hyperparameter multirun directory;
+# the parent directory of each match is treated as one run.
+output_dirs = Path(
+    "outputs/autoreg/train/multiruns/nembed-197_hyperparameters"
+).glob("**/train.log")
+# One dict of parsed hyperparameters and results per run, filled by the loop
+# further down in this script.
+data = []
+
+
+def peek(iterable):
+    try:
+        first = next(iterable)
+    except StopIteration:
+        return None
+    return itertools.chain([first], iterable)
+
+
+for p in output_dirs:
+    experiment = {
+        f.groups()[0]: float(f.groups()[1])
+        for f in re.finditer(r"model\.([a-z_]*)=([\d\.e?-]*)", p.parent.name)
+    }
+    experiment["mean_r2_val"] = np.nan
+    experiment["runtime"] = np.nan
+    if (p.parent / "model.pkl").exists():
+        with open(p, "r") as log:
+            report = log.read()
+        mean_r2_val = re.search(r"Mean r2 val: ([\-\.\d]*)", report).groups()[
+            0
+        ]
+        starttime = re.search(r"\[([\d\-\s:,]*)\].*Process ID", report).group(
+            1
+        )
+        endtime = re.search(r"\[([\d\-\s:,]*)\].*model trained", report).group(
+            1
+        )
+        starttime = pd.to_datetime(starttime)
+        endtime = pd.to_datetime(endtime)
+        runtime = endtime - starttime
+        experiment["mean_r2_val"] = mean_r2_val
+        experiment["runtime"] = runtime.total_seconds() / 60
+    data.append(experiment)
+
+data = pd.DataFrame(data)
+data = data.sort_values("mean_r2_val", ascending=False)
+data.to_csv("_explore_hyperparameters.tsv", sep="\t", index=False)
@@ -0,0 +1,190 @@
+import re
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from matplotlib.lines import Line2D
+
+# Directory that is searched recursively for per-diagnosis result ``*.tsv``
+# files; its name is also reused in the figure title and output file name.
+DIAGNOSIS_PATH = "outputs/neuroips-workshop_2024/downstreams_last-layer/data/data.proportion_sample_1.0"  # noqa: E501
+
+# Feature key (as found in the "feature" column of the result files) ->
+# label shown in the legend; the key order also fixes the hue order.
+feature_fullname = {
+    "connectome": "Connectome\n(baseline)",
+    "avgr2": "t+1\naverage R2",
+    "r2map": "t+1\nR2 map",
+    "conv_avg": "Conv layers\navg pooling",
+    "conv_std": "Conv layers\nstd pooling",
+    "conv_max": "Conv layers\nmax pooling",
+    "conv_conv1d": "Conv layers\n1D convolution",
+}
+
+# Diagnosis code (last "_"-separated token of a result file name) -> display
+# name; the value order fixes the order of the categories on the x axis.
+diagnosis_fullname = {
+    "sex": "Sex",
+    "DEP": "Depressive\ndisorder",
+    "ALCO": "Alcohol Abuse",
+    "EPIL": "Epilepsy",
+    "MS": "Multiple\nsclerosis",
+    "PARK": "Parkinson",
+    "BIPOLAR": "Bipolar",
+    "ADD": "Alzheimer -\nDementia",
+    "SCZ": "Schizophrenia",
+}
+
+
+def main():
+    diagnosis_path = Path(DIAGNOSIS_PATH)
+    diagnosis_files = diagnosis_path.glob("**/*.tsv")
+    sns.set_theme(style="whitegrid")
+    sns.set_context("paper", font_scale=1.5)
+    fig, axs = plt.subplots(1, 3, figsize=(13, 6), sharey=True)
+    # fig, ax = plt.subplots(1, 1, figsize=(6, 6))
+    n_subjects_diagnosis = {}
+    for ax, classifier in zip(axs, ["SVM", "LogisticR", "Ridge"]):
+        # classifier = "LogisticR"
+        data_clf = []
+        diagnosis_files = diagnosis_path.glob("**/*.tsv")
+        for p in diagnosis_files:
+            filename = p.name
+            diagnosis = filename.split("_")[-1].split(".")[0]
+            df = pd.read_csv(p, sep="\t")
+            df = df.loc[df.classifier == classifier, :]
+            df = df.groupby("feature")["score"].agg("mean").reset_index()
+            if diagnosis != "sex":
+                with open(p.parent / "predict.log", "r") as f:
+                    log = f.read()
+                    n_subjects = (
+                        int(
+                            re.search(
+                                r"Downstream prediction on ([\d]*),", log
+                            ).group(1)
+                        )
+                        / 2
+                    )
+                n_subjects_diagnosis[diagnosis] = int(n_subjects)
+            else:
+                with open(p.parent / "predict.log", "r") as f:
+                    log = f.read()
+                    n_holdout = int(
+                        re.search(
+                            r"Downstream prediction on ([\d]*),", log
+                        ).group(1)
+                    )
+                n_subjects_diagnosis[
+                    diagnosis
+                ] = 3341  # number of male subjects
+            df["diagnosis"] = diagnosis_fullname[diagnosis]
+            data_clf.append(df)
+        data_clf = pd.concat(data_clf)
+        data_clf = data_clf.reset_index(drop=True)
+
+        # for each diagnosis, get index of results better than connectome
+        idx_better = []
+        for _, diagnosis in enumerate(diagnosis_fullname.values()):
+            baseline = data_clf.loc[
+                (data_clf.feature == "connectome")
+                & (data_clf.diagnosis == diagnosis),
+                "score",
+            ].values[0]
+            baseline_idx = data_clf.loc[
+                (data_clf.feature == "connectome")
+                & (data_clf.diagnosis == diagnosis),
+                "score",
+            ].index[0]
+            better = (
+                data_clf.loc[
+                    (data_clf.feature != "connectome")
+                    & (data_clf.diagnosis == diagnosis),
+                    "score",
+                ]
+                >= baseline
+            )
+            better = (
+                data_clf.loc[
+                    (data_clf.feature != "connectome")
+                    & (data_clf.diagnosis == diagnosis),
+                    "score",
+                ]
+                .index[better]
+                .tolist()
+            )
+            better.append(baseline_idx)
+            idx_better += better
+        idx_better.sort()
+        # get index that is the opposite of idx_better
+        idx_better = np.array(idx_better)
+        idx = np.zeros(data_clf.shape[0], dtype=bool)
+        idx[idx_better] = True
+        sns.stripplot(
+            x="diagnosis",
+            y="score",
+            hue="feature",
+            data=data_clf.iloc[~idx, :],
+            ax=ax,
+            legend=False,
+            order=diagnosis_fullname.values(),
+            hue_order=feature_fullname.keys(),
+            marker="$\circ$",
+            size=10,
+            jitter=0.2,
+        )
+        sns.stripplot(
+            x="diagnosis",
+            y="score",
+            hue="feature",
+            data=data_clf.iloc[idx, :],
+            ax=ax,
+            legend=classifier == "Ridge",
+            order=diagnosis_fullname.values(),
+            hue_order=feature_fullname.keys(),
+            size=8,
+            jitter=0.25,
+        )
+
+        ax.hlines(y=0.5, xmin=0.5, xmax=8.5, color="k", linestyle="--")
+        ax.hlines(
+            y=n_subjects_diagnosis["sex"] / n_holdout,
+            xmin=-0.5,
+            xmax=0.5,
+            color="k",
+            linestyle="--",
+        )
+        ax.set_title(f"{classifier}")
+        ax.set_ylim(0.4, 1)
+        ax.set_ylabel("Accuracy score")
+        tick_lables = []
+        for tl in diagnosis_fullname:
+            if tl == "sex":
+                tick_lables.append(
+                    tl.upper()
+                    + " ($N_{male}=$"
+                    + f"${n_subjects_diagnosis[tl]}$)"
+                )
+            else:
+                tick_lables.append(tl + f" ($N={n_subjects_diagnosis[tl]}$)")
+
+        ax.set_xticklabels(tick_lables, rotation=90)
+        chance = Line2D([0], [0], color="black", label="Chance", ls="--")
+        # get legend handles and labels
+        han, lab = axs[-1].get_legend_handles_labels()
+        han.append(chance)
+        legend_labels = [feature_fullname[i] for i in lab]
+        legend_labels.append("Chance")
+        # append cahnce line to the legend
+        axs[-1].legend(handles=han, labels=legend_labels)
+        sns.move_legend(axs[-1], "upper left", bbox_to_anchor=(1, 1))
+        fig.suptitle(
+            "Downstream prediction (Training set proportion = "
+            f"{DIAGNOSIS_PATH.split('_')[-1].replace('-', ' ')})"
+        )
+        # fig.suptitle(f"Downstream prediction on Full hold out sample)")
+        plt.tight_layout()
+        plt.savefig(
+            Path(DIAGNOSIS_PATH).parents[1]
+            / "reports"
+            / f"{Path(DIAGNOSIS_PATH).name}_overview_LR.png"
+        )
+
+
+if __name__ == "__main__":
+    main()
@@ -4,26 +4,26 @@
     "type": "legacy",
     "database": {
         "type": "pickleddb",
-        "host": "outputs/autoreg/train/multiruns/2024-04-11_07-38-37/database.pkl",
+        "host": "outputs/autoreg/train/multiruns/2024-08-21_11-09-24/database.pkl",
     },
 }
 
 experiment = get_experiment("experiment", storage=storage)
 
 fig = experiment.plot.regret()
 fig.write_html(
-    "outputs/autoreg/train/multiruns/2024-04-11_07-38-37/regret.html"
+    "outputs/autoreg/train/multiruns/2024-08-21_11-09-24/regret.html"
 )
 
 fig = experiment.plot.parallel_coordinates()
 fig.write_html(
-    "outputs/autoreg/train/multiruns/2024-04-11_07-38-37/parallel_coordinates.html"
+    "outputs/autoreg/train/multiruns/2024-08-21_11-09-24/parallel_coordinates.html"
 )
 
 fig = experiment.plot.lpi()
-fig.write_html("outputs/autoreg/train/multiruns/2024-04-11_07-38-37/lpi.html")
+fig.write_html("outputs/autoreg/train/multiruns/2024-08-21_11-09-24/lpi.html")
 
 fig = experiment.plot.partial_dependencies()
 fig.write_html(
-    "outputs/autoreg/train/multiruns/2024-04-11_07-38-37/partial_dependencies.html"
+    "outputs/autoreg/train/multiruns/2024-08-21_11-09-24/partial_dependencies.html"
 )
@@ -0,0 +1,380 @@
+"""
+look through the `outputs/` directory, find instance of completed
+training, and get the number of subjects used, mean R2 of test set,
+plot the number of subjects (y-axis) against R2 (x axis)
+"""
+import itertools
+import json
+import re
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import yaml
+
+sns.set_theme(style="whitegrid")
+
+
+def peek(iterable):
+    try:
+        first = next(iterable)
+    except StopIteration:
+        return None
+    return itertools.chain([first], iterable)
+
+
+def main():
+    path_success_job = Path(
+        "outputs/neuroips-workshop_2024/scale-by-architecture"
+    ).glob("scale-*/**/training_losses.npy")
+    path_success_job = peek(path_success_job)
+
+    scaling_stats = pd.DataFrame()
+    for p in path_success_job:
+        # parse the path and get number of subjects
+        log_file = p.parent / "train.log"
+        with open(log_file, "r") as f:
+            log_text = f.read()
+        # parse the path and get number of subjects
+        n_sample = int(
+            re.search(r"Using ([\d]*) samples for training", log_text).group(1)
+        )
+        # get random seed
+        random_seed = int(re.search(r"Random seed ([\d]*)", log_text).group(1))
+        # load r2_val.npy get mean r2
+        mean_r2_val = np.load(p.parent / "mean_r2_val.npy").tolist()
+        mean_r2_tng = np.load(p.parent / "mean_r2_tng.npy").tolist()
+        # get runtime from log file text
+        starttime = re.search(
+            r"\[([\d\-\s:,]*)\].*Process ID", log_text
+        ).group(1)
+        endtime = re.search(
+            r"\[([\d\-\s:,]*)\].*model trained", log_text
+        ).group(1)
+        starttime = pd.to_datetime(starttime)
+        endtime = pd.to_datetime(endtime)
+        runtime = endtime - starttime
+
+        # convert to log scale
+        runtime = runtime.total_seconds() / 60
+        runtime_log = np.log10(runtime)
+
+        # read trian_param.json (which is an ymal...)
+        if (p.parent / "train_param.json").exists():
+            with open(p.parent / "train_param.json") as f:
+                train_param = yaml.safe_load(f)
+        else:
+            with open(p.parent / "train_param.ymal") as f:
+                train_param = yaml.safe_load(f)
+
+        train_param["M"] = int(train_param["M"].split(",")[0])
+        # total number of parameters
+        model_info_file = p.parent / "model_info_with_input.txt"
+        with open(model_info_file, "r") as f:
+            model_info = f.read()
+
+        total_parameters = int(
+            re.search(r"Total params: ([\d,]*)", model_info)
+            .group(1)
+            .replace(",", "")
+        )
+        total_mult = float(
+            re.search(r"Total mult-adds \(M\): ([\d.]*)", model_info).group(1)
+        )
+        total_size = float(
+            re.search(
+                r"Estimated Total Size \(MB\): ([\d.]*)", model_info
+            ).group(1)
+        )
+
+        # # load connectome accuracy
+        # prediction = pd.read_csv(
+        #     p.parent / "simple_classifiers_sex.tsv", sep="\t", index_col=0
+        # )
+        # prediction = prediction.loc[
+        #     prediction["classifier"] == "SVM", ["feature", "score"]
+        # ]
+        # prediction = prediction.set_index("feature")
+        # prediction = prediction.T.reset_index(drop=True)
+
+        df = pd.DataFrame(
+            [
+                n_sample,
+                train_param["GCL"],
+                train_param["F"],
+                train_param["K"],
+                train_param["FCL"],
+                train_param["M"],
+                random_seed,
+                mean_r2_val,
+                mean_r2_tng,
+                runtime,
+                runtime_log,
+                total_parameters,
+                total_mult,
+                total_size,
+            ],
+            index=[
+                "n_sample_train",
+                "GCL",
+                "F",
+                "K",
+                "FCL",
+                "M",
+                "random_seed",
+                "mean_r2_val",
+                "mean_r2_tng",
+                "runtime",
+                "runtime_log",
+                "total_parameters",
+                "total_mult",
+                "total_size",
+            ],
+        ).T
+        # df = pd.concat([df, prediction], axis=1)
+        scaling_stats = pd.concat([scaling_stats, df], axis=0)
+
+    # sort by n_sample
+    scaling_stats = scaling_stats.sort_values(by="n_sample_train")
+    # for each n_sample, sort by random seed
+    scaling_stats = scaling_stats.groupby("n_sample_train").apply(
+        lambda x: x.sort_values(by="random_seed")
+    )
+    scaling_stats = scaling_stats.reset_index(drop=True)
+
+    scaling_stats.to_csv(
+        "outputs/neuroips-workshop_2024/scale-by-architecture/reports/scaling_data.tsv",
+        "\t",
+    )
+
+    mask_compare_FCL = (
+        (scaling_stats["GCL"] == 3)
+        & (scaling_stats["F"] == 8)
+        & (scaling_stats["K"] == 3)
+    )
+
+    # fig, axs = plt.subplots(1, 2, figsize=(12, 6))
+    fig = plt.figure()
+    ax1 = fig.add_subplot(121)
+    plot_compare_FCL = sns.heatmap(
+        scaling_stats[mask_compare_FCL].pivot_table(
+            index="M", columns="FCL", values="mean_r2_val"
+        ),
+        cmap="coolwarm",
+        square=True,
+        linewidth=0.5,
+        vmax=0.185,
+        vmin=0.16,
+        annot=True,
+        fmt=".3f",
+        cbar_kws={"label": "Mean R2 of validation set"},
+        ax=ax1,
+    )
+    plot_compare_FCL.set_title(
+        "Testing different parameters of MLP\nGCN architecture fixed; batch size ~8k"
+    )
+    plot_compare_FCL.set_xlabel("Number of fully connected layer")
+    plot_compare_FCL.set_ylabel("Number of neurons per layer")
+    # plot_compare_FCL.figure.savefig("outputs/neuroips-workshop_2024/scale-by-architecture/reports/compare_FCL.png")
+    # plt.close()
+
+    mask_compare_GCL = (scaling_stats["M"] == 8) & (scaling_stats["FCL"] == 1)
+
+    # 3d scatter plot of F, GCL, K
+    # fig = plt.figure()
+    ax = fig.add_subplot(122, projection="3d")
+    im = ax.scatter(
+        scaling_stats[mask_compare_GCL]["F"],
+        scaling_stats[mask_compare_GCL]["K"],
+        scaling_stats[mask_compare_GCL]["GCL"],
+        c=scaling_stats[mask_compare_GCL]["mean_r2_val"],
+        cmap="coolwarm",
+        s=100,
+        vmin=0.16,
+        vmax=0.185,
+    )
+    ax.set_xlabel("F")
+    ax.set_xticks([8, 16, 32])
+    ax.set_ylabel("K")
+    ax.set_yticks([3, 5, 10])
+    ax.set_zlabel("Number of layers")
+    ax.set_zticks([3, 6, 9, 12])
+    ax.set_title(
+        "Testing different parameters of chebnet\nMLP architecture fixed; batch size ~8k"
+    )
+    # fig.colorbar(im, ax=ax, label="Mean R2 of validation set")
+    fig.savefig(
+        "outputs/neuroips-workshop_2024/scale-by-architecture/reports/compare_F-GCL-K.png"
+    )
+    plt.close()
+
+    for g in [3, 6, 9, 12]:
+        cur_df = mask_compare_GCL & (scaling_stats["GCL"] == g)
+        plot_compare_FCL = sns.heatmap(
+            scaling_stats[cur_df].pivot_table(
+                index="F", columns="K", values="mean_r2_val"
+            ),
+            square=True,
+            linewidth=0.5,
+            vmax=0.185,
+            vmin=0.16,
+            annot=True,
+            fmt=".3f",
+            cmap="coolwarm",
+        )
+        plot_compare_FCL.set_title("Mean R2 of validation set")
+        plot_compare_FCL.set_xlabel("K")
+        plot_compare_FCL.set_ylabel("F")
+        plot_compare_FCL.figure.savefig(
+            f"outputs/neuroips-workshop_2024/scale-by-architecture/reports/compare_GCL-{g}.png"
+        )
+        plt.close()
+
+    for f in [8, 16, 32]:
+        cur_df = mask_compare_GCL & (scaling_stats["F"] == f)
+        plot_compare_FCL = sns.heatmap(
+            scaling_stats[cur_df].pivot_table(
+                index="GCL", columns="K", values="mean_r2_val"
+            ),
+            square=True,
+            linewidth=0.5,
+            vmax=0.185,
+            vmin=0.16,
+            annot=True,
+            fmt=".3f",
+            cmap="coolwarm",
+        )
+        plot_compare_FCL.set_title("Mean R2 of validation set")
+        plot_compare_FCL.set_xlabel("K")
+        plot_compare_FCL.set_ylabel("Number of convolution layer")
+        plot_compare_FCL.figure.savefig(
+            f"outputs/neuroips-workshop_2024/scale-by-architecture/reports/compare_F-{f}.png"
+        )
+        plt.close()
+
+    for k in [3, 5, 10]:
+        cur_df = mask_compare_GCL & (scaling_stats["K"] == k)
+        plot_compare_FCL = sns.heatmap(
+            scaling_stats[cur_df].pivot_table(
+                index="GCL", columns="F", values="mean_r2_val"
+            ),
+            square=True,
+            linewidth=0.5,
+            vmax=0.185,
+            vmin=0.16,
+            annot=True,
+            fmt=".3f",
+            cmap="coolwarm",
+        )
+        plot_compare_FCL.set_title("Mean R2 of validation set")
+        plot_compare_FCL.set_xlabel("F")
+        plot_compare_FCL.set_ylabel("Number of convolution layer")
+        plot_compare_FCL.figure.savefig(
+            f"outputs/neuroips-workshop_2024/scale-by-architecture/reports/compare_K-{k}.png"
+        )
+        plt.close()
+
+    # # stats[name] = scaling_stats
+    # # alternative data to show missing experiment
+    # # random seed as column and runtime as value
+    # scaling_overview = scaling_stats.pivot(
+    #     index="n_sample", columns="random_seed", values="mean_r2_val"
+    # )
+
+    # # give a summary of the random seed and n_sample pair
+    # # with no runtime. this is because the experiment failed
+    # incomplete_n_sample = scaling_overview.isna().sum(axis=1)
+    # incomplete_n_sample = incomplete_n_sample[incomplete_n_sample > 0]
+    # # make sure all possible n_sample are included
+    # for n_sample in scaling_overview.index:
+    #     if n_sample not in incomplete_n_sample.index:
+    #         incomplete_n_sample[n_sample] = 0
+    # incomplete_n_sample = incomplete_n_sample.sort_index()
+    # missing_experiment = {}
+    # for n_sample in incomplete_n_sample.index:
+    #     missing_experiment[n_sample] = scaling_overview.columns[
+    #         scaling_overview.loc[n_sample].isna()
+    #     ].tolist()
+    # # save to json
+    # with open(
+    #     "outputs/ccn2024/scaling_missing_experiment.json",
+    #     "w",
+    # ) as f:
+    #     json.dump(missing_experiment, f, indent=2)
+
+    # plt.figure(figsize=(7, 4.5))
+    # # plot
+    # sns.lineplot(
+    #     data=scaling_stats,
+    #     x="n_sample_train",
+    #     y="mean_r2_tng",
+    #     marker="o",
+    #     label="Traing set",
+    # )
+    # sns.lineplot(
+    #     data=scaling_stats,
+    #     x="n_sample_train",
+    #     y="mean_r2_val",
+    #     marker="o",
+    #     label="Validation set",
+    # )
+    # plt.ylim(0.10, 0.19)
+    # plt.xlabel("Number of subject in model training")
+    # plt.ylabel("R-squared")
+    # plt.legend()
+    # plt.title("R-squared of t+1 prediction")
+    # plt.savefig("outputs/ccn2024/scaling_r2_tng_plot.png")
+    # plt.close()
+
+    # plt.figure(figsize=(7, 4.5))
+    # sns.lineplot(
+    #     data=scaling_stats,
+    #     x="n_sample_train",
+    #     y="runtime_log",
+    #     marker="o",
+    # )
+    # plt.xlabel("Number of subject in model training")
+    # plt.ylabel("log10(runtime) (minutes)")
+    # plt.title("Runtime of training a group model")
+    # plt.savefig("outputs/ccn2024/scaling_runtime_plot.png")
+    # plt.close()
+
+    # plt.figure(figsize=(7, 4.5))
+    # # plot
+    # features = prediction.columns.tolist()
+    # for y, label in zip(
+    #     features,
+    #     [
+    #         "connectomes",
+    #         "average pooling",
+    #         "standard deviation pooling",
+    #         "max pooling",
+    #         "1D convolution",
+    #         "average R-squared",
+    #         "R-squared map",
+    #     ],
+    # ):
+    #     if label in [
+    #         "connectomes",
+    #         "standard deviation pooling",
+    #         "R-squared map",
+    #     ]:
+    #         sns.lineplot(
+    #             data=scaling_stats,
+    #             x="n_sample_downstream",
+    #             y=y,
+    #             marker="o",
+    #             label=label,
+    #         )
+    # plt.xlabel("Number of subject in prediction task")
+    # plt.ylabel("Accuracy")
+    # plt.legend()
+    # plt.title("Sex prediction accuracy with SVM")
+    # plt.savefig("outputs/ccn2024/_scaling_connectome.png")
+    # plt.close()
+
+
+if __name__ == "__main__":
+    main()
@@ -4,7 +4,6 @@
 plot the number of subjects (y-axis) against R2 (x axis)
 """
 import itertools
-import json
 import re
 from pathlib import Path
 
@@ -22,80 +21,131 @@ def peek(iterable):
     return itertools.chain([first], iterable)
 
 
-def main():
-    path_success_job = Path("outputs/autoreg/predict/downstream").glob(
-        "**/simple_classifiers_sex.tsv"
-    )
-    # path_success_job = peek(path_success_job)
+# Display labels for the values of the `feature` column; main() substitutes
+# them into the table before plotting.
+feature_fullname = {
+    "connectome": "Connectome\n(baseline)",
+    "avgr2": "t+1\naverage R2",
+    "r2map": "t+1\nR2 map",
+    "conv_avg": "Conv layers\navg pooling",
+    "conv_std": "Conv layers\nstd pooling",
+    "conv_max": "Conv layers\nmax pooling",
+    "conv_conv1d": "Conv layers\n1D convolution",
+}
+
+# Prediction targets. Keys select the `simple_classifiers_<key>.tsv` result
+# files and name the outputs; values are the labels used in plot titles.
+diagnosis_fullname = {
+    "sex": "Sex",
+    "DEP": "Depressive\ndisorder",
+    "ALCO": "Alcohol Abuse",
+    "EPIL": "Epilepsy",
+    "MS": "Multiple sclerosis",
+    "PARK": "Parkinson",
+    "BIPOLAR": "Bipolar",
+    "ADD": "Alzheimer - Dementia",
+    "SCZ": "Schizophrenia",
+}
+# Directory searched for prediction results; tables and figures are written
+# to the sibling `reports/` directory.
+PREDICTION_DATA = Path(
+    "outputs/neuroips-workshop_2024/downstreams_last-layer/data"
+)
 
-    scaling_stats = pd.DataFrame()
-    for p in path_success_job:
-        log_file = p.parent / "predict.log"
-        with open(log_file, "r") as f:
-            log_text = f.read()
-        # parse the path and get number of subjects
-        n_sample = re.search(
-            r"Subjects with phenotype data: ([\d]*)", log_text
-        ).group(1)
-        n_sample = int(n_sample)
-        percent_sample = re.search(
-            r"([\d]*)% of the full sample", log_text
-        ).group(1)
-        # get random seed
-        random_seed = re.search(r"'random\_state': ([\d]+)", log_text).group(1)
-        # load connectome accuracy
-        prediction = pd.read_csv(p, sep="\t", index_col=0)
-        prediction["percent_sample"] = int(percent_sample)
-        prediction["n_sample"] = n_sample
-        prediction["random_seed"] = random_seed
 
-        scaling_stats = pd.concat([scaling_stats, prediction], axis=0)
+def main():
+    """Plot downstream accuracy against pretraining sample size.
+
+    For every key of `diagnosis_fullname`, the matching
+    `simple_classifiers_<key>.tsv` files below `PREDICTION_DATA` are
+    merged, tagged with values parsed from the sibling `predict.log` and
+    from the parent directory name, and saved under `reports/`. One figure
+    per classifier is then drawn with log10 of the training percentage on
+    the x-axis and the score on the y-axis.
+    """
+    sns.set_theme(style="whitegrid")
+    sns.set_context("paper", font_scale=1.5)
+    pal = sns.color_palette()
+    for d in diagnosis_fullname.keys():
+        path_success_job = PREDICTION_DATA.glob(
+            f"data.proportion_sample_*/**/simple_classifiers_{d}.tsv"
+        )
+        scaling_stats = pd.DataFrame()
+        for p in path_success_job:
+            log_file = p.parent / "predict.log"
+            with open(log_file, "r") as f:
+                log_text = f.read()
+            # number that follows "Downstream prediction on" in the log
+            n_holdout = int(
+                re.search(
+                    r"Downstream prediction on ([\d]*),", log_text
+                ).group(1)
+            )
+            # parse the path and get number of subjects
+            # the text after the last "_" of the parent directory name is
+            # read as a fraction and converted to a percentage
+            percent_training_sample = p.parents[0].name.split("_")[-1]
+            percent_training_sample = float(percent_training_sample) * 100
+            percent_holdout_sample = re.search(
+                r"([\d]*)% of the full sample", log_text
+            ).group(1)
+            # get random seed
+            random_seed = re.search(
+                r"'random\_state': ([\d]+)", log_text
+            ).group(1)
+            # load connectome accuracy
+            prediction = pd.read_csv(p, sep="\t", index_col=0)
+            prediction["percent_holdout_sample"] = int(percent_holdout_sample)
+            prediction["percent_training_sample"] = percent_training_sample
+            prediction["random_seed"] = random_seed
 
-    # sort by n_sample
-    scaling_stats = scaling_stats.sort_values(by="n_sample")
-    # for each n_sample, sort by random seed
-    scaling_stats = scaling_stats.groupby("n_sample").apply(
-        lambda x: x.sort_values(by="random_seed")
-    )
-    scaling_stats = scaling_stats.reset_index(drop=True)
+            scaling_stats = pd.concat([scaling_stats, prediction], axis=0)
 
-    scaling_stats.to_csv(
-        "outputs/autoreg/predict/downstream/downstream_scaling_data.csv"
-    )
+        # sort by n_sample
+        scaling_stats = scaling_stats.sort_values(by="percent_training_sample")
+        # # for each n_sample, sort by random seed
+        # scaling_stats = scaling_stats.groupby("percent_training_sample").apply(
+        #     lambda x: x.sort_values(by="random_seed")
+        # )
+        scaling_stats = scaling_stats.reset_index(drop=True)
 
-    mask = scaling_stats["classifier"] == "SVM"
-    plt.figure(figsize=(7, 4.5))
-    # plot
-    features = prediction["feature"].unique().tolist()
-    for y, label in zip(
-        features,
-        [
-            "connectomes",
-            "average pooling",
-            "standard deviation pooling",
-            "max pooling",
-            "1D convolution",
-            "average R-squared",
-            "R-squared map",
-        ],
-    ):
-        feat_mask = scaling_stats["feature"] == y
-        cur_mask = mask & feat_mask
-        sns.lineplot(
-            data=scaling_stats[cur_mask],
-            x="percent_sample",
-            y="score",
-            marker="o",
-            label=label,
+        scaling_stats.to_csv(
+            PREDICTION_DATA.parent / f"reports/downstream_scaling_{d}.tsv",
+            sep="\t",
+        )
+        # the table saved above keeps raw percentages; from here on the
+        # column holds log10(percentage) for plotting
+        # replace feature name
+        scaling_stats["percent_training_sample"] = np.log10(
+            scaling_stats["percent_training_sample"]
+        )
+        scaling_stats["feature"] = scaling_stats["feature"].replace(
+            feature_fullname
         )
-    plt.xlabel("Percent of subject in the downstream prediction.")
-    plt.ylabel("Accuracy")
-    plt.legend()
-    plt.title(
-        "Sex prediction accuracy with SVM with saturated pretrained model."
-    )
-    plt.savefig("outputs/autoreg/predict/downstream/downstream_scaling.png")
-    plt.close()
+        for clf in scaling_stats["classifier"].unique():
+            mask = scaling_stats["classifier"] == clf
+            no_connectome = (
+                scaling_stats["feature"] != "Connectome\n(baseline)"
+            )
+            is_connectome = ~no_connectome
+            plt.figure(figsize=(7, 4.5))
+            # mean connectome score of this classifier over all rows, drawn
+            # as a horizontal reference line
+            benchmark = scaling_stats[mask & is_connectome]["score"].mean()
+            plt.axhline(
+                y=benchmark, color=pal[0], linestyle="-.", label="Connectome"
+            )
+            # plot
+            # NOTE(review): `prediction` and `n_holdout` (used below) hold
+            # the values of the last file visited by the loading loop and
+            # are undefined when the glob matched nothing.
+            features = prediction["feature"].unique().tolist()
+            sns.lineplot(
+                data=scaling_stats[mask & no_connectome],
+                x="percent_training_sample",
+                y="score",
+                hue="feature",
+                marker="o",
+                errorbar="ci",
+                palette=pal[1 : len(features)],
+            )
+            if d != "sex":
+                plt.axhline(y=0.5, color="k", linestyle="--", label="Chance")
+            else:
+                # NOTE(review): 3341 is presumably the size of the larger
+                # sex class in the holdout sample - confirm and name it.
+                plt.axhline(
+                    y=3341 / n_holdout,
+                    color="k",
+                    linestyle="--",
+                    label="Chance",
+                )
+            plt.xlabel("Percent of subject in the pretrained model")
+            plt.xticks(scaling_stats["percent_training_sample"].unique())
+            plt.ylabel("Accuracy")
+            plt.legend(bbox_to_anchor=(1, 1))
+            plt.title(
+                f"{diagnosis_fullname[d]} prediction accuracy with {clf}"
+            )
+            plt.tight_layout()
+            plt.savefig(
+                PREDICTION_DATA.parent
+                / f"reports/downstream_scaling_{d}_{clf}.png"
+            )
+            plt.close()
 
 
 if __name__ == "__main__":
 
@@ -0,0 +1,132 @@
+"""
+Look through the few-shot downstream prediction outputs, collect the
+accuracy of every completed job, and plot accuracy (y-axis) against
+the percentage of the sample used in the downstream task (x-axis).
+"""
+import itertools
+import re
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+
+
+def peek(iterable):
+    """Return an equivalent iterator, or None if `iterable` is exhausted.
+
+    The first item is consumed to test for emptiness and is then chained
+    back in front of the remaining items.
+    """
+    _empty = object()
+    head = next(iterable, _empty)
+    if head is _empty:
+        return None
+    return itertools.chain([head], iterable)
+
+
+# Display labels for the values of the `feature` column. main() applies
+# them to the table and uses their order as the hue order of the plots.
+feature_fullname = {
+    "connectome": "Connectome\n(baseline)",
+    "avgr2": "t+1\naverage R2",
+    "r2map": "t+1\nR2 map",
+    "conv_avg": "Conv layers\navg pooling",
+    "conv_std": "Conv layers\nstd pooling",
+    "conv_max": "Conv layers\nmax pooling",
+    "conv_conv1d": "Conv layers\n1D convolution",
+}
+
+# Prediction targets. Keys select the `simple_classifiers_<key>.tsv` result
+# files and name the outputs; values are the labels used in plot titles.
+diagnosis_fullname = {
+    "sex": "Sex",
+    "DEP": "Depressive\ndisorder",
+    "ALCO": "Alcohol Abuse",
+    "EPIL": "Epilepsy",
+    "MS": "Multiple sclerosis",
+    "PARK": "Parkinson",
+    "BIPOLAR": "Bipolar",
+    "ADD": "Alzheimer - Dementia",
+    "SCZ": "Schizophrenia",
+}
+# Directory searched for prediction results; tables and figures are written
+# to the sibling `reports/` directory.
+PREDICTION_DATA = Path(
+    "outputs/neuroips-workshop_2024/downstreams_fewshot/data"
+)
+
+
+def main():
+    """Plot few-shot downstream accuracy for every diagnosis.
+
+    For each key of `diagnosis_fullname`, all
+    `simple_classifiers_<key>.tsv` files below `PREDICTION_DATA` are
+    merged and tagged with the holdout percentage and random seed parsed
+    from the sibling `predict.log`. The merged table is saved under
+    `reports/` and one accuracy-vs-holdout-percentage figure is drawn per
+    classifier. Diagnoses without any result file are skipped.
+    """
+    sns.set_theme(style="whitegrid")
+    sns.set_context("paper", font_scale=1.5)
+    report_dir = PREDICTION_DATA.parent / "reports"
+    # the report directory is not guaranteed to exist yet
+    report_dir.mkdir(parents=True, exist_ok=True)
+    hue_order = list(feature_fullname.values())
+    for d in diagnosis_fullname:
+        tables = []
+        for p in PREDICTION_DATA.glob(f"**/simple_classifiers_{d}.tsv"):
+            log_text = (p.parent / "predict.log").read_text()
+            percent_holdout_sample = re.search(
+                r"([\d]*)% of the full sample", log_text
+            ).group(1)
+            random_seed = re.search(
+                r"'random\_state': ([\d]+)", log_text
+            ).group(1)
+            prediction = pd.read_csv(p, sep="\t", index_col=0)
+            prediction["percent_holdout_sample"] = int(percent_holdout_sample)
+            prediction["random_seed"] = random_seed
+            tables.append(prediction)
+        if not tables:
+            # no finished job for this diagnosis: sorting an empty frame
+            # by a missing column would raise KeyError
+            continue
+
+        # concatenate once instead of growing the frame inside the loop
+        scaling_stats = pd.concat(tables, axis=0)
+        scaling_stats = scaling_stats.sort_values(by="percent_holdout_sample")
+        scaling_stats = scaling_stats.reset_index(drop=True)
+        scaling_stats.to_csv(
+            report_dir / f"downstream_fewshot_{d}.tsv", sep="\t"
+        )
+
+        # the saved table keeps the short feature names; the plots use the
+        # display labels
+        scaling_stats["feature"] = scaling_stats["feature"].replace(
+            feature_fullname
+        )
+        xticks = scaling_stats["percent_holdout_sample"].unique()
+        for clf in scaling_stats["classifier"].unique():
+            plt.figure(figsize=(7, 4.5))
+            sns.lineplot(
+                data=scaling_stats[scaling_stats["classifier"] == clf],
+                x="percent_holdout_sample",
+                y="score",
+                hue="feature",
+                hue_order=hue_order,
+                marker="o",
+                errorbar=("ci", 95),
+            )
+            if d != "sex":
+                plt.axhline(y=0.5, color="k", linestyle="--", label="Chance")
+            plt.xlabel("Percent of subject in the downstream task")
+            plt.xticks(xticks)
+            plt.ylabel("Accuracy")
+            plt.legend(bbox_to_anchor=(1, 1))
+            plt.title(
+                f"{diagnosis_fullname[d]} prediction accuracy with {clf}"
+            )
+            plt.tight_layout()
+            plt.savefig(report_dir / f"downstream_fewshot_{d}_{clf}.png")
+            plt.close()
+
+
+# Call main() only when this file is executed directly, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,211 @@
+"""
+Look through the scaling experiment outputs, find every completed
+training run, and get its number of training samples, mean R2 and
+runtime; plot R2 and runtime (y-axis) against sample percentage (x-axis).
+"""
+import itertools
+import json
+import re
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import yaml
+
+sns.set_theme(style="whitegrid")
+sns.set_context("paper", font_scale=1.5)
+
+
+def peek(iterable):
+    """Test an iterator for content without losing its first item.
+
+    Returns None when nothing can be drawn from `iterable`; otherwise
+    returns an iterator that yields the drawn item followed by the rest.
+    """
+    nothing = object()
+    first = next(iterable, nothing)
+    return None if first is nothing else itertools.chain([first], iterable)
+
+
+# Root of the experiment analysed by main(): runs are searched below
+# `data/` and tables and figures are written to `reports/`.
+# BASE_PATH = "outputs/neuroips-workshop_2024/scale-sample_bestmodel_different-num_workers"  # noqa
+BASE_PATH = "outputs/neuroips-workshop_2024/scale-sample_bestmodel"
+
+
+def _parse_training_run(run_dir):
+    """Return the statistics of one completed run as a one-row DataFrame.
+
+    `run_dir` must contain `train.log`, `mean_r2_val.npy` and
+    `mean_r2_tng.npy`. `model_info_with_input.txt` is optional; its three
+    statistics are NaN when the file is absent.
+    """
+    log_text = (run_dir / "train.log").read_text()
+    n_sample = int(
+        re.search(r"Using ([\d]*) samples for training", log_text).group(1)
+    )
+    random_seed = int(re.search(r"Random seed ([\d]*)", log_text).group(1))
+    mean_r2_val = np.load(run_dir / "mean_r2_val.npy").tolist()
+    mean_r2_tng = np.load(run_dir / "mean_r2_tng.npy").tolist()
+
+    # runtime in minutes between the log entries that mention
+    # "Process ID" and "model trained"
+    starttime = re.search(
+        r"\[([\d\-\s:,]*)\].*Process ID", log_text
+    ).group(1)
+    endtime = re.search(
+        r"\[([\d\-\s:,]*)\].*model trained", log_text
+    ).group(1)
+    runtime = pd.to_datetime(endtime) - pd.to_datetime(starttime)
+    runtime = runtime.total_seconds() / 60
+
+    total_parameters = total_mult = total_size = np.nan
+    model_info_file = run_dir / "model_info_with_input.txt"
+    if model_info_file.exists():
+        model_info = model_info_file.read_text()
+        total_parameters = int(
+            re.search(r"Total params: ([\d,]*)", model_info)
+            .group(1)
+            .replace(",", "")
+        )
+        total_mult = float(
+            re.search(
+                r"Total mult-adds \(M\): ([\d.]*)", model_info
+            ).group(1)
+        )
+        total_size = float(
+            re.search(
+                r"Estimated Total Size \(MB\): ([\d.]*)", model_info
+            ).group(1)
+        )
+
+    stats = {
+        "n_sample_train": n_sample,
+        "random_seed": random_seed,
+        "mean_r2_val": mean_r2_val,
+        "mean_r2_tng": mean_r2_tng,
+        "runtime": runtime,
+        "runtime_log": np.log10(runtime),
+        "total_parameters": total_parameters,
+        "total_mult": total_mult,
+        "total_size": total_size,
+    }
+    # one column of values, transposed into a single row
+    return pd.DataFrame(list(stats.values()), index=list(stats)).T
+
+
+def main():
+    """Summarise the sample-size scaling experiment and plot the results.
+
+    Every directory below `BASE_PATH/data` that contains
+    `training_losses.npy` is treated as a completed run. The per-run
+    statistics are saved to `reports/scaling_data.tsv`, the random seeds
+    without a result for each sample size are saved to
+    `reports/scaling_missing_experiment.json`, and R2 and runtime are
+    plotted against the percentage of training samples.
+
+    Raises:
+        FileNotFoundError: if no completed run is found.
+    """
+    base_path = Path(BASE_PATH)
+    report_dir = base_path / "reports"
+    path_success_job = peek(base_path.glob("data/**/training_losses.npy"))
+    if path_success_job is None:
+        # peek() returns None for an empty glob; looping over it would
+        # fail with an unhelpful TypeError
+        raise FileNotFoundError(
+            f"No training_losses.npy found under {base_path / 'data'}"
+        )
+    # the report directory is not guaranteed to exist yet
+    report_dir.mkdir(parents=True, exist_ok=True)
+
+    scaling_stats = pd.concat(
+        [_parse_training_run(p.parent) for p in path_success_job], axis=0
+    )
+    # Order by sample size, then by random seed. A plain two-key sort
+    # keeps the `n_sample_train` column, which groupby().apply() drops in
+    # pandas versions that exclude the grouping column.
+    scaling_stats = scaling_stats.sort_values(
+        by=["n_sample_train", "random_seed"]
+    ).reset_index(drop=True)
+    # 2328583: number of data points in the full training set
+    scaling_stats["percent_sample"] = scaling_stats["n_sample_train"] / 2328583
+    scaling_stats["percent_sample"] = (
+        scaling_stats["percent_sample"].round(3) * 100
+    )
+
+    scaling_stats.to_csv(report_dir / "scaling_data.tsv", sep="\t")
+
+    # Sample sizes as rows and random seeds as columns: a missing cell
+    # means that combination has no completed run.
+    scaling_overview = scaling_stats.pivot(
+        index="percent_sample", columns="random_seed", values="mean_r2_val"
+    )
+    # every sample size is listed, with an empty list when nothing is
+    # missing
+    missing_experiment = {
+        percent: scaling_overview.columns[
+            scaling_overview.loc[percent].isna()
+        ].tolist()
+        for percent in scaling_overview.index.sort_values()
+    }
+    with open(report_dir / "scaling_missing_experiment.json", "w") as f:
+        json.dump(missing_experiment, f, indent=2)
+
+    plt.figure(figsize=(5, 5))
+    sns.lineplot(
+        data=scaling_stats,
+        x="percent_sample",
+        y="mean_r2_tng",
+        marker="o",
+        label="Training set",
+    )
+    sns.lineplot(
+        data=scaling_stats,
+        x="percent_sample",
+        y="mean_r2_val",
+        marker="o",
+        label="Validation set",
+    )
+    plt.ylim(0.145, 0.185)
+    plt.xticks([0, 5, 10, 25, 50, 100])
+    plt.xlabel("Percentage of training sample")
+    plt.ylabel("R-squared")
+    plt.legend()
+    plt.title("R-squared of t+1 prediction")
+    plt.tight_layout()
+    plt.savefig(report_dir / "scaling_r2_tng_plot.png")
+    plt.close()
+
+    plt.figure(figsize=(5, 5))
+    sns.lineplot(
+        data=scaling_stats,
+        x="percent_sample",
+        y="runtime_log",
+        marker="o",
+    )
+    plt.xticks([0, 5, 10, 25, 50, 100])
+    plt.xlabel("Percentage of training sample")
+    plt.ylabel("log10(runtime) (minutes)")
+    plt.title("Runtime of training")
+    plt.tight_layout()
+    plt.savefig(report_dir / "scaling_runtime_plot.png")
+    plt.close()
+
+
+# Call main() only when this file is executed directly, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,99 @@
+"""
+Resource:
+salloc --time=2:00:00 --mem=16G --cpus-per-task=16 --gpus-per-node=1
+Aim:
+ - Fill up the GPU memory with the largest batch size possible
+"""
+
+import typing as t
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fmri_autoreg.models.models import Chebnet
+
+DATASET_SIZE = 2328583  # number of data point in training set
+SEQ = 16
+
+
+# make a random correlation matrix
+def get_edges(n_emb):
+    """Build the edge index of a random graph with `n_emb` nodes.
+
+    A correlation matrix is computed from uniform random time series of
+    117 samples per node. Entries smaller than the value found at the 90%
+    position of the sorted matrix are zeroed, and the indices of the
+    remaining non-zero entries are returned as given by `np.nonzero`.
+    """
+    random_ts = np.random.rand(n_emb, 117)
+    corr = np.corrcoef(random_ts)
+    cutoff_rank = int(corr.shape[0] * corr.shape[1] * 0.9)
+    cutoff = np.sort(corr, axis=None)[cutoff_rank]
+    keep = corr >= cutoff
+    return np.nonzero(corr * keep)
+
+
+def _run_dummy_steps(
+    model, optimizer, device, batch_size, input_shape, output_shape, n_steps
+):
+    """Run `n_steps` optimisation steps on random data of one batch size.
+
+    The batch tensors are local to this function, so nothing keeps them
+    alive once it returns or raises.
+    """
+    for _ in range(n_steps):
+        inputs = torch.rand(batch_size, *input_shape, device=device)
+        targets = torch.rand(batch_size, *output_shape, device=device)
+        outputs = model(inputs)
+        loss = F.mse_loss(targets, outputs)
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+
+
+def get_batch_size(
+    model: nn.Module,
+    device: torch.device,
+    input_shape: t.Tuple[int, ...],
+    output_shape: t.Tuple[int, ...],
+    dataset_size: int,
+    max_batch_size: t.Optional[int] = None,
+    num_iterations: int = 5,
+) -> int:
+    """Find a batch size that `model` can be trained with on `device`.
+
+    Starting from 2, the batch size is doubled as long as
+    `num_iterations` Adam steps on random data succeed.
+
+    Args:
+        model: network to probe; it is moved to `device` and put in
+            training mode.
+        device: device the dummy training steps run on.
+        input_shape: shape of one input sample, without the batch axis.
+        output_shape: shape of one target sample, without the batch axis.
+        dataset_size: the search stops once the candidate reaches this
+            size and the previous candidate is returned.
+        max_batch_size: optional cap; returned as-is (without being
+            tried) once the candidate reaches it.
+        num_iterations: number of training steps tried per candidate.
+
+    Returns:
+        The last batch size that ran without a RuntimeError (1 if even a
+        batch of 2 fails), or `max_batch_size` when the cap is reached.
+    """
+    model.to(device)
+    model.train(True)
+    optimizer = torch.optim.Adam(model.parameters())
+
+    batch_size = 2
+    while True:
+        if max_batch_size is not None and batch_size >= max_batch_size:
+            batch_size = max_batch_size
+            break
+        if batch_size >= dataset_size:
+            batch_size //= 2
+            break
+        try:
+            _run_dummy_steps(
+                model,
+                optimizer,
+                device,
+                batch_size,
+                input_shape,
+                output_shape,
+                num_iterations,
+            )
+        except RuntimeError:
+            # typically CUDA running out of memory: fall back to the
+            # previous candidate
+            batch_size //= 2
+            break
+        batch_size *= 2
+    del model, optimizer
+    # the batch tensors are already out of scope here, so the cached
+    # blocks they occupied can be released
+    torch.cuda.empty_cache()
+    return batch_size
+
+
+if __name__ == "__main__":
+    # Probe the batch size for each atlas size on the CUDA device.
+    for n_emb in [64, 197, 444]:
+        edge_index = get_edges(n_emb)
+        print("our hypothetical biggest model")
+        model = Chebnet(
+            n_emb=n_emb,
+            # same constant as the dummy input shape passed below, so the
+            # two cannot drift apart
+            seq_len=SEQ,
+            edge_index=edge_index,
+            FK="16,3,16,3,16,3,16,3,16,3,16,3",
+            M="8,1",
+            FC_type="nonshared_uni",
+            aggrs="add",
+            dropout=0.1,
+            bn_momentum=0.1,
+            use_bn=True,
+        )
+        batch_size = get_batch_size(
+            model,
+            torch.device("cuda"),
+            (n_emb, SEQ),
+            (n_emb,),
+            DATASET_SIZE,
+            num_iterations=10,
+        )
+        print(f"atlas {n_emb}, input length {SEQ}, batch size {batch_size}")
+        # drop the references before the next model is built
+        del model
+        del edge_index
@@ -0,0 +1,55 @@
+from time import time
+
+import h5py
+from fmri_autoreg.data.load_data import Dataset
+from torch.utils.data import DataLoader, Subset
+from tqdm import tqdm
+
+# Time full passes over the training data with a DataLoader for every
+# combination of atlas size, batch size and number of workers, and write
+# the average seconds per pass to a TSV report.
+proportion_sample = 1
+tng_data_h5 = (
+    "outputs/sample_for_pretraining/seed-42/sample_seed-42_data-train.h5"
+)
+IS_GPU = False
+N_EMBED = [64, 197, 444]
+BATCHSIZE = [512]
+# number of passes over the data that each timing is averaged over
+N_EPOCH = 2
+OUTPUT_TSV = "outputs/performance_info/cpu_number_of_workers.tsv"
+
+# start a fresh report; one row is appended per configuration below
+with open(OUTPUT_TSV, "w") as f:
+    f.write("batch_size\tn_embed\tnum_workers\tepoch_second\n")
+
+for n_embed in N_EMBED:
+    tng_dataset = Dataset(
+        tng_data_h5, n_embed=f"n_embed-{n_embed}", set_type="train"
+    )
+    if proportion_sample != 1:
+        # keep only the leading fraction of the training samples
+        with h5py.File(tng_data_h5, "r") as f:
+            tng_length = f[f"n_embed-{n_embed}"]["train"]["input"].shape[0]
+        tng_index = list(range(int(tng_length * proportion_sample)))
+        tng_dataset = Subset(tng_dataset, tng_index)
+    for batch_size in BATCHSIZE:
+        for num_workers in range(8, 34, 2):
+            train_loader = DataLoader(
+                tng_dataset,
+                shuffle=True,
+                num_workers=num_workers,
+                batch_size=batch_size,
+                pin_memory=IS_GPU,
+            )
+            desc = (
+                f"batch_size={batch_size}; n_embed={n_embed}; "
+                f"Number of workers: {num_workers}"
+            )
+            start = time()
+            for _ in tqdm(range(N_EPOCH), desc=desc):
+                # only the loading cost is measured, batches are discarded
+                for _ in train_loader:
+                    pass
+            taken = (time() - start) / N_EPOCH
+            with open(OUTPUT_TSV, "a") as f:
+                f.write(f"{batch_size}\t{n_embed}\t{num_workers}\t{taken}\n")
Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@ exclude = [`
`22`	`22`	`"*/tests/",`
`23`	`23`	`"*build/",`
`24`	`24`	`"code/fmri-autoreg",`
	`25`	`+ "src/utils/plot_*.py"`
`25`	`26`	`]`
`26`	`27`
`27`	`28`	`ignore = [`