Commit 4e8285e

Author: Hao-Ting Wang

Balanced sample (#13)

* Initial version of creating a hold-out set; need to modify the training script later; need to output the cohort demographic info
* Save demographic info summary
* Hydra-fy
* ENH adapt the training script to fit the new input

1 parent 6c00f53 · commit 4e8285e

File tree

7 files changed (+270 −58 lines)


config/base.yaml

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+---
+defaults:
+  - _self_
+  - hydra: default
+
+verbose: 2
+random_state: 42
+return_type: float

config/data/default.yaml

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+---
+standardize: false
+n_embed: 197
+atlas_desc: atlas-MIST_desc-${data.n_embed}
+hold_out_set: 0.20
+validation_set: 0.25
+n_sample: -1
+class_balance_confounds:
+  - site
+  - sex
+  - age
+  - mean_fd_raw
+  - proportion_kept

config/data/ukbb.yaml

Lines changed: 8 additions & 13 deletions

@@ -1,14 +1,9 @@
-data_file: inputs/connectomes/ukbb.h5
-standardize: false
-n_embed: 197
-n_sample: -1
+---
+defaults:
+  - _self_
+  - default
 
-split:  # training and evaluation
-  _target_: src.data.load_data.load_ukbb_dset_path
-  path: ${data.data_file}
-  atlas_desc: atlas-MIST_desc-${data.n_embed}
-  n_sample: ${data.n_sample}
-  val_set: 0.20
-  test_set: 0.20
-  segment: 1
-  random_state: ${random_state}
+data_file: inputs/connectomes/ukbb_libral_scrub_20240716_connectome.h5
+phenotype_file: inputs/connectomes/ukbb_libral_scrub_20240716_phenotype.tsv
+phenotype_json: inputs/connectomes/ukbb_libral_scrub_20240716_phenotype.json
+segment: 1
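
Since Hydra composes these YAML files at runtime, the merged `data` group can be inspected with the compose API. A minimal sketch, assuming it runs from the repository root; the `+data=ukbb` override is an assumption about how the group is selected, not part of the commit:

from hydra import compose, initialize

# Compose config/base.yaml and pull in the data group; the defaults
# list in ukbb.yaml merges config/data/default.yaml underneath it.
with initialize(version_base="1.3", config_path="config"):
    cfg = compose(config_name="base", overrides=["+data=ukbb"])

print(cfg.data.segment)     # 1, from ukbb.yaml
print(cfg.data.n_embed)     # 197, inherited from default.yaml
print(cfg.data.atlas_desc)  # atlas-MIST_desc-197, after interpolation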

config/train.yaml

Lines changed: 1 addition & 0 deletions

@@ -8,3 +8,4 @@ defaults:
 verbose: 2
 random_state: 42
 return_type: float
+data_split: ???
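
In OmegaConf, which Hydra uses for these configs, `???` marks a mandatory value: composition succeeds, but the key must be supplied (for example via a command-line override) before it is read. A minimal sketch of that behaviour, independent of this repository:

from omegaconf import OmegaConf
from omegaconf.errors import MissingMandatoryValue

cfg = OmegaConf.create({"data_split": "???"})
print(OmegaConf.is_missing(cfg, "data_split"))  # True

try:
    _ = cfg.data_split  # reading an unset mandatory value raises
except MissingMandatoryValue:
    print("data_split must be provided at runtime")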

src/create_holdout_sample.py

Lines changed: 111 additions & 0 deletions

@@ -0,0 +1,111 @@
+import json
+import logging
+from pathlib import Path
+
+import hydra
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from omegaconf import DictConfig
+
+log = logging.getLogger(__name__)
+
+
+@hydra.main(version_base="1.3", config_path="../config", config_name="base")
+def main(params: DictConfig) -> None:
+    from src.data.load_data import create_hold_out_sample
+
+    output_dir = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
+    output_dir = Path(output_dir)
+
+    sample = create_hold_out_sample(
+        phenotype_path=params["data"]["phenotype_file"],
+        phenotype_meta=params["data"]["phenotype_json"],
+        class_balance_confounds=params["data"]["class_balance_confounds"],
+        hold_out_set=params["data"]["hold_out_set"],
+        random_state=params["random_state"],
+    )
+
+    data = pd.read_csv(params["data"]["phenotype_file"], sep="\t", index_col=0)
+
+    with open(params["data"]["phenotype_json"], "r") as f:
+        meta = json.load(f)
+
+    with open(output_dir / "downstream_sample.json", "w") as f:
+        json.dump(sample, f, indent=2)
+
+    # plot the distribution of confounds of the downstream balanced samples
+    demographics = {}
+    for d in sample["test_downstreams"].keys():
+        d_subjects = sample["test_downstreams"][d]
+        df = data.loc[d_subjects, :]
+        fig, axes = plt.subplots(
+            1,
+            len(params["data"]["class_balance_confounds"]),
+            figsize=(20, len(params["data"]["class_balance_confounds"]) + 1),
+        )
+        fig.suptitle(
+            f"Confound balanced sample (N={len(d_subjects)}): "
+            f"{meta[d]['instance']['1']['description']}"
+        )
+        for ax, c in zip(axes, params["data"]["class_balance_confounds"]):
+            sns.histplot(x=c, data=df, hue=d, kde=True, ax=ax)
+        fig.savefig(output_dir / f"{d}.png")
+        demographics[d] = {
+            "patient": {
+                "condition": d,
+                "total": df[df[d] == 1].shape[0],
+                "n_female": df[df[d] == 1].shape[0]
+                - df[df[d] == 1]["sex"].sum(),
+                "age_mean": df[df[d] == 1]["age"].mean(),
+                "age_sd": df[df[d] == 1]["age"].std(),
+                "mean_fd_mean": df[df[d] == 1]["mean_fd_raw"].mean(),
+                "mean_fd_sd": df[df[d] == 1]["mean_fd_raw"].std(),
+                "proportion_kept_mean": df[df[d] == 1][
+                    "proportion_kept"
+                ].mean(),
+                "proportion_kept_sd": df[df[d] == 1]["proportion_kept"].std(),
+            },
+            "control": {
+                "condition": d,
+                "total": df[df[d] == 0].shape[0],
+                "n_female": df[df[d] == 0].shape[0]
+                - df[df[d] == 0]["sex"].sum(),
+                "age_mean": df[df[d] == 0]["age"].mean(),
+                "age_sd": df[df[d] == 0]["age"].std(),
+                "mean_fd_mean": df[df[d] == 0]["mean_fd_raw"].mean(),
+                "mean_fd_sd": df[df[d] == 0]["mean_fd_raw"].std(),
+                "proportion_kept_mean": df[df[d] == 0][
+                    "proportion_kept"
+                ].mean(),
+                "proportion_kept_sd": df[df[d] == 0]["proportion_kept"].std(),
+            },
+        }
+
+    demographics_summary = pd.DataFrame()
+    for d in demographics.keys():
+        df = pd.DataFrame.from_dict(demographics[d], orient="index")
+        df.set_index([df.index, "condition"], inplace=True)
+        demographics_summary = pd.concat([demographics_summary, df])
+    demographics_summary.round(decimals=2).to_csv(
+        output_dir / "demographics_summary.tsv", sep="\t"
+    )
+
+    for key in sample.keys():
+        if key == "test_downstreams":
+            continue
+        d_subjects = sample[key]
+        df = data.loc[d_subjects, :]
+        fig, axes = plt.subplots(
+            1,
+            len(params["data"]["class_balance_confounds"]),
+            figsize=(20, len(params["data"]["class_balance_confounds"]) + 1),
+        )
+        fig.suptitle(f"{key} sample (N={len(d_subjects)})")
+        for ax, c in zip(axes, params["data"]["class_balance_confounds"]):
+            sns.histplot(x=c, data=df, kde=True, ax=ax)
+        fig.savefig(output_dir / f"{key}.png")
+
+
+if __name__ == "__main__":
+    main()
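
For context, a minimal sketch of how the saved split might be consumed afterwards; the run directory is a hypothetical placeholder, and passing the IDs to load_ukbb_dset_path is an assumed usage, not code from this commit:

import json

from src.data.load_data import load_ukbb_dset_path

# Hypothetical Hydra output directory; the real path depends on the run.
with open("outputs/<run>/downstream_sample.json", "r") as f:
    sample = json.load(f)

# Build h5 dataset paths for the training subjects only.
h5_paths = load_ukbb_dset_path(
    participant_id=sample["train"],
    atlas_desc="atlas-MIST_desc-197",
    segment=1,
)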

src/data/load_data.py

Lines changed: 81 additions & 38 deletions

@@ -8,6 +8,7 @@
 import h5py
 import numpy as np
 import pandas as pd
+from general_class_balancer import general_class_balancer as gcb
 from nilearn.connectome import ConnectivityMeasure
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
@@ -93,32 +94,96 @@ def split_data_by_site(
     return tng_data, test_data
 
 
+def create_hold_out_sample(
+    phenotype_path: Union[Path, str],
+    phenotype_meta: Union[Path, str],
+    class_balance_confounds: List[str],
+    hold_out_set: float = 0.25,
+    random_state: int = 42,
+) -> Dict:
+    """Create experiment sample with patients in the hold-out set.
+
+    Args:
+        phenotype_path (Union[Path, str]): Path to the tsv file.
+            Column index 0 must be participant_id.
+        phenotype_meta (Union[Path, str]): Path to the json file.
+        class_balance_confounds (List[str]): List of confounds to
+            use for class balancing.
+        hold_out_set (float, optional): Proportion of the hold-out set
+            size in relation to the full sample. Defaults to 0.25.
+        random_state (int, optional): Random state for reproducibility.
+    Returns:
+        Dict: Dictionary with lists of participant IDs for the training
+            and hold-out sets, and the downstream task samples.
+    """
+    with open(phenotype_meta, "r") as f:
+        meta = json.load(f)
+
+    data = pd.read_csv(phenotype_path, sep="\t", index_col=0)
+
+    diagnosis_groups = list(meta["diagnosis"]["labels"].keys())
+    diagnosis_groups.remove("HC")
+
+    n_sample = data.shape[0]
+
+    # create a hold-out set for downstream analysis including all
+    # the patients
+    any_patients = data[diagnosis_groups].sum(axis=1) > 0
+    patients = list(data[any_patients].index)
+    controls = list(data[~any_patients].index)
+
+    n_patients = len(patients)
+    n_control = n_sample - n_patients
+    n_control_in_hold_out_set = int(n_sample * hold_out_set - n_patients)
+
+    corrected_hold_out_set = n_control_in_hold_out_set / n_control
+    controls_site = list(data[~any_patients]["site"])
+    train, hold_out = train_test_split(
+        controls,
+        test_size=corrected_hold_out_set,
+        random_state=random_state,
+        stratify=controls_site,
+    )
+    hold_out += patients
+
+    # get controls that match the patients' confounds
+    data_hold_out = data.loc[hold_out]
+    downstreams = {}
+    for d in diagnosis_groups:
+        select_sample = gcb.class_balance(
+            classes=data_hold_out[d].values.astype(int),
+            confounds=data_hold_out[class_balance_confounds].values.T,
+            plim=0.05,
+            random_seed=random_state,  # fix random seed for reproducibility
+        )
+        selected = data_hold_out.index[select_sample].tolist()
+        selected.sort()
+        downstreams[d] = selected
+    train.sort()
+    hold_out.sort()
+    return {
+        "train": train,
+        "hold_out": hold_out,
+        "test_downstreams": downstreams,
+    }
+
+
 def load_ukbb_dset_path(
-    path: Union[Path, str],
+    participant_id: List[str],
     atlas_desc: str,
-    n_sample: int = 50,
-    val_set: float = 0.25,
-    test_set: float = 0.25,
     segment: Union[int, List[int]] = -1,
-    random_state: int = 42,
 ) -> Dict:
-    """Load time series of UK Biobank.
+    """Load time series paths in the h5 file of UK Biobank.
 
     We segmented the time series per subject as independent samples,
     hence it's important to make sure the same subject is not in both
     the training and test sets.
 
     Args:
-        path (Union[Path, str]): Path to the hdf5 file.
+        participant_id (List[str]): List of participant IDs.
         atlas_desc (str): Regex pattern to look for suitable data,
            such as the right `desc` field for atlas,
            e.g., "atlas-MIST_desc-197".
-        n_sample (int, optional): number of subjects to use.
-            Defaults to 50, and -1 would take the full sample.
-        val_set (float, optional): proportion of the validation set
-            size in relation to the full sample. Defaults to 0.25.
-        test_set (float, optional): proportion of the test set size
-            in relation to the full sample. Defaults to 0.25.
        segment (Union[int, List[int]], optional): segments of the
            time series to use. 0 for the full time series.
            Defaults to -1 to load all four segments.
@@ -144,41 +209,19 @@ def load_ukbb_dset_path(
     elif segment <= 4:
         segment = [segment]
 
-    # get the participant IDs to use
-    with h5py.File(path, "r") as h5file:
-        participant_id = list(h5file["ukbb"].keys())
-
-    if n_sample == -1:
-        pass
-    elif n_sample < len(participant_id):
-        total_proportion_sample = n_sample / len(participant_id)
-        participant_id, _ = train_test_split(
-            participant_id,
-            test_size=(1 - total_proportion_sample),
-            random_state=random_state,
-        )
-
     # construct path
     subject_path_template = (
         "/ukbb/{sub}/{sub}_task-rest_{atlas_desc}_{seg}timeseries"
     )
-    data_list = []
+    h5_path = []
     for sub in participant_id:
         for seg in segment:
             seg = f"seg-{seg}_" if seg is not None else ""
             cur_sub_path = subject_path_template.format(
                 sub=sub, seg=seg, atlas_desc=atlas_desc
             )
-            data_list.append(cur_sub_path)
-    # train-test-val split
-    train, test = train_test_split(
-        data_list, test_size=test_set, random_state=random_state
-    )
-    # calculate the proportion of val_set in the training loop
-    train, val = train_test_split(
-        train, test_size=val_set / (1 - test_set), random_state=random_state
-    )
-    return {"train": train, "val": val, "test": test}
+            h5_path.append(cur_sub_path)
+    return h5_path
 
 
 def load_data(
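
Two pieces of the new logic are easiest to see with concrete numbers; the values and the subject ID below are illustrative assumptions, not data from the commit. All patients go into the hold-out set, so train_test_split only has to fill the remaining hold-out slots with controls, and the split fraction must be recomputed over controls alone:

# Hold-out correction with made-up numbers: 1000 participants,
# 100 patients, requested hold-out proportion 0.25.
n_sample, n_patients, hold_out_set = 1000, 100, 0.25
n_control = n_sample - n_patients                                      # 900
n_control_in_hold_out_set = int(n_sample * hold_out_set - n_patients)  # 150
corrected_hold_out_set = n_control_in_hold_out_set / n_control         # ~0.167

# Path template: for segment 1 and the 197-parcel MIST atlas, one entry
# of the list returned by load_ukbb_dset_path would look like this
# (the subject ID is hypothetical).
template = "/ukbb/{sub}/{sub}_task-rest_{atlas_desc}_{seg}timeseries"
print(template.format(sub="sub-01", seg="seg-1_", atlas_desc="atlas-MIST_desc-197"))
# /ukbb/sub-01/sub-01_task-rest_atlas-MIST_desc-197_seg-1_timeseries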
