#!/bin/bash

set -e -o pipefail


# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may
# eventually be called by more scripts).  It contains the common feature-preparation and
# iVector-related parts of those scripts.  See those scripts for examples of usage.
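# A typical invocation (hypothetical flag values; see the calling scripts above
# for real ones), assuming this file lives at local/nnet3/run_ivector_common.sh:
#   local/nnet3/run_ivector_common.sh --stage 0 --nj 30 \
#     --train-set train_cleaned --gmm tri3_cleaned --nnet3-affix _cleaned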


stage=0
nj=30

train_set=train_cleaned  # you might set this to e.g. train.
gmm=tri3_cleaned         # The GMM directory, built on the same type of features you're
                         # training the system on; it should contain alignments for 'train_set'.
online_cmvn_iextractor=false

num_threads_ubm=8
nnet3_affix=_cleaned  # affix for exp/nnet3 directory to put iVector stuff in, so it
                      # becomes exp/nnet3_cleaned or whatever.

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh
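# parse_options.sh turns command-line flags into assignments to the variables
# declared above, e.g. "--stage 3" sets stage=3 and "--online-cmvn-iextractor true"
# sets online_cmvn_iextractor=true.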


gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${train_set}_sp

for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done


# low-resolution features, alignments
if [ -f data/${train_set}_sp/feats.scp ] && [ $stage -le 2 ]; then
  echo "$0: data/${train_set}_sp/feats.scp already exists.  Refusing to overwrite the features"
  echo " to avoid wasting time.  Please remove the file and rerun if you really mean this."
  exit 1;
fi

if [ $stage -le 1 ]; then
  echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
  utils/data/perturb_data_dir_speed_3way.sh \
    data/${train_set} data/${train_set}_sp
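  # This creates 3 copies of the data, at speed factors 0.9, 1.0 and 1.1; the
  # perturbed copies get utterance-id prefixes like "sp0.9-" and "sp1.1-".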

  for datadir in ${train_set}_sp dev test; do
    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
  done
fi

if [ $stage -le 2 ]; then
  echo "$0: making MFCC features for low-resolution speed-perturbed data"
  steps/make_mfcc.sh --nj $nj \
    --cmd "$train_cmd" data/${train_set}_sp
  steps/compute_cmvn_stats.sh data/${train_set}_sp
  echo "$0: fixing input data-dir to remove nonexistent features, in case some"
  echo " .. speed-perturbed segments were too short."
  utils/fix_data_dir.sh data/${train_set}_sp
fi

if [ $stage -le 3 ]; then
  if [ -f $ali_dir/ali.1.gz ]; then
    echo "$0: alignments in $ali_dir appear to already exist.  Please either remove them"
    echo " ... or use a later --stage option."
    exit 1
  fi
  echo "$0: aligning with the perturbed low-resolution data"
  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
    data/${train_set}_sp data/lang $gmm_dir $ali_dir
fi
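# steps/align_fmllr.sh writes the alignments as gzipped archives (ali.*.gz) in
# $ali_dir; they later serve as supervision for the nnet3/chain training scripts
# that call this one.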


if [ $stage -le 5 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then
  echo "$0: data/${train_set}_sp_hires/feats.scp already exists."
  echo " ... Please either remove it, or rerun this script with stage > 5."
  exit 1
fi

if [ $stage -le 5 ]; then
  echo "$0: creating high-resolution MFCC features"

  # This shows how you can split the MFCC dir across multiple file-systems.
  # You might want to be careful here, if you have multiple copies of Kaldi
  # checked out and run the same recipe, not to let them overwrite each other.
  mfccdir=data/${train_set}_sp_hires/data
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
    utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/tedlium-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
  fi
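  # create_split_dir.pl creates the actual storage directories on the listed
  # filesystems and populates $mfccdir/storage with symlinks into them, so the
  # feature archives get spread across disks.  The pattern above is specific to
  # the CLSP grid at JHU; adapt or drop it on other setups.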

  # do volume-perturbation on the training data prior to extracting hires
  # features; this helps make trained nnets more invariant to test data volume.
  utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires
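  # perturb_data_dir_volume.sh rewrites wav.scp to pipe each recording through a
  # random volume scaling with sox; if I recall its defaults correctly, the scale
  # factor is drawn between roughly 0.125 and 2.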

  for datadir in ${train_set}_sp dev test; do
    steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \
      --cmd "$train_cmd" data/${datadir}_hires
    steps/compute_cmvn_stats.sh data/${datadir}_hires
    utils/fix_data_dir.sh data/${datadir}_hires
  done
fi

if [ $stage -le 6 ]; then
  echo "$0: computing a subset of data to train the diagonal UBM."

  mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
  temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm

  # train a diagonal UBM using a subset of about a quarter of the data
  num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
  num_utts=$[$num_utts_total/4]
  utils/data/subset_data_dir.sh data/${train_set}_sp_hires \
    $num_utts ${temp_data_root}/${train_set}_sp_hires_subset
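  # e.g. if utt2spk has 100000 lines, num_utts comes out to 25000; the exact
  # count isn't critical, the UBM just needs a representative sample of frames.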

  echo "$0: computing a PCA transform from the hires data."
  steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
    --splice-opts "--left-context=3 --right-context=3" \
    --max-utts 10000 --subsample 2 \
    ${temp_data_root}/${train_set}_sp_hires_subset \
    exp/nnet3${nnet3_affix}/pca_transform
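  # The splice options mean each frame is spliced together with 3 frames of left
  # and right context (7 frames total) before the PCA, which gives a decorrelated
  # input for the UBM and iVector extractor below.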

  echo "$0: training the diagonal UBM."
  # Use 512 Gaussians in the UBM.
  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
    --num-frames 700000 \
    --num-threads $num_threads_ubm \
    ${temp_data_root}/${train_set}_sp_hires_subset 512 \
    exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
fi

if [ $stage -le 7 ]; then
  # Train the iVector extractor.  Use all of the speed-perturbed data, since
  # iVector extractors can be sensitive to the amount of data.  The script
  # defaults to an iVector dimension of 100.
  echo "$0: training the iVector extractor"
  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 15 \
    --num-threads 4 --num-processes 2 \
    --online-cmvn-iextractor $online_cmvn_iextractor \
    data/${train_set}_sp_hires exp/nnet3${nnet3_affix}/diag_ubm \
    exp/nnet3${nnet3_affix}/extractor || exit 1;
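  # Note: the parallelism here multiplies out to nj * num-threads * num-processes
  # = 15 * 4 * 2 = 120 CPUs at once; reduce --nj if your grid or machine cannot
  # accommodate that.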
fi

if [ $stage -le 8 ]; then
  # Note: we don't encode the 'max2' in the name of the ivectordir even though
  # that's the data we extract the ivectors from, as it's still going to be
  # valid for the non-'max2' data; the utterance list is the same.
  ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
    utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/ivectors/tedlium-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
  fi
  # We now extract iVectors on the speed-perturbed training data.  With
  # --utts-per-spk-max 2, the script pairs the utterances into twos, and treats
  # each of these pairs as one speaker; this gives more diversity in iVectors.
  # Note that these are extracted 'online' (they vary within the utterance).

  # Having a larger number of speakers is helpful for generalization, and to
  # handle per-utterance decoding well (the iVector starts at zero at the
  # beginning of each pseudo-speaker).
  temp_data_root=${ivectordir}
  utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
    data/${train_set}_sp_hires ${temp_data_root}/${train_set}_sp_hires_max2
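  # For example, a speaker with four utterances is split into two pseudo-speakers
  # of two utterances each; only utt2spk/spk2utt change, the features themselves
  # are untouched.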

  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
    ${temp_data_root}/${train_set}_sp_hires_max2 \
    exp/nnet3${nnet3_affix}/extractor $ivectordir
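  # The output lands in $ivectordir as ivector_online.scp; iVector estimates are
  # written once every ivector-period frames (10 by default, if memory serves)
  # rather than once per utterance.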

  # Also extract iVectors for the test data, but in this case we don't need the
  # speed perturbation (sp) or small-segment concatenation (comb).
  for data in dev test; do
    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj "$nj" \
      data/${data}_hires exp/nnet3${nnet3_affix}/extractor \
      exp/nnet3${nnet3_affix}/ivectors_${data}_hires
  done
fi


exit 0;