
Commit 3c61cf5

run aishell with latest recipe
1 parent 76fef40 commit 3c61cf5

File tree

7 files changed: 839 additions & 0 deletions

egs/aishell/s10/conf/mfcc_hires.conf

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false   # use average of log energy, not energy.
--num-mel-bins=40    # similar to Google's setup.
--num-ceps=40        # there is no dimensionality reduction.
--low-freq=20        # low cutoff frequency for mel bins... this is high-bandwidth data, so
                     # there might be some information at the low end.
--high-freq=-400     # high cutoff frequency, relative to Nyquist of 8000 (=7600)
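
For context, this config is consumed through the --mfcc-config option of steps/make_mfcc.sh, as the feature-preparation script later in this commit does. A minimal usage sketch (the data directory name below is illustrative, not taken from this commit):

# data/train_hires is a placeholder directory name for illustration
steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \
  --cmd "$train_cmd" data/train_hires
steps/compute_cmvn_stats.sh data/train_hires
utils/fix_data_dir.sh data/train_hires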

egs/aishell/s10/conf/online_cmvn.conf

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
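
As a rough illustration of where this file is used (the stats path and read/write specifiers below are assumptions, not taken from this commit), apply-cmvn-online applies online cepstral mean normalization to a feature stream, reading its options from this config:

# paths and specifiers below are assumed for illustration only
apply-cmvn-online --config=conf/online_cmvn.conf exp/nnet3/extractor/global_cmvn.stats \
  scp:data/dev_hires/feats.scp ark:cmvn_feats.ark
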
Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
#!/bin/bash

set -e -o pipefail


# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually
# be called by more scripts).  It contains the common feature preparation and iVector-related parts
# of the script.  See those scripts for examples of usage.


stage=0
nj=30

train_set=train_cleaned   # you might set this to e.g. train.
gmm=tri3_cleaned          # This specifies a GMM-dir from the features of the type you're training the system on;
                          # it should contain alignments for 'train_set'.
online_cmvn_iextractor=false

num_threads_ubm=8
nnet3_affix=_cleaned      # affix for exp/nnet3 directory to put iVector stuff in, so it
                          # becomes exp/nnet3_cleaned or whatever.

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh


gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${train_set}_sp

for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done


# lowres features, alignments
if [ -f data/${train_set}_sp/feats.scp ] && [ $stage -le 2 ]; then
  echo "$0: data/${train_set}_sp/feats.scp already exists. Refusing to overwrite the features "
  echo " to avoid wasting time. Please remove the file and continue if you really mean this."
  exit 1;
fi

if [ $stage -le 1 ]; then
  echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
  utils/data/perturb_data_dir_speed_3way.sh \
    data/${train_set} data/${train_set}_sp

  for datadir in ${train_set}_sp dev test; do
    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
  done
fi

if [ $stage -le 2 ]; then
  echo "$0: making MFCC features for low-resolution speed-perturbed data"
  steps/make_mfcc.sh --nj $nj \
    --cmd "$train_cmd" data/${train_set}_sp
  steps/compute_cmvn_stats.sh data/${train_set}_sp
  echo "$0: fixing input data-dir to remove nonexistent features, in case some "
  echo ".. speed-perturbed segments were too short."
  utils/fix_data_dir.sh data/${train_set}_sp
fi

if [ $stage -le 3 ]; then
  if [ -f $ali_dir/ali.1.gz ]; then
    echo "$0: alignments in $ali_dir appear to already exist. Please either remove them "
    echo " ... or use a later --stage option."
    exit 1
  fi
  echo "$0: aligning with the perturbed low-resolution data"
  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
    data/${train_set}_sp data/lang $gmm_dir $ali_dir
fi


if [ $stage -le 5 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then
  echo "$0: data/${train_set}_sp_hires/feats.scp already exists."
  echo " ... Please either remove it, or rerun this script with stage > 5."
  exit 1
fi

if [ $stage -le 5 ]; then
  echo "$0: creating high-resolution MFCC features"

  # this shows how you can split across multiple file-systems.  we'll split the
  # MFCC dir across multiple locations.  You might want to be careful here, if you
  # have multiple copies of Kaldi checked out and run the same recipe, not to let
  # them overwrite each other.
  mfccdir=data/${train_set}_sp_hires/data
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
    utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/tedlium-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
  fi

  # do volume-perturbation on the training data prior to extracting hires
  # features; this helps make trained nnets more invariant to test data volume.
  utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires

  for datadir in ${train_set}_sp dev test; do
    steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \
      --cmd "$train_cmd" data/${datadir}_hires
    steps/compute_cmvn_stats.sh data/${datadir}_hires
    utils/fix_data_dir.sh data/${datadir}_hires
  done
fi

if [ $stage -le 6 ]; then
  echo "$0: computing a subset of data to train the diagonal UBM."

  mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
  temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm

  # train a diagonal UBM using a subset of about a quarter of the data
  num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
  num_utts=$[$num_utts_total/4]
  utils/data/subset_data_dir.sh data/${train_set}_sp_hires \
    $num_utts ${temp_data_root}/${train_set}_sp_hires_subset

  echo "$0: computing a PCA transform from the hires data."
  steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
    --splice-opts "--left-context=3 --right-context=3" \
    --max-utts 10000 --subsample 2 \
    ${temp_data_root}/${train_set}_sp_hires_subset \
    exp/nnet3${nnet3_affix}/pca_transform

  echo "$0: training the diagonal UBM."
  # Use 512 Gaussians in the UBM.
  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
    --num-frames 700000 \
    --num-threads $num_threads_ubm \
    ${temp_data_root}/${train_set}_sp_hires_subset 512 \
    exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
fi

if [ $stage -le 7 ]; then
  # Train the iVector extractor.  Use all of the speed-perturbed data since iVector extractors
  # can be sensitive to the amount of data.  The script defaults to an iVector dimension of 100.
  echo "$0: training the iVector extractor"
  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 15 \
    --num-threads 4 --num-processes 2 \
    --online-cmvn-iextractor $online_cmvn_iextractor \
    data/${train_set}_sp_hires exp/nnet3${nnet3_affix}/diag_ubm \
    exp/nnet3${nnet3_affix}/extractor || exit 1;
fi

if [ $stage -le 8 ]; then
  # note, we don't encode the 'max2' in the name of the ivectordir even though
  # that's the data we extract the ivectors from, as it's still going to be
  # valid for the non-'max2' data; the utterance list is the same.
  ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
    utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/ivectors/tedlium-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
  fi
  # We now extract iVectors on the speed-perturbed training data.  With
  # --utts-per-spk-max 2, the script pairs the utterances into twos, and treats
  # each of these pairs as one speaker; this gives more diversity in iVectors.
  # Note that these are extracted 'online' (they vary within the utterance).

  # Having a larger number of speakers is helpful for generalization, and to
  # handle per-utterance decoding well (the iVector starts at zero at the beginning
  # of each pseudo-speaker).
  temp_data_root=${ivectordir}
  utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
    data/${train_set}_sp_hires ${temp_data_root}/${train_set}_sp_hires_max2

  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
    ${temp_data_root}/${train_set}_sp_hires_max2 \
    exp/nnet3${nnet3_affix}/extractor $ivectordir

  # Also extract iVectors for the test data, but in this case we don't need the speed
  # perturbation (sp) or small-segment concatenation (comb).
  for data in dev test; do
    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj "$nj" \
      data/${data}_hires exp/nnet3${nnet3_affix}/extractor \
      exp/nnet3${nnet3_affix}/ivectors_${data}_hires
  done
fi


exit 0;
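
The header comment above says this script is invoked from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh. A hedged sketch of such a call, matching the options declared at the top of the script (the script path and the concrete option values are assumptions for illustration, not taken from this commit):

# path and option values below are assumed for illustration only
local/nnet3/run_ivector_common.sh --stage 0 --nj 30 \
  --train-set train_cleaned --gmm tri3_cleaned \
  --nnet3-affix "_cleaned" --num-threads-ubm 8

utils/parse_options.sh maps these flags onto the stage, nj, train_set, gmm, nnet3_affix and num_threads_ubm variables defined above.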
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
#!/bin/bash

# Copyright 2016  Vimal Manohar
#           2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script demonstrates how to re-segment training data, selecting only the
# "good" audio that matches the transcripts.
# The basic idea is to decode with an existing in-domain acoustic model, and a
# biased language model built from the reference, and then work out the
# segmentation from a ctm-like file.

# For nnet3 and chain results after cleanup, see the scripts in
# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh

# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets
# [will add these later].

set -e
set -o pipefail
set -u

stage=0
cleanup_stage=0
data=data/train
cleanup_affix=cleaned
srcdir=exp/tri3
nj=100
decode_nj=16
decode_num_threads=4

. ./path.sh
. ./cmd.sh
. utils/parse_options.sh

cleaned_data=${data}_${cleanup_affix}

dir=${srcdir}_${cleanup_affix}_work
cleaned_dir=${srcdir}_${cleanup_affix}

if [ $stage -le 1 ]; then
  # This does the actual data cleanup.
  steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \
    $data data/lang $srcdir $dir $cleaned_data
fi

if [ $stage -le 2 ]; then
  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
    $cleaned_data data/lang $srcdir ${srcdir}_ali_${cleanup_affix}
fi

if [ $stage -le 3 ]; then
  steps/train_sat.sh --cmd "$train_cmd" \
    5000 100000 $cleaned_data data/lang ${srcdir}_ali_${cleanup_affix} ${cleaned_dir}
fi

if [ $stage -le 4 ]; then
  # Test with the models trained on cleaned-up data.
  utils/mkgraph.sh data/lang ${cleaned_dir} ${cleaned_dir}/graph

  for dset in dev test; do
    steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \
      --cmd "$decode_cmd" \
      ${cleaned_dir}/graph data/${dset} ${cleaned_dir}/decode_${dset}
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
      data/${dset} ${cleaned_dir}/decode_${dset} ${cleaned_dir}/decode_${dset}_rescore
  done
fi
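
The defaults produced here (data/train_cleaned and exp/tri3_cleaned) line up with the train_set and gmm defaults of the feature/iVector preparation script earlier in this commit, so the two scripts are meant to be chained. A hedged sketch of that flow (both script paths are assumptions for illustration, not shown in this commit):

# script paths below are assumed for illustration only
local/run_cleanup_segmentation.sh --nj 100
local/nnet3/run_ivector_common.sh --train-set train_cleaned --gmm tri3_cleaned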
