
[src] Lots of changes: first stab at kaldi10 (non-compatible version of kaldi) #3083

Status: Open. This pull request wants to merge 176 commits into the base branch, master.
The diff section below shows changes from 1 commit; the full list of 176 commits follows first.
f5f02d1
[src] Lots of changes: first stab at kaldi10 (non-compatible version …
danpovey Mar 10, 2019
cc1d251
Merge master into kaldi10 (#3105)
desh2608 Mar 14, 2019
f28516a
[src] Add Vector strides, beginning draft of tensor stuff (#3120)
danpovey Mar 15, 2019
b9efc54
Merge with master branch
desh2608 Mar 15, 2019
f93749a
[src] More work on tensor library draft in kaldi10 (#3124)
danpovey Mar 17, 2019
a3eeb7c
merged 'master' into kaldi10 and resolved conflicts
desh2608 Mar 17, 2019
f59489f
Merge pull request #3129 from desh2608/kaldi10_new
danpovey Mar 17, 2019
21a3913
Kaldi10 (#3131)
danpovey Mar 17, 2019
eca0e80
[src] More drafting of tensor related stuff (#3132)
danpovey Mar 18, 2019
63e35b0
[src] completed stride support for kaldi-vector (#3146)
YiwenShaoStephen Mar 20, 2019
c4a326e
Some cleanups in matrix/; more work on tensor draft (#3150)
danpovey Mar 20, 2019
4cab3db
Merged with 'master' (#3156)
desh2608 Mar 21, 2019
910ec50
[src] More tensor draft stuff; add simple test of vector stride
danpovey Mar 22, 2019
9bba411
Merge pull request #3161 from danpovey/kaldi10
danpovey Mar 22, 2019
5fa86ad
Kaldi10 (#3167)
danpovey Mar 26, 2019
7acef2a
Kaldi10: Implement topology (#3169)
hhadian Mar 26, 2019
26edaf6
Kaldi10: Add missing file (from PR #3169) + minor fixes (#3170)
hhadian Mar 26, 2019
5117c63
merged with master
desh2608 Mar 26, 2019
fe57b4a
Merge pull request #3173 from desh2608/kaldi10_merge
danpovey Mar 26, 2019
d6634f7
Implement most of transitins.cc (#3184)
hhadian Mar 29, 2019
20c73b5
[src] Kaldi10 changes: remove vector strides, more tensor progress. (…
danpovey Mar 31, 2019
885249a
[src] Kaldi10, more tensor progress (#3189)
danpovey Mar 31, 2019
e36034e
Implement the rest of transitions.cc + tests (#3198)
hhadian Apr 2, 2019
f8adced
Remove lingering HTK support fully. (#3201)
galv Apr 3, 2019
6d5c87b
Fix compilation of posterior.cc (#3200)
galv Apr 3, 2019
8fa9d18
Kaldi10: more tensor drafting. (#3246)
danpovey Apr 18, 2019
4a6b739
[src] Tensor progress
danpovey Apr 20, 2019
99873c6
[src] Further progress
danpovey Apr 22, 2019
493efff
[src] Lots more progress, still in flux
danpovey Apr 24, 2019
2a85204
[src,egs] Update code related to pdf-class to be 1-based. (#3278)
hhadian Apr 28, 2019
af6a30b
[src] More drafting on tensor code
danpovey Apr 30, 2019
3647d2d
[kaldi10] WIP hmm-utils.cc
galv Apr 16, 2019
f0e32f9
Clean up based on feedback.
galv Apr 17, 2019
a8b5580
hmm-utils.cc: Everything compiles except for AddSelfLoops.
galv Apr 18, 2019
af91c73
hmm-utils.cc compiles, except for a bizarre problem with the copy-ass…
galv Apr 18, 2019
25ebce5
Successfully compile all binaries other than cuda-gpu-available.cc
galv May 2, 2019
c0b6042
[src] Further progress
danpovey May 3, 2019
82ec8b7
Merge pull request #3290 from danpovey/kaldi10
danpovey May 3, 2019
24a85c9
[src] Further progress
danpovey May 4, 2019
d588595
Merge pull request #3292 from danpovey/kaldi10
danpovey May 4, 2019
3b2fee1
[src] further tensor progress
danpovey May 4, 2019
a751cf4
[src] Tensor progress; rename some files
May 5, 2019
670b2d5
[src] Small changes
May 5, 2019
cf95fe4
[src] Add definition
danpovey May 5, 2019
26121fc
Merge pull request #3294 from danpovey/kaldi10
danpovey May 5, 2019
5d5d387
[src] Various progress
danpovey May 12, 2019
f6e9281
[src] Small tensor changes prior to rewrite
danpovey May 17, 2019
931496f
[src] Some major changes in rough draft
danpovey May 24, 2019
3c3d9a9
[src] Lots of tensor changes
danpovey May 31, 2019
3dd5e7e
[src] Refactoring of matrix directory to separate out the cblas wrapp…
danpovey Jun 17, 2019
b254c83
[src] Add more things to cblasext
danpovey Jun 17, 2019
f566e81
[scripts] Fix non-randomness in getting utt2uniq, introduced in #3142…
desh2608 Mar 27, 2019
560594e
[build] Don't build for Tegra sm_XX versions on x86/ppc and vice vers…
luitjens Mar 27, 2019
4264512
[egs] Fixes Re encoding to IAM, uw3 recipes (#3012)
aarora8 Mar 29, 2019
6787282
[src] Efficiency improvement and extra checking for cudamarix, RE def…
luitjens Mar 30, 2019
8a1acde
[egs] Fix small typo in tedlium download script (#3178)
Shujian2015 Mar 30, 2019
5f00d0d
[github] Add GitHub issue templates (#3187)
Mar 31, 2019
6e998a9
[build] Add missing dependency to Makefile (#3191)
danpovey Mar 31, 2019
bf0af1d
[src] Fix bug in pruned lattice rescoring when input lattice has epsi…
hainan-xv Apr 1, 2019
7371a95
[scripts] Fix bug in extend_lang.sh regarding extra_disambig.txt (#3195)
armusc Apr 2, 2019
32496b4
[egs] Update Tedlium s5_r3 example with more up-to-date chain TDNN co…
jyhnnhyj Apr 3, 2019
43ba4f2
[scripts] Fix bug in extend_lang.sh causing validation failure w/ ext…
jty016 Apr 3, 2019
c737d94
[scripts] Bug-fix in make_lexicon_fst.py, which failed when --sil-pro…
armusc Apr 4, 2019
57d63cc
[egs] Fix very small typo in run_tdnn_1b.sh (#3207)
Shujian2015 Apr 4, 2019
9393b66
[build] Tensorflow version update (#3204)
langep Apr 4, 2019
4efc486
[src] Optimizations to CUDA kernels (#3209)
kangshiyin Apr 6, 2019
59523dc
[src] Move curand handle out of CuRand class and into CuDevice. (#3196)
luitjens Apr 7, 2019
da729a5
[build] Make MKL the default BLAS library, add installation scripts (…
Apr 7, 2019
c8ada0c
[build] check for i686 as a valid prefix for Android triplets (#3213)
Dr-Desty-Nova Apr 7, 2019
e1ac00f
[build] Fix configure breakage from #3194 (MKL default)
Apr 9, 2019
c54b5e5
[build] Add missing line continuation '\' in tfrnnlmbin/Makefile (#3218)
teinhonglo Apr 10, 2019
519493f
[src] Fix nnet2 DctComponent test failure (#3225)
huangruizhe Apr 12, 2019
d7685cb
[src] Update CUDA code to avoid synchronization errors on compute ca…
kangshiyin Apr 12, 2019
cbdb930
[src] fix nnet2 DCTCompnent test failure -- removing anther dct_keep_…
huangruizhe Apr 12, 2019
d22530f
[build] Remove references to deprecated MKL libs in gst_plugin (#3229)
Apr 14, 2019
e0cce5b
[scripts] Fix default params in nnet3 segmentation script (#3230)
rezame Apr 14, 2019
cbd1aa3
[src] Correct sanity check in nnet-example-utils.cc (nnet3) (#3232)
KarelVesely84 Apr 16, 2019
a12ee73
Revert "[src] Update CUDA code to avoid synchronization errors on co…
danpovey Apr 16, 2019
b2f9c54
[build] .gitignore autogenerated /tools/python/ (#3241)
mcalahan Apr 17, 2019
3642739
[scripts] Enhance argument checks in nnet3/align_lats.sh (#3243)
Apr 18, 2019
507145f
[egs] invoke 'python2.7' not 'python' when using mmseg (#3244)
naxingyu Apr 18, 2019
db8ed5b
[scripts] Make getting nnet3 model context more robust (#3247)
KarelVesely84 Apr 18, 2019
f8de5a8
[egs] Fix hkust_data_prep.sh w.r.t. iconv mac compatibility issue (#3…
zh794390558 Apr 19, 2019
68ad4e9
[egs] Update RM chain recipe with more recent configuration (#3237)
indra622 Apr 19, 2019
4831a66
[egs] Make voxceleb recipe work with latest version of the dataset (…
sunshines14 Apr 19, 2019
0534e49
[egs] Improve chain example script for Resource Management (RM) (#3252)
indra622 Apr 21, 2019
db2ed32
[src] GPU-related changes for speed and correctness on newer arch's. …
luitjens Apr 22, 2019
479c732
[egs] Update voxceleb v1 preparation scripts (#3255)
jyhnnhyj Apr 23, 2019
8c197b4
[build] Note default=MKL; cosmetic fix (#3257)
nshmyrev Apr 23, 2019
56dc8d9
[egs] Fix to hkust_data_prep.sh w.r.t. how mmseg is checked for (#3240)
zh794390558 Apr 23, 2019
16c9270
[egs] In WSJ run_ivector_common.sh, expose i-vector #jobs config to r…
KarelVesely84 Apr 23, 2019
57205cf
[egs] Add Spanish dimex100 example (#3254)
alx741 Apr 23, 2019
a756df2
[build] Build and configure OpenBLAS; default to it on non-x64 machin…
Apr 25, 2019
121dbbe
[scripts] Fix of a bug in segmentation.pl (#3256)
songyf Apr 25, 2019
a0b6f3f
[src] Fixes to cuda unit tests. (#3268)
luitjens Apr 25, 2019
c415cba
[src] Adding GPU/CUDA lattice batched decoder + binary (#3114)
hugovbraun Apr 26, 2019
4231107
[src] Fix unit-test failure UnitTestCuMatrixSetRandn() (#3274)
DongjiGao Apr 27, 2019
25c7289
[src,build] Removed cusolver for now (not needed yet; caused build p…
huangruizhe Apr 27, 2019
e3abc65
[scripts] Make fix_data_dir.sh remove utterances which have bad durat…
hhadian Apr 30, 2019
7a93e7f
[scripts] Make generate_plots.py python3-compatible (#3280)
May 1, 2019
c9a1257
[scripts] Add --one-based option to split_scp.pl (#3279)
xsawyerx May 1, 2019
aae8be4
[scripts] Allow UTF utterance-ids by removing unnecessary assert (#3283)
rezame May 1, 2019
803e3ee
[src] Keep nnet output in the [-30,30] range required by chain denomi…
danpovey May 2, 2019
b44f708
[scripts] Clean up filehandle usage in split_scp.pl (#3285)
xsawyerx May 2, 2019
7055784
[src] Fix to bug in online-feature.cc that caused crash at end of utt…
danpovey May 2, 2019
1bcea23
[scripts] Use correct compile-time regex syntax in split_scp.pl (#3287)
xsawyerx May 2, 2019
bfbe861
[scripts] Fix a typo in steps/dict/learn_lexicon_bayesian.sh (#3288)
xiaohui-zhang May 2, 2019
61b2347
[egs,scripts] Scripts and an example of BPE-based sub-word decoding (…
DongjiGao May 5, 2019
49bccbb
[scripts] Add trainer option --trainer.optimization.num-jobs-step (#3…
May 7, 2019
8209d18
[egs] Add MGB-5 recipe; https://arabicspeech.org/mgb5 (#3299)
May 8, 2019
5fbc9eb
Revert "[scripts] Clean up filehandle usage in split_scp.pl (#3285)" …
danpovey May 9, 2019
b78d92e
[src] Fix bug in GeneralMatrix::Uncompress() (#3304)
bringtree May 9, 2019
fee2acd
[doc] add an omission in Doxyfile (#3309)
May 10, 2019
de81d0c
[scripts] Fix utils/split_scp.pl breakage (#3308)
May 10, 2019
3453b5a
[egs] Bug-fix to shebang in fisher_callhome_spanish (#3312)
saikiranvalluri May 11, 2019
5ca7f58
[scripts] Fix error messages in run.pl (#3314)
May 11, 2019
e2dc9c3
[egs] New chime-5 recipe (#2893)
vimalmanohar May 12, 2019
e330320
[scripts,egs] Made changes to the augmentation script to make it work…
phanisankar-nidadavolu May 13, 2019
2826b35
[egs] updated local/musan.sh to steps/data/make_musan.sh in speaker i…
phanisankar-nidadavolu May 13, 2019
c695bbc
[src] Fix sample rounding errors in extract-segments (#3321)
May 14, 2019
cfa48eb
[src,scripts]Store frame_shift, utt2{dur,num_frames}, .conf with feat…
May 14, 2019
a1343bd
[build] Initial version of Docker images for (CPU and GPU versions) (…
mdoulaty May 15, 2019
9569384
[scripts] fix typo/bug in make_musan.py (#3327)
May 15, 2019
94aef8d
[scripts] Trust frame_shift and utt2num_frames if found (#3313)
May 16, 2019
9ae4a5c
[scripts] typo fix in augmentation script (#3329)
May 16, 2019
74ebdee
[scripts] handle frame_shit and utt2num_frames in utils/ (#3323)
May 16, 2019
d1c49bf
[scripts] Extend combine_ali_dirs.sh to combine alignment lattices (#…
May 17, 2019
bcfcad7
[src] Fix rare case when segment end rounding overshoots file end in …
alumae May 17, 2019
264372c
[scripts] Change --modify-spk-id default to False; back-compatibility…
phanisankar-nidadavolu May 20, 2019
485c248
[build] Add easier configure option in failure message of configure (…
danpovey May 20, 2019
e3ece34
[scripts,minor] Fix typo in comment (#3338)
Shujian2015 May 22, 2019
d03c16e
[src,egs] Add option for applying SVD on trained models (#3272)
saikiranvalluri May 23, 2019
33a16d8
[src] Add interfaces to nnet-batch-compute that expects device input.…
luitjens May 23, 2019
1e8260b
[build] Update GCC support check for CUDA toolkit 10.1 (#3345)
entn-at May 27, 2019
10bb5de
[egs] Fix to aishell1 v1 download script (#3344)
naxingyu May 27, 2019
d8d3b86
[scripts] Support utf-8 files in some scripts (#3346)
vimalmanohar May 28, 2019
75a69d9
[scripts]: add warning to nnet3/chain/train.py about ineffective opti…
bringtree May 28, 2019
448c876
[src] Misc tensor progress
danpovey Jun 3, 2019
5937fae
[src] small change
danpovey Jun 5, 2019
602ae12
[src] tensor progress
danpovey Jun 10, 2019
32101ba
[src] Change name from kGpuDevice to kCudaDevice
danpovey Jun 10, 2019
33c36bb
[src] More tensor progress
danpovey Jun 14, 2019
b247f30
[src] Progress on standard cuda kernels for tensor directory
danpovey Jun 15, 2019
553f4a8
[src] TEnsor progress.
danpovey Jun 19, 2019
c188496
[src] Merge upstream (may be other merges going on here too.)
danpovey Jun 19, 2019
689a42c
Merge branch 'kaldi10-hmm-utils' of https://github.com/galv/kaldi int…
danpovey Jun 19, 2019
935b151
[src] Minor changes / fixes
danpovey Jun 19, 2019
aa499a3
Merge remote-tracking branch 'origin/kaldi10-temp' into kaldi10
danpovey Jun 19, 2019
1b4dec7
[build] Add missing Makefile
danpovey Jun 19, 2019
c349ef5
[src] Changes to make more things compile
danpovey Jun 20, 2019
ebc6f83
[src] Partial changes to cudafeat, giving up for now
danpovey Jun 20, 2019
5b0c098
[src] Various changes to get it to compile
danpovey Jun 22, 2019
038ea06
[src] Bug-fixes/rewrites to fix test failures in hmm-utils-test
danpovey Jun 22, 2019
42942f5
[src] Various changes to make test pass
danpovey Jun 23, 2019
9dd4f63
[src] One last fix to make tests pass
danpovey Jun 23, 2019
a9c96f6
[src] Changing numbering of pattern preconditions
danpovey Jul 3, 2019
57a8d0e
[scripts,egs] Removing no-longer-existing options like --transition-s…
danpovey Jul 10, 2019
f4b8f53
[src] Various fixes
danpovey Jul 10, 2019
d91a020
[src] Add back lattice-add-trans-probs
danpovey Jul 10, 2019
10400f4
[src,scripts] Various fixes related to kaldi10 topo changes
danpovey Jul 11, 2019
10663ad
[src,scripts] Fixes to kaldi10 branch to make things work
danpovey Jul 14, 2019
252fedf
[build] Add missing SUBDIR to Makefile (#3466)
naxingyu Jul 15, 2019
bf16577
[src] restore cuda-compiled to kaldi10 (#3471)
naxingyu Jul 17, 2019
5a30b71
Add feature transform; remove train transition (#3474)
naxingyu Jul 18, 2019
db6b23d
[src] Fixes RE unusual topologies
danpovey Jul 18, 2019
1cbb691
[src] Fixes RE unusual topologies (#3478)
danpovey Jul 18, 2019
6ba25b5
[src] Fixes RE unusual topologies (#3481)
danpovey Jul 19, 2019
7a53503
[src] Fixes RE unusual topologies (#3480)
danpovey Jul 22, 2019
3a1e523
Kaldi10 feature-changes + attention/transformer scripts (#3562)
danpovey Sep 3, 2019
5514685
Merged with master but have not cleaned up its effects yet.
danpovey Sep 30, 2019
d844498
Properly merge online2bin dir from master (previously accidentally lo…
danpovey Sep 30, 2019
85cbf75
Add link kaldi->src, will eventually move the dir to be named 'kaldi'
danpovey Oct 1, 2019
312e687
Various kaldi10 fixes after merge
danpovey Oct 2, 2019
234d00d
Merge some previous fixes in (CAUTION: there was something about kNoL…
danpovey Oct 2, 2019
4b8bab0
[src] Clarification in comment
danpovey Oct 2, 2019
b322275
Fix some comments
danpovey Dec 11, 2019
[egs,scripts] Scripts and an example of BPE-based sub-word decoding (#…
DongjiGao authored and danpovey committed Jun 19, 2019
commit 61b2347d421e383de9d7a6cf32f033012c625185
4 changes: 4 additions & 0 deletions egs/gale_arabic/s5c/RESULT
@@ -0,0 +1,4 @@
%WER 41.98 [ 29249 / 69668, 2672 ins, 5990 del, 20587 sub ] exp/tri1_subword/decode/wer_15_0.0
%WER 37.66 [ 26239 / 69668, 2660 ins, 5255 del, 18324 sub ] exp/tri2b_subword/decode/wer_17_0.0
%WER 35.26 [ 24565 / 69668, 2879 ins, 4892 del, 16794 sub ] exp/tri3b_subword/decode/wer_17_0.5
%WER 17.29 [ 12049 / 69668, 1244 ins, 2758 del, 8047 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.5
15 changes: 15 additions & 0 deletions egs/gale_arabic/s5c/cmd.sh
@@ -0,0 +1,15 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances 'queue.pl' to run.pl (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="retry.pl queue.pl --mem 2G"
export decode_cmd="retry.pl queue.pl --mem 4G"
export mkgraph_cmd="retry.pl queue.pl --mem 8G"
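As the comments above note, these exports can point at run.pl when no queueing system is available. A minimal local-machine variant of this cmd.sh (a sketch, not part of the recipe) might look like:

# Hypothetical local-machine setup: run.pl executes every job on this host and
# does not submit to GridEngine/slurm or enforce the memory limits that
# queue.pl would request, so reduce --nj in the calling scripts if needed.
export train_cmd="run.pl"
export decode_cmd="run.pl"
export mkgraph_cmd="run.pl"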
1 change: 1 addition & 0 deletions egs/gale_arabic/s5c/conf/decode.config
@@ -0,0 +1 @@
link decode_dnn.config
1 change: 1 addition & 0 deletions egs/gale_arabic/s5c/conf/mfcc.conf
@@ -0,0 +1 @@
--use-energy=false # only non-default option.
10 changes: 10 additions & 0 deletions egs/gale_arabic/s5c/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--sample-frequency=16000
--num-mel-bins=40
--num-ceps=40
--low-freq=40 # low cutoff frequency for mel bins
--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
1 change: 1 addition & 0 deletions egs/gale_arabic/s5c/conf/online_cmvn.conf
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh
10 changes: 10 additions & 0 deletions egs/gale_arabic/s5c/local/bad_segments
@@ -0,0 +1,10 @@
ARABIYA_FROMIRAQ_ARB_20070302_175801_2326286_2327450
ARABIYA_BILARABI_ARB_20061005_201400_221375_223694
LBC_NAHAR_ARB_20060911_142800_3683267_3685290
LBC_NAHAR_ARB_20070303_145800_3249800_3251128
LBC_NAHAR_ARB_20070303_145800_3623646_3624152
LBC_NAHAR_ARB_20070305_035800_481003_484069
ALAM_WITHEVENT_ARB_20070227_205800_3141876_3144152
ALAM_NEWSRPT_ARB_20070130_015801_2875054_2876396
ALJZ_TODHARV_ARB_20060914_155800_2947717_2949041
ALJZ_TODHARV_ARB_20070107_145800_2417848_2419238
72 changes: 72 additions & 0 deletions egs/gale_arabic/s5c/local/chain/compare_wer.sh
@@ -0,0 +1,72 @@
#!/bin/bash

# this script is used for comparing decoding results between systems.
# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}

# ./local/chain/compare_wer.sh exp/chain/cnn1a
# System cnn1a
# WER 0.61
# CER 0.15
# Final train prob -0.0377
# Final valid prob -0.0380
# Final train prob (xent) -0.0830
# Final valid prob (xent) -0.0838

if [ $# == 0 ]; then
echo "Usage: $0: <dir1> [<dir2> ... ]"
echo "e.g.: $0 exp/chain/cnn{1a,1b}"
exit 1
fi

echo "# $0 $*"
used_epochs=false

echo -n "# System "
for x in $*; do printf "% 10s" " $(basename $x)"; done
echo

echo -n "# WER "
for x in $*; do
wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}')
printf "% 10s" $wer
done
echo

echo -n "# CER "
for x in $*; do
cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}')
printf "% 10s" $cer
done
echo

if $used_epochs; then
exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi

echo -n "# Final train prob "
for x in $*; do
prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
printf "% 10s" $prob
done
echo

echo -n "# Final valid prob "
for x in $*; do
prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
printf "% 10s" $prob
done
echo

echo -n "# Final train prob (xent) "
for x in $*; do
prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
printf "% 10s" $prob
done
echo

echo -n "# Final valid prob (xent) "
for x in $*; do
prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
printf "% 10s" $prob
done
echo
82 changes: 82 additions & 0 deletions egs/gale_arabic/s5c/local/chain/run_chain_common.sh
@@ -0,0 +1,82 @@
#!/bin/bash

# this script has common stages shared across librispeech chain recipes.
# It generates a new topology in a new lang directory, gets the alignments as
# lattices, and builds a tree for the new topology
set -e

stage=11

# input directory names. These options are actually compulsory, and they have
# been named for convenience
gmm_dir=
ali_dir=
lores_train_data_dir=

num_leaves=6000

# output directory names. They are also compulsory.
lang=
lat_dir=
tree_dir=
# End configuration section.
echo "$0 $@" # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1;
[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1;
[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1;

for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done

if [ $stage -le 11 ]; then
echo "$0: creating lang directory with one state per phone."
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states.. the first one is only repeated
# once, the second one has zero or more repeats.]
if [ -d $lang ]; then
if [ $lang/L.fst -nt data/lang/L.fst ]; then
echo "$0: $lang already exists, not overwriting it; continuing"
else
echo "$0: $lang already exists and seems to be older than data/lang..."
echo " ... not sure what to do. Exiting."
exit 1;
fi
else
cp -r data/lang $lang
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
# Use our special topology... note that later on may have to tune this
# topology.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi
fi

if [ $stage -le 12 ]; then
# Get the alignments as lattices (gives the chain training more freedom).
# use the same num-jobs as the alignments
nj=$(cat ${ali_dir}/num_jobs) || exit 1;
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \
$lang $gmm_dir $lat_dir
rm $lat_dir/fsts.*.gz # save space
fi

if [ $stage -le 13 ]; then
# Build a tree using our new topology. We know we have alignments for the
# speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
# those.
if [ -f $tree_dir/final.mdl ]; then
echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
exit 1;
fi
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
--context-opts "--context-width=2 --central-position=1" \
--cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir
fi

exit 0;
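The three input directories and three output directories declared above are all required. For reference, local/chain/tuning/run_tdnn_1a.sh (later in this diff) invokes this script roughly as follows, with the default values of its own variables expanded; a sketch for orientation, not an additional file in the PR:

local/chain/run_chain_common.sh --stage 0 \
  --gmm-dir exp/tri3b \
  --ali-dir exp/tri3b_ali_train_sp \
  --lores-train-data-dir data/train_sp \
  --lang data/lang_chain \
  --lat-dir exp/chain/tri3b_train_sp_lats \
  --num-leaves 7000 \
  --tree-dir exp/chain/tree_a_sp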
1 change: 1 addition & 0 deletions egs/gale_arabic/s5c/local/chain/run_tdnn.sh
1 change: 1 addition & 0 deletions egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh
220 changes: 220 additions & 0 deletions egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh
@@ -0,0 +1,220 @@
#!/bin/bash

# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp
# System tdnn_1a_sp
# WER 16.47
# CER 6.68
# Final train prob -0.0652
# Final valid prob -0.0831
# Final train prob (xent) -0.8965
# Final valid prob (xent) -0.9964

# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/
# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083)


set -e -o pipefail
stage=0
nj=30
train_set=train
test_set=test
gmm=tri3b # this is the source gmm-dir that we'll use for alignments; it
# should have alignments for the specified training data.
num_threads_ubm=32
nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium.

# Options which are not passed through to run_ivector_common.sh
affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration.
common_egs_dir=
reporting_email=

# LSTM/chain options
train_stage=-10
xent_regularize=0.1
dropout_schedule='0,0@0.20,0.5@0.50,0'

# training chunk-options
chunk_width=150,110,100
get_egs_stage=-10

# training options
srand=0
remove_egs=true
run_ivector_common=true
run_chain_common=true
# End configuration section.
echo "$0 $@" # Print the command line for logging


. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh


if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

if $run_ivector_common; then
local/nnet3/run_ivector_common.sh \
--stage $stage --nj $nj \
--train-set $train_set --gmm $gmm \
--num-threads-ubm $num_threads_ubm \
--nnet3-affix "$nnet3_affix"
fi

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${train_set}_sp
lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
dir=exp/chain${nnet3_affix}/tdnn${affix}_sp
train_data_dir=data/${train_set}_sp_hires
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
lores_train_data_dir=data/${train_set}_sp

# note: you don't necessarily have to change the treedir name
# each time you do a new experiment-- only if you change the
# configuration in a way that affects the tree.
tree_dir=exp/chain${nnet3_affix}/tree_a_sp
# the 'lang' directory is created by this script.
# If you create such a directory with a non-standard topology
# you should probably name it differently.
lang=data/lang_chain

for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
$lores_train_data_dir/feats.scp $gmm_dir/final.mdl \
$ali_dir/ali.1.gz $gmm_dir/final.mdl; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done

# Please take this as a reference on how to specify all the options of
# local/chain/run_chain_common.sh
if $run_chain_common; then
local/chain/run_chain_common.sh --stage $stage \
--gmm-dir $gmm_dir \
--ali-dir $ali_dir \
--lores-train-data-dir ${lores_train_data_dir} \
--lang $lang \
--lat-dir $lat_dir \
--num-leaves 7000 \
--tree-dir $tree_dir || exit 1;
fi

if [ $stage -le 15 ]; then
mkdir -p $dir
echo "$0: creating neural net configs using the xconfig parser";

num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
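# e.g. with the default xent_regularize=0.1 set above, this gives a
# learning-rate factor of 0.5 / 0.1 = 5.0 for the xent output layer.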
affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true"
tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66"
linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
prefinal_opts="l2-regularize=0.01"
output_opts="l2-regularize=0.002"

mkdir -p $dir/configs

cat <<EOF > $dir/configs/network.xconfig
input dim=100 name=ivector
input dim=40 name=input
# please note that it is important to have input layer with the name=input
# as the layer immediately preceding the fixed-affine-layer to enable
# the use of short notation for the descriptor
fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
# the first splicing is moved before the lda layer, so no splicing here
relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536
tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0
tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
linear-component name=prefinal-l dim=256 $linear_opts
prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
EOF
steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
fi


if [ $stage -le 16 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi

steps/nnet3/chain/train.py --stage $train_stage \
--cmd "$decode_cmd" \
--feat.online-ivector-dir $train_ivector_dir \
--feat.cmvn-opts "--norm-means=false --norm-vars=false" \
--chain.xent-regularize $xent_regularize \
--chain.leaky-hmm-coefficient 0.1 \
--chain.l2-regularize 0.0 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--trainer.dropout-schedule $dropout_schedule \
--trainer.srand=$srand \
--trainer.max-param-change=2.0 \
--trainer.num-epochs 6 \
--trainer.frames-per-iter 1500000 \
--trainer.optimization.num-jobs-initial 3 \
--trainer.optimization.num-jobs-final 16 \
--trainer.optimization.initial-effective-lrate 0.00025 \
--trainer.optimization.final-effective-lrate 0.000025 \
--trainer.num-chunk-per-minibatch=64,32 \
--trainer.add-option="--optimization.memory-compression-level=2" \
--egs.chunk-width=$chunk_width \
--egs.dir="$common_egs_dir" \
--egs.opts "--frames-overlap-per-eg 0 --constrained false" \
--egs.stage $get_egs_stage \
--reporting.email="$reporting_email" \
--cleanup.remove-egs=$remove_egs \
--feat-dir=$train_data_dir \
--tree-dir $tree_dir \
--lat-dir=$lat_dir \
--dir $dir || exit 1;

fi

if [ $stage -le 17 ]; then
# The reason we are using data/lang here, instead of $lang, is just to
# emphasize that it's not actually important to give mkgraph.sh the
# lang directory with the matched topology (since it gets the
# topology file from the model). So you could give it a different
# lang directory, one that contained a wordlist and LM of your choice,
# as long as phones.txt was compatible.

utils/lang/check_phones_compatible.sh \
data/lang_test/phones.txt $lang/phones.txt
utils/mkgraph.sh \
--self-loop-scale 1.0 data/lang_test \
$tree_dir $tree_dir/graph || exit 1;
fi

if [ $stage -le 18 ]; then
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
rm $dir/.error 2>/dev/null || true

steps/nnet3/decode.sh \
--acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context 0 --extra-right-context 0 \
--extra-left-context-initial 0 \
--extra-right-context-final 0 \
--frames-per-chunk $frames_per_chunk \
--nj $nj --cmd "$decode_cmd" --num-threads 4 \
--online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \
$tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1
fi
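Once stage 18 has finished, the WER quoted in the header of this file comes from the standard scoring output; assuming scoring produced the usual scoring_kaldi/ directory (as local/chain/compare_wer.sh above expects), it can be read back with, for example:

cat exp/chain/tdnn_1a_sp/decode_test/scoring_kaldi/best_wer
# or compare several systems at once:
local/chain/compare_wer.sh exp/chain/tdnn_1a_sp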
222 changes: 222 additions & 0 deletions egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -0,0 +1,222 @@
#!/bin/bash

# started from the tedlium recipe with a few edits


set -e -o pipefail

# First the options that are passed through to run_ivector_common.sh
# (some of which are also used in this script directly).
stage=17
nj=30
decode_nj=30
min_seg_len=1.55
chunk_left_context=40
chunk_right_context=0
label_delay=5
xent_regularize=0.1
train_set=train
gmm=tri2b # the gmm for the target data
num_threads_ubm=32
nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned
# decode options
extra_left_context=50
extra_right_context=0
frames_per_chunk=150

# The rest are configs specific to this script. Most of the parameters
# are just hardcoded at this level, in the commands below.
train_stage=-10
tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration.
tdnn_lstm_affix=1a #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration.
common_egs_dir= # you can set this to use previously dumped egs.

# End configuration section.
echo "$0 $@" # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh


if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

local/nnet3/run_ivector_common.sh --stage $stage \
--nj $nj \
--min-seg-len $min_seg_len \
--train-set $train_set \
--gmm $gmm \
--num-threads-ubm $num_threads_ubm \
--nnet3-affix "$nnet3_affix"

gmm_dir=exp/$gmm
ali_dir=exp/${gmm}_ali_${train_set}_sp_comb
tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix}
lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats
dir=exp/chain${nnet3_affix}/tdnn_lstm${tdnn_lstm_affix}_sp_bi
train_data_dir=data/${train_set}_sp_hires_comb
lores_train_data_dir=data/${train_set}_sp_comb
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb


for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
$lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done

if [ $stage -le 14 ]; then
echo "$0: creating lang directory with one state per phone."
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states.. the first one is only repeated
# once, the second one has zero or more repeats.]
if [ -d data/lang_chain ]; then
if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then
echo "$0: data/lang_chain already exists, not overwriting it; continuing"
else
echo "$0: data/lang_chain already exists and seems to be older than data/lang..."
echo " ... not sure what to do. Exiting."
exit 1;
fi
else
cp -r data/lang data/lang_chain
silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1;
# Use our special topology... note that later on may have to tune this
# topology.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo
fi
fi

if [ $stage -le 15 ]; then
# Get the alignments as lattices (gives the chain training more freedom).
# use the same num-jobs as the alignments
steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \
data/lang $gmm_dir $lat_dir
rm $lat_dir/fsts.*.gz # save space
fi

if [ $stage -le 16 ]; then
# Build a tree using our new topology. We know we have alignments for the
# speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
# those.
if [ -f $tree_dir/final.mdl ]; then
echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
exit 1;
fi
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
--context-opts "--context-width=2 --central-position=1" \
--cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir
fi


if [ $stage -le 17 ]; then
mkdir -p $dir
echo "$0: creating neural net configs using the xconfig parser";

num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)

mkdir -p $dir/configs
cat <<EOF > $dir/configs/network.xconfig
input dim=100 name=ivector
input dim=40 name=input
# please note that it is important to have input layer with the name=input
# as the layer immediately preceding the fixed-affine-layer to enable
# the use of short notation for the descriptor
fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
# the first splicing is moved before the lda layer, so no splicing here
relu-renorm-layer name=tdnn1 dim=512
relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1)
fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3
relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3)
relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3)
fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3
relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3)
relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3)
fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3
## adding the layers for chain branch
output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5
# adding the layers for xent branch
# This block prints the configs for a separate output that will be
# trained with a cross-entropy objective in the 'chain' models... this
# has the effect of regularizing the hidden parts of the model. we use
# 0.5 / args.xent_regularize as the learning rate factor- the factor of
# 0.5 / args.xent_regularize is suitable as it means the xent
# final-layer learns at a rate independent of the regularization
# constant; and the 0.5 was tuned so as to make the relative progress
# similar in the xent and regular final layers.
output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
EOF
steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
fi


if [ $stage -le 18 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi

steps/nnet3/chain/train.py --stage $train_stage \
--cmd "$decode_cmd" \
--feat.online-ivector-dir $train_ivector_dir \
--feat.cmvn-opts "--norm-means=false --norm-vars=false" \
--chain.xent-regularize $xent_regularize \
--chain.leaky-hmm-coefficient 0.1 \
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--egs.dir "$common_egs_dir" \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width "$frames_per_chunk" \
--egs.chunk-left-context "$chunk_left_context" \
--egs.chunk-right-context "$chunk_right_context" \
--trainer.num-chunk-per-minibatch 128 \
--trainer.frames-per-iter 1500000 \
--trainer.max-param-change 2.0 \
--trainer.num-epochs 4 \
--trainer.deriv-truncate-margin 10 \
--trainer.optimization.shrink-value 0.99 \
--trainer.optimization.num-jobs-initial 2 \
--trainer.optimization.num-jobs-final 3 \
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.momentum 0.0 \
--cleanup.remove-egs true \
--feat-dir $train_data_dir \
--tree-dir $tree_dir \
--lat-dir $lat_dir \
--dir $dir
fi



if [ $stage -le 19 ]; then
# Note: it might appear that this data/lang_chain directory is mismatched, and it is as
# far as the 'topo' is concerned, but this script doesn't read the 'topo' from
# the lang directory.
utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph
fi

if [ $stage -le 20 ]; then
steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \
--acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context $extra_left_context \
--extra-right-context $extra_right_context \
--frames-per-chunk "$frames_per_chunk" \
--online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \
--scoring-opts "--min-lmwt 5 " \
$dir/graph data/test_hires $dir/decode || exit 1;
fi
exit 0
182 changes: 182 additions & 0 deletions egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,182 @@
#!/bin/bash

set -e -o pipefail

# This script is called from scripts like local/nnet3/run_tdnn.sh and
# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It
# contains the common feature preparation and iVector-related parts of the
# script. See those scripts for examples of usage.


stage=0
nj=100
train_set=train # you might set this to e.g. train.
test_sets="test"
gmm=tri3b # This specifies a GMM-dir from the features of the type you're training the system on;
# it should contain alignments for 'train_set'.

num_threads_ubm=32
nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh


gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${train_set}_sp

for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
if [ ! -f $f ]; then
echo "$0: expected file $f to exist"
exit 1
fi
done



if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then
echo "$0: data/${train_set}_sp_hires/feats.scp already exists."
echo " ... Please either remove it, or rerun this script with stage > 2."
exit 1
fi


if [ $stage -le 1 ]; then
echo "$0: preparing directory for speed-perturbed data"
utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp
fi

if [ $stage -le 2 ]; then
echo "$0: creating high-resolution MFCC features"

# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
mfccdir=data/${train_set}_sp_hires/data
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
fi

for datadir in ${train_set}_sp ${test_sets}; do
utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
done

# do volume-perturbation on the training data prior to extracting hires
# features; this helps make trained nnets more invariant to test data volume.
utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires

for datadir in ${train_set}_sp ${test_sets}; do
steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${datadir}_hires
steps/compute_cmvn_stats.sh data/${datadir}_hires
utils/fix_data_dir.sh data/${datadir}_hires
done
fi

if [ $stage -le 3 ]; then
echo "$0: computing a subset of data to train the diagonal UBM."
mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm

# train a diagonal UBM using a subset of about a quarter of the data
num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
num_utts=$[$num_utts_total/4]
utils/data/subset_data_dir.sh data/${train_set}_sp_hires \
$num_utts ${temp_data_root}/${train_set}_sp_hires_subset

echo "$0: computing a PCA transform from the hires data."
steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
--max-utts 10000 --subsample 2 \
${temp_data_root}/${train_set}_sp_hires_subset \
exp/nnet3${nnet3_affix}/pca_transform

echo "$0: training the diagonal UBM."
# Use 512 Gaussians in the UBM.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
--num-frames 700000 \
--num-threads $num_threads_ubm \
${temp_data_root}/${train_set}_sp_hires_subset 512 \
exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
fi

if [ $stage -le 4 ]; then
# Train the iVector extractor. Use all of the speed-perturbed data since iVector extractors
# can be sensitive to the amount of data. The script defaults to an iVector dimension of
# 100.
echo "$0: training the iVector extractor"
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/${train_set}_sp_hires exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1;
fi

if [ $stage -le 5 ]; then
# note, we don't encode the 'max2' in the name of the ivectordir even though
# that's the data we extract the ivectors from, as it's still going to be
# valid for the non-'max2' data; the utterance list is the same.
ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/ivectors/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
fi
# We extract iVectors on the speed-perturbed training data. With
# --utts-per-spk-max 2, the script pairs the utterances into twos, and treats
# each of these pairs as one speaker; this gives more diversity in iVectors.
# Note that these are extracted 'online' (they vary within the utterance).

# Having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (the iVector starts at zero at the beginning
# of each pseudo-speaker).
temp_data_root=${ivectordir}
utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
data/${train_set}_sp_hires ${temp_data_root}/${train_set}_sp_hires_max2

steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
${temp_data_root}/${train_set}_sp_hires_max2 \
exp/nnet3${nnet3_affix}/extractor $ivectordir

# Also extract iVectors for the test data, but in this case we don't need the speed
# perturbation (sp).
for data in ${test_sets}; do
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
data/${data}_hires exp/nnet3${nnet3_affix}/extractor \
exp/nnet3${nnet3_affix}/ivectors_${data}_hires
done
fi

if [ -f data/${train_set}_sp/feats.scp ] && [ $stage -le 7 ]; then
echo "$0: data/${train_set}_sp/feats.scp already exists. Refusing to overwrite the features "
echo " to avoid wasting time. Please remove the file and continue if you really mean this."
exit 1;
fi


if [ $stage -le 6 ]; then
echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
utils/data/perturb_data_dir_speed_3way.sh \
data/${train_set} data/${train_set}_sp
fi

if [ $stage -le 7 ]; then
echo "$0: making MFCC features for low-resolution speed-perturbed data (needed for alignments)"
steps/make_mfcc.sh --nj $nj \
--cmd "$train_cmd" data/${train_set}_sp
steps/compute_cmvn_stats.sh data/${train_set}_sp
echo "$0: fixing input data-dir to remove nonexistent features, in case some "
echo ".. speed-perturbed segments were too short."
utils/fix_data_dir.sh data/${train_set}_sp
fi

if [ $stage -le 8 ]; then
if [ -f $ali_dir/ali.1.gz ]; then
echo "$0: alignments in $ali_dir appear to already exist. Please either remove them "
echo " ... or use a later --stage option."
exit 1
fi
echo "$0: aligning with the perturbed low-resolution data"
steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
data/${train_set}_sp data/lang $gmm_dir $ali_dir
fi


exit 0;
1 change: 1 addition & 0 deletions egs/gale_arabic/s5c/local/nnet3/run_lstm.sh
1 change: 1 addition & 0 deletions egs/gale_arabic/s5c/local/nnet3/run_tdnn.sh
161 changes: 161 additions & 0 deletions egs/gale_arabic/s5c/local/nnet3/tuning/run_lstm_1a.sh
@@ -0,0 +1,161 @@
#!/bin/bash

# started from the tedlium recipe with a few edits


set -e -o pipefail -u

# First the options that are passed through to run_ivector_common.sh
# (some of which are also used in this script directly).
stage=0
nj=30
decode_nj=30
min_seg_len=1.55
train_set=train
gmm=tri2b # this is the source gmm-dir for the data-type of interest; it
# should have alignments for the specified training data.
num_threads_ubm=32
nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned

# Options which are not passed through to run_ivector_common.sh
affix=
common_egs_dir=
reporting_email=

# LSTM options
train_stage=-10
splice_indexes="-2,-1,0,1,2 0 0"
lstm_delay=" -1 -2 -3 "
label_delay=5
num_lstm_layers=3
cell_dim=1024
hidden_dim=1024
recurrent_projection_dim=256
non_recurrent_projection_dim=256
chunk_width=20
chunk_left_context=40
chunk_right_context=0
max_param_change=2.0

# training options
srand=0
num_epochs=6
initial_effective_lrate=0.0003
final_effective_lrate=0.00003
num_jobs_initial=2
num_jobs_final=3
momentum=0.5
num_chunk_per_minibatch=100
samples_per_iter=20000
remove_egs=true

#decode options
extra_left_context=
extra_right_context=
frames_per_chunk=

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

local/nnet3/run_ivector_common.sh --stage $stage \
--nj $nj \
--min-seg-len $min_seg_len \
--train-set $train_set \
--gmm $gmm \
--num-threads-ubm $num_threads_ubm \
--nnet3-affix "$nnet3_affix"



gmm_dir=exp/${gmm}
graph_dir=$gmm_dir/graph
ali_dir=exp/${gmm}_ali_${train_set}_sp_comb
dir=exp/nnet3${nnet3_affix}/lstm${affix:+_$affix}
if [ $label_delay -gt 0 ]; then dir=${dir}_ld$label_delay; fi
dir=${dir}_sp
train_data_dir=data/${train_set}_sp_hires_comb
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb


for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
$graph_dir/HCLG.fst $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done

if [ $stage -le 12 ]; then
echo "$0: creating neural net configs"
config_extra_opts=()
[ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay")
steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \
--feat-dir $train_data_dir \
--ivector-dir $train_ivector_dir \
--ali-dir $ali_dir \
--num-lstm-layers $num_lstm_layers \
--splice-indexes "$splice_indexes " \
--cell-dim $cell_dim \
--hidden-dim $hidden_dim \
--recurrent-projection-dim $recurrent_projection_dim \
--non-recurrent-projection-dim $non_recurrent_projection_dim \
--label-delay $label_delay \
--self-repair-scale-nonlinearity 0.00001 \
$dir/configs || exit 1;
fi

if [ $stage -le 13 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage
fi

steps/nnet3/train_rnn.py --stage=$train_stage \
--cmd="$decode_cmd" \
--feat.online-ivector-dir=$train_ivector_dir \
--feat.cmvn-opts="--norm-means=false --norm-vars=false" \
--trainer.srand=$srand \
--trainer.num-epochs=$num_epochs \
--trainer.samples-per-iter=$samples_per_iter \
--trainer.optimization.num-jobs-initial=$num_jobs_initial \
--trainer.optimization.num-jobs-final=$num_jobs_final \
--trainer.optimization.initial-effective-lrate=$initial_effective_lrate \
--trainer.optimization.final-effective-lrate=$final_effective_lrate \
--trainer.optimization.shrink-value 0.99 \
--trainer.rnn.num-chunk-per-minibatch=$num_chunk_per_minibatch \
--trainer.optimization.momentum=$momentum \
--egs.chunk-width=$chunk_width \
--egs.chunk-left-context=$chunk_left_context \
--egs.chunk-right-context=$chunk_right_context \
--egs.dir="$common_egs_dir" \
--cleanup.remove-egs=$remove_egs \
--cleanup.preserve-model-interval=1 \
--use-gpu=true \
--feat-dir=$train_data_dir \
--ali-dir=$ali_dir \
--lang=data/lang \
--reporting.email="$reporting_email" \
--dir=$dir || exit 1;
fi

if [ $stage -le 14 ]; then
[ -z $extra_left_context ] && extra_left_context=$chunk_left_context;
[ -z $extra_right_context ] && extra_right_context=$chunk_right_context;
[ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width;
rm $dir/.error 2>/dev/null || true
steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
--extra-left-context $extra_left_context \
--extra-right-context $extra_right_context \
--online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \
${graph_dir} data/test_hires ${dir}/decode || exit 1
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
data/test_hires ${dir}/decode_test ${dir}/decode_test_rescore || exit 1
fi

exit 0;
88 changes: 88 additions & 0 deletions egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh
@@ -0,0 +1,88 @@
#!/bin/bash

# started from the tedlium recipe with a few edits

set -e -o pipefail -u

# First the options that are passed through to run_ivector_common.sh
# (some of which are also used in this script directly).
stage=0
nj=30
decode_nj=30
min_seg_len=1.55
train_set=train
gmm=tri2b # this is the source gmm-dir for the data-type of interest; it
# should have alignments for the specified training data.
num_threads_ubm=32
nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned
tdnn_affix= #affix for TDNN directory e.g. "a" or "b", in case we change the configuration.

# Options which are not passed through to run_ivector_common.sh
train_stage=-10
splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 -3,3 0 0"
remove_egs=true
relu_dim=850
num_epochs=3

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

local/nnet3/run_ivector_common.sh --stage $stage \
--nj $nj \
--min-seg-len $min_seg_len \
--train-set $train_set \
--gmm $gmm \
--num-threads-ubm $num_threads_ubm \
--nnet3-affix "$nnet3_affix"



gmm_dir=exp/${gmm}
graph_dir=$gmm_dir/graph
ali_dir=exp/${gmm}_ali_${train_set}_sp_comb
dir=exp/nnet3${nnet3_affix}/tdnn${tdnn_affix}_sp
train_data_dir=data/${train_set}_sp_hires_comb
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb


for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
$graph_dir/HCLG.fst $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done


if [ $stage -le 12 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage
fi

steps/nnet3/tdnn/train.sh --stage $train_stage \
--num-epochs $num_epochs --num-jobs-initial 2 --num-jobs-final 2 \
--splice-indexes "$splice_indexes" \
--feat-type raw \
--online-ivector-dir ${train_ivector_dir} \
--cmvn-opts "--norm-means=false --norm-vars=false" \
--initial-effective-lrate 0.0015 --final-effective-lrate 0.00015 \
--cmd "$decode_cmd" \
--relu-dim "$relu_dim" \
--remove-egs "$remove_egs" \
$train_data_dir data/lang $ali_dir $dir
fi

if [ $stage -le 13 ]; then
steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
--online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \
${graph_dir} data/test_hires ${dir}/decode || exit 1
fi

exit 0;
111 changes: 111 additions & 0 deletions egs/gale_arabic/s5c/local/normalize_transcript_BW.pl
@@ -0,0 +1,111 @@
#!/usr/bin/env perl

# Copyright 2014 QCRI (author: Ahmed Ali)
# Apache 2.0

use warnings;
use strict;
use Encode;
use utf8;



if (@ARGV !=2 )
{#
print "usage: $0 <inFile> <onlyArabicFile>\n";
exit (1);
}

# <\check usage>
my $inFile = shift (@ARGV);
my $ouFile = shift(@ARGV);


open INFILE, "<$inFile" or die "unable to open the input file $inFile\n";
binmode INFILE, ":encoding(utf8)";


open OUTPUTFILE, ">$ouFile" or die "unable to open the output mlf file $ouFile\n";
binmode OUTPUTFILE, ":encoding(utf8)";


while (<INFILE>) {
s/[^اأإآبتثجحخدذرزسشصضطظعغفقكلمنهويىئءؤة0-9]+/ /g; ## Removes non Arabic or numbers
my $BW = convertUTF8ToBuckwalter ($_);
print OUTPUTFILE "$BW"."\n";
}
close INFILE;
close OUTPUTFILE;



# this function is copied from MADATools.pm: MADA Tools
sub convertUTF8ToBuckwalter {

my ($line)= (@_);
#$line = $UTF8_ENCODING_OBJ->decode($line); ## Same as Encode::decode("utf8",$line), but faster since object already created
$line =~ s/\x{0621}/\'/g; ## HAMZA
$line =~ s/\x{0622}/\|/g; ## ALEF WITH MADDA ABOVE
$line =~ s/\x{0623}/\>/g; ## ALEF WITH HAMZA ABOVE
$line =~ s/\x{0624}/\&/g; ## WAW WITH HAMZA ABOVE
$line =~ s/\x{0625}/\</g; ## ALEF WITH HAMZA BELOW
$line =~ s/\x{0626}/\}/g; ## YEH WITH HAMZA ABOVE
$line =~ s/\x{0627}/A/g; ## ALEF
$line =~ s/\x{0628}/b/g; ## BEH
$line =~ s/\x{0629}/p/g; ## TEH MARBUTA
$line =~ s/\x{062A}/t/g; ## TEH
$line =~ s/\x{062B}/v/g; ## THEH
$line =~ s/\x{062C}/j/g; ## JEEM
$line =~ s/\x{062D}/H/g; ## HAH
$line =~ s/\x{062E}/x/g; ## KHAH
$line =~ s/\x{062F}/d/g; ## DAL
$line =~ s/\x{0630}/\*/g; ## THAL
$line =~ s/\x{0631}/r/g; ## REH
$line =~ s/\x{0632}/z/g; ## ZAIN
$line =~ s/\x{0633}/s/g; ## SEEN
$line =~ s/\x{0634}/\$/g; ## SHEEN
$line =~ s/\x{0635}/S/g; ## SAD
$line =~ s/\x{0636}/D/g; ## DAD
$line =~ s/\x{0637}/T/g; ## TAH
$line =~ s/\x{0638}/Z/g; ## ZAH
$line =~ s/\x{0639}/E/g; ## AIN
$line =~ s/\x{063A}/g/g; ## GHAIN
$line =~ s/\x{0640}/_/g; ## TATWEEL
$line =~ s/\x{0641}/f/g; ## FEH
$line =~ s/\x{0642}/q/g; ## QAF
$line =~ s/\x{0643}/k/g; ## KAF
$line =~ s/\x{0644}/l/g; ## LAM
$line =~ s/\x{0645}/m/g; ## MEEM
$line =~ s/\x{0646}/n/g; ## NOON
$line =~ s/\x{0647}/h/g; ## HEH
$line =~ s/\x{0648}/w/g; ## WAW
$line =~ s/\x{0649}/Y/g; ## ALEF MAKSURA
$line =~ s/\x{064A}/y/g; ## YEH

## Diacritics
$line =~ s/\x{064B}/F/g; ## FATHATAN
$line =~ s/\x{064C}/N/g; ## DAMMATAN
$line =~ s/\x{064D}/K/g; ## KASRATAN
$line =~ s/\x{064E}/a/g; ## FATHA
$line =~ s/\x{064F}/u/g; ## DAMMA
$line =~ s/\x{0650}/i/g; ## KASRA
$line =~ s/\x{0651}/\~/g; ## SHADDA
$line =~ s/\x{0652}/o/g; ## SUKUN
$line =~ s/\x{0670}/\`/g; ## SUPERSCRIPT ALEF

$line =~ s/\x{0671}/\{/g; ## ALEF WASLA
$line =~ s/\x{067E}/P/g; ## PEH
$line =~ s/\x{0686}/J/g; ## TCHEH
$line =~ s/\x{06A4}/V/g; ## VEH
$line =~ s/\x{06AF}/G/g; ## GAF


## Punctuation should really be handled by the utf8 cleaner or other method
# $line =~ s/\xa2/\,/g; # comma
# $line =~ s//\,/g; # comma
# $line =~ s//\,/g;
# $line =~ s//\;/g; # semicolon
# $line =~ s//\?/g; # questionmark

return $line;
}
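For reference, the same character-by-character substitution can be sketched in a few lines of Python; this only illustrates a small subset of the Buckwalter table above (the sample word is arbitrary) and is not a replacement for this Perl script:

#!/usr/bin/env python3
# Illustrative sketch of the UTF-8 -> Buckwalter mapping used above.
# Only four entries of the table are reproduced here.
BUCKWALTER = {
    "\u0633": "s",  # SEEN
    "\u0644": "l",  # LAM
    "\u0627": "A",  # ALEF
    "\u0645": "m",  # MEEM
}

def to_buckwalter(text):
    # Characters outside the table pass through unchanged, mirroring the
    # substitution-based Perl code above.
    return "".join(BUCKWALTER.get(ch, ch) for ch in text)

print(to_buckwalter("سلام"))  # -> slAm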
104 changes: 104 additions & 0 deletions egs/gale_arabic/s5c/local/prepare_data.sh
@@ -0,0 +1,104 @@
#!/bin/bash

# Copyright 2014 QCRI (author: Ahmed Ali)
# Apache 2.0

dir1=/export/corpora/LDC/LDC2013S02/
dir2=/export/corpora/LDC/LDC2013S07/
dir3=/export/corpora/LDC/LDC2014S07/
text1=/export/corpora/LDC/LDC2013T17/
text2=/export/corpora/LDC/LDC2013T04/
text3=/export/corpora/LDC/LDC2014T17/
gale_data=GALE

mkdir -p $gale_data
# check that sox is installed
which sox &>/dev/null
if [[ $? != 0 ]]; then
echo "$0: sox is not installed"; exit 1
fi

for dvd in $dir1 $dir2 $dir3; do
dvd_full_path=$(utils/make_absolute.sh $dvd)
if [[ ! -e $dvd_full_path ]]; then
echo "$0: missing $dvd_full_path"; exit 1;
fi
find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do
id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}')
echo "$id sox $file -r 16000 -t wav - |"
done
done | sort -u > $gale_data/wav.scp
echo "$0:data prep audio succeded"

gale_data=$(utils/make_absolute.sh "GALE" );
top_pwd=`pwd`
txtdir=$gale_data/txt
mkdir -p $txtdir; cd $txtdir

for cdx in $text1 $text2 $text3; do
echo "$0:Preparing $cdx"
if [[ $cdx == *.tgz ]] ; then
tar -xvf $cdx
elif [ -d "$cdx" ]; then
ln -s $cdx `basename $cdx`
else
echo "$0:I don't really know what I shall do with $cdx " >&2
fi
done

find -L . -type f -name "*.tdf" | while read file; do
sed '1,3d' $file # delete the first 3 lines
done > all.tmp$$

perl -e '
($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0];
open(IN, "$inFile");
open(ID, ">$idFile");
open(TXT, ">$txtFile");
while (<IN>) {
@arr= split /\t/,$_;
$start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning
$end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//;
if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";}
$id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n";
next if ($rStart == $rEnd);
$id =~ s/.sph//g;
print ID $id;
print TXT "$arr[7]\n";
}' "all.tmp$$ allid.tmp$$ contentall.tmp$$"

perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$
paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$


awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all
awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/report
awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational

cd ..;
rm -fr $txtdir
cd $top_pwd
echo "$0:dat a prep text succeeded"

mkdir -p data
dir=$(utils/make_absolute.sh data/)
grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test
grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train

for x in test train; do
outdir=data/$x
file=$gale_data/all.$x
mkdir -p $outdir
awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk
cp -pr $outdir/utt2spk $outdir/spk2utt
awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments
awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text
done

grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp

cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline<seg) >0) {seen[$2]=1;}}
{if (seen[$1]) { print $0}}' > $dir/train/wav.scp

echo "$0:data prep split succeeded"
exit 0
48 changes: 48 additions & 0 deletions egs/gale_arabic/s5c/local/prepare_dict.sh
@@ -0,0 +1,48 @@
#!/usr/bin/env bash

# Copyright 2017 QCRI (author: Ahmed Ali)
# Apache 2.0
# This script prepares the dictionary.

set -e
dir=data/local/dict
lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2";
lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2";
stage=0
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;
mkdir -p $dir data/local/lexicon_data

if [ $stage -le 0 ]; then
echo "$0: Downloading text for lexicon... $(date)."
wget -P data/local/lexicon_data $lexicon_url1
wget -P data/local/lexicon_data $lexicon_url2
bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon
bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon
cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon
fi


if [ $stage -le 0 ]; then
echo "$0: processing lexicon text and creating lexicon... $(date)."
# remove vowels and rare alef wasla
grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon
local/prepare_lexicon.py
fi

cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1;

sed -i '1i<UNK> UNK' $dir/lexicon.txt

echo UNK >> $dir/nonsilence_phones.txt

echo '<sil> SIL' >> $dir/lexicon.txt

echo SIL > $dir/silence_phones.txt

echo SIL >$dir/optional_silence.txt

echo -n "" >$dir/extra_questions.txt

echo "$0: Dictionary preparation succeeded"
64 changes: 64 additions & 0 deletions egs/gale_arabic/s5c/local/prepare_dict_subword.sh
@@ -0,0 +1,64 @@
#!/usr/bin/env bash

# Copyright 2017 QCRI (author: Ahmed Ali)
# 2019 Dongji Gao
# Apache 2.0
# This script prepares the subword dictionary.

set -e
dir=data/local/dict
lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2";
lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2";
num_merges=1000
stage=0
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;
mkdir -p $dir data/local/lexicon_data

if [ $stage -le 0 ]; then
echo "$0: Downloading text for lexicon... $(date)."
wget -P data/local/lexicon_data $lexicon_url1
wget -P data/local/lexicon_data $lexicon_url2
bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon
bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon
cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon
fi


if [ $stage -le 0 ]; then
echo "$0: processing lexicon text and creating lexicon... $(date)."
# remove vowels and rare alef wasla
grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon
local/prepare_lexicon.py
fi

cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1;

echo UNK >> $dir/nonsilence_phones.txt

echo SIL > $dir/silence_phones.txt

echo SIL >$dir/optional_silence.txt

echo -n "" >$dir/extra_questions.txt

# Make a subword lexicon based on current word lexicon
glossaries="<UNK> <sil>"
if [ $stage -le 0 ]; then
echo "$0: making subword lexicon... $(date)."
# get pair_code file
cut -d ' ' -f2- data/train/text | sed 's/<[^>]*>//g' | utils/lang/bpe/learn_bpe.py -s $num_merges > data/local/pair_code.txt
mv $dir/lexicon.txt $dir/lexicon_word.txt
# get words
cut -d ' ' -f1 $dir/lexicon_word.txt > $dir/words.txt
utils/lang/bpe/apply_bpe.py -c data/local/pair_code.txt --glossaries $glossaries < $dir/words.txt | \
sed 's/ /\n/g' | sort -u > $dir/subwords.txt
sed 's/./& /g' $dir/subwords.txt | sed 's/@ @ //g' | sed 's/*/V/g' | paste -d ' ' $dir/subwords.txt - > $dir/lexicon.txt
fi

sed -i '1i<UNK> UNK' $dir/lexicon.txt

echo '<sil> SIL' >> $dir/lexicon.txt

echo "$0: Dictionary preparation succeeded"
26 changes: 26 additions & 0 deletions egs/gale_arabic/s5c/local/prepare_lexicon.py
@@ -0,0 +1,26 @@
#!/usr/bin/env python3

# Copyright 2018 Ashish Arora
# Apache 2.0

# This script prepares lexicon.

import argparse
import os

parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""")
args = parser.parse_args()

### main ###
lex = {}
text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon')
with open(text_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
characters = list(line)
characters = " ".join(['V' if char == '*' else char for char in characters])
lex[line] = characters

with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp:
for key in sorted(lex):
fp.write(key + " " + lex[key] + "\n")
51 changes: 51 additions & 0 deletions egs/gale_arabic/s5c/local/prepare_lm.sh
@@ -0,0 +1,51 @@
#!/bin/bash

# Copyright 2012 Vassil Panayotov
# 2017 Ewald Enzinger
# Apache 2.0

. ./path.sh || exit 1

echo "=== Building a language model ..."

dir=data/local/lm/
text=data/train/text
lexicon=data/local/dict/lexicon.txt
# Language model order
order=3

. utils/parse_options.sh

# Prepare a LM training corpus from the transcripts
mkdir -p $dir

for f in "$text" "$lexicon"; do
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

loc=`which ngram-count`;
if [ -z $loc ]; then
if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
else
sdir=$KALDI_ROOT/tools/srilm/bin/i686
fi
if [ -f $sdir/ngram-count ]; then
echo Using SRILM tools from $sdir
export PATH=$PATH:$sdir
else
echo You appear to not have SRILM tools installed, either on your path,
echo or installed in $sdir. See tools/install_srilm.sh for installation
echo instructions.
exit 1
fi
fi

cat data/train/text | cut -d " " -f 2- > $dir/text.txt
cut -d' ' -f1 $lexicon > $dir/wordlist

ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \
-unk -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.gz

#ngram -lm $dir/lm.gz -ppl $dir/dev.txt
echo "*** Finished building the LM model!"
53 changes: 53 additions & 0 deletions egs/gale_arabic/s5c/local/prepare_lm_subword.sh
@@ -0,0 +1,53 @@
#!/bin/bash

# Copyright 2012 Vassil Panayotov
# 2017 Ewald Enzinger
# 2019 Dongji Gao
# Apache 2.0

. ./path.sh || exit 1

echo "=== Building a language model ..."

dir=data/local/lm/
text=data/train/text
lexicon=data/local/dict/lexicon.txt
# Language model order
order=6

. utils/parse_options.sh

# Prepare a LM training corpus from the transcripts
mkdir -p $dir

for f in "$text" "$lexicon"; do
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

loc=`which ngram-count`;
if [ -z $loc ]; then
if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
else
sdir=$KALDI_ROOT/tools/srilm/bin/i686
fi
if [ -f $sdir/ngram-count ]; then
echo Using SRILM tools from $sdir
export PATH=$PATH:$sdir
else
echo You appear to not have SRILM tools installed, either on your path,
echo or installed in $sdir. See tools/install_srilm.sh for installation
echo instructions.
exit 1
fi
fi

cat data/train/text | cut -d " " -f 2- > $dir/text.txt
cat data/test/text | cut -d ' ' -f2- > $dir/dev.txt
cut -d' ' -f1 $lexicon > $dir/wordlist

ngram-count -text $dir/text.txt -order $order -vocab $dir/wordlist \
-unk -map-unk "<UNK>" -wbdiscount1 -kndiscount2 -kndiscount3 -kndiscount4 -kndiscount5 -kndiscount6 -interpolate -lm $dir/lm.gz

ngram -order $order -lm $dir/lm.gz -ppl $dir/dev.txt
echo "*** Finished building the LM model!"
6 changes: 6 additions & 0 deletions egs/gale_arabic/s5c/local/score.sh
@@ -0,0 +1,6 @@
#!/bin/bash


steps/scoring/score_kaldi_wer.sh "$@"
steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
72 changes: 72 additions & 0 deletions egs/gale_arabic/s5c/local/split_wer.sh
@@ -0,0 +1,72 @@
#!/bin/bash

# Report WER for reports and conversational
# Copyright 2014 QCRI (author: Ahmed Ali)
# Apache 2.0

if [ $# -ne 1 ]; then
echo "Arguments should be the gale folder, see ../run.sh for example."
exit 1;
fi

[ -f ./path.sh ] && . ./path.sh


galeFolder=$(utils/make_absolute.sh $1)
symtab=./data/lang/words.txt
find exp/ -maxdepth 3 -type d -name decode\* > list_decode$$

#split the test set per type:
awk '{print $2}' $galeFolder/all.test | sort -u > $galeFolder/test_id$$

# generate the report test set
awk '{print $2}' $galeFolder/report | sort -u > $galeFolder/report_id$$
comm -1 -2 $galeFolder/test_id$$ $galeFolder/report_id$$ > $galeFolder/report.test

# generate the conversational test set
awk '{print $2}' $galeFolder/conversational | sort -u > $galeFolder/conversational_id$$

comm -1 -2 $galeFolder/test_id$$ $galeFolder/conversational_id$$ > $galeFolder/conversational.test

rm -fr $galeFolder/test_id$$ $galeFolder/report_id$$ $galeFolder/conversational_id$$

min_lmwt=7
max_lmwt=20
cat list_decode$$ | while read dir; do
for type in report conversational; do
#echo "Processing: $dir $type"
rm -fr $dir/scoring_$type
cp -pr $dir/scoring $dir/scoring_$type
( cd $dir/scoring_$type;
for x in *.tra test_filt.txt; do
sort -u $x > tmp$$
join tmp$$ $galeFolder/${type}.test > $x
rm -fr tmp$$
done
)

utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \
cat $dir/scoring_${type}/LMWT.tra \| \
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
compute-wer --text --mode=present \
ark:$dir/scoring_${type}/test_filt.txt ark,p:- ">&" $dir/wer_${type}_LMWT
done
done


time=$(date +"%Y-%m-%d-%H-%M-%S")
echo "RESULTS generated by $USER at $time"

echo "Report Results WER:"
cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_report_* | utils/best_wer.sh; done | sort -n -k2

echo "Conversational Results WER:"
cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_conversational_* | utils/best_wer.sh; done | sort -n -k2

echo "Combined Results for Reports and Conversational WER:"
cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_?? $x/wer_?| utils/best_wer.sh; done | sort -n -k2

rm list_decode$$



11 changes: 11 additions & 0 deletions egs/gale_arabic/s5c/local/test_list
@@ -0,0 +1,11 @@
ALAM_WITHEVENT_ARB_20070116_205800
ALAM_WITHEVENT_ARB_20070130_205800
ALAM_WITHEVENT_ARB_20070206_205801
ALAM_WITHEVENT_ARB_20070213_205800
ALAM_WITHEVENT_ARB_20070227_205800
ALAM_WITHEVENT_ARB_20070306_205800
ALAM_WITHEVENT_ARB_20070313_205800
ARABIYA_FROMIRAQ_ARB_20070216_175800
ARABIYA_FROMIRAQ_ARB_20070223_175801
ARABIYA_FROMIRAQ_ARB_20070302_175801
ARABIYA_FROMIRAQ_ARB_20070309_175800
4 changes: 4 additions & 0 deletions egs/gale_arabic/s5c/local/wer_output_filter
@@ -0,0 +1,4 @@
#!/bin/sed -f
s/@@ //g
s/<sil>//g
s/<UNK>//g
5 changes: 5 additions & 0 deletions egs/gale_arabic/s5c/path.sh
@@ -0,0 +1,5 @@
export KALDI_ROOT=$(pwd)/../../..
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C
131 changes: 131 additions & 0 deletions egs/gale_arabic/s5c/run.sh
@@ -0,0 +1,131 @@
#!/bin/bash -e

# Copyright 2014 QCRI (author: Ahmed Ali)
# 2019 Dongji Gao
# Apache 2.0

# This is an example script for subword implementation

num_jobs=120
num_decode_jobs=40
decode_gmm=true
stage=0
overwrite=false
num_merges=1000

dir1=/export/corpora/LDC/LDC2013S02/
dir2=/export/corpora/LDC/LDC2013S07/
dir3=/export/corpora/LDC/LDC2014S07/
text1=/export/corpora/LDC/LDC2013T17/
text2=/export/corpora/LDC/LDC2013T04/
text3=/export/corpora/LDC/LDC2014T17/

galeData=GALE
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh
. ./utils/parse_options.sh # e.g. this parses the above options
# if supplied.

if [ $stage -le 0 ]; then

if [ -f data/train/text ] && ! $overwrite; then
echo "$0: Not processing, probably script have run from wrong stage"
echo "Exiting with status 1 to avoid data corruption"
exit 1;
fi

echo "$0: preparing data..."
local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \
--text1 $text1 --text2 $text2 --text3 $text3

echo "$0: Preparing lexicon and LM..."
local/prepare_dict_subword.sh --num_merges $num_merges

utils/subword/prepare_lang_subword.sh data/local/dict "<UNK>" data/local/lang data/lang

for set in train test; do
utils/subword/prepare_subword_text.sh data/$set/text data/local/pair_code.txt data/$set/text
done

local/prepare_lm_subword.sh

utils/format_lm.sh data/lang data/local/lm/lm.gz \
data/local/dict/lexicon.txt data/lang_test
fi

mfccdir=mfcc
if [ $stage -le 1 ]; then
echo "$0: Preparing the test and train feature files..."
for x in train test ; do
steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \
data/$x exp/make_mfcc/$x $mfccdir
utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
done
fi

if [ $stage -le 2 ]; then
echo "$0: creating sub-set and training monophone system"
utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1;

steps/train_mono.sh --nj 40 --cmd "$train_cmd" \
data/train.10K data/lang exp/mono_subword || exit 1;
fi

if [ $stage -le 3 ]; then
echo "$0: Aligning data using monophone system"
steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \
data/train data/lang exp/mono_subword exp/mono_ali_subword || exit 1;

echo "$0: training triphone system with delta features"
steps/train_deltas.sh --cmd "$train_cmd" \
2500 30000 data/train data/lang exp/mono_ali_subword exp/tri1_subword || exit 1;
fi

if [ $stage -le 4 ] && $decode_gmm; then
utils/mkgraph.sh data/lang_test exp/tri1_subword exp/tri1_subword/graph
steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \
exp/tri1_subword/graph data/test exp/tri1_subword/decode
fi

if [ $stage -le 5 ]; then
echo "$0: Aligning data and retraining and realigning with lda_mllt"
steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \
data/train data/lang exp/tri1_subword exp/tri1_ali_subword || exit 1;

steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \
data/train data/lang exp/tri1_ali_subword exp/tri2b_subword || exit 1;
fi

if [ $stage -le 6 ] && $decode_gmm; then
utils/mkgraph.sh data/lang_test exp/tri2b_subword exp/tri2b_subword/graph
steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \
exp/tri2b_subword/graph data/test exp/tri2b_subword/decode
fi

if [ $stage -le 7 ]; then
echo "$0: Aligning data and retraining and realigning with sat_basis"
steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \
data/train data/lang exp/tri2b_subword exp/tri2b_ali_subword || exit 1;

steps/train_sat_basis.sh --cmd "$train_cmd" \
5000 100000 data/train data/lang exp/tri2b_ali_subword exp/tri3b_subword || exit 1;

steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \
data/train data/lang exp/tri3b_subword exp/tri3b_ali_subword || exit 1;
fi

if [ $stage -le 8 ] && $decode_gmm; then
utils/mkgraph.sh data/lang_test exp/tri3b_subword exp/tri3b_subword/graph
steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \
"$decode_cmd" exp/tri3b_subword/graph data/test exp/tri3b_subword/decode
fi

if [ $stage -le 9 ]; then
echo "$0: Training a regular chain model using the e2e alignments..."
local/chain/run_tdnn.sh --gmm tri3b_subword
fi

echo "$0: training succeed"
exit 0
1 change: 1 addition & 0 deletions egs/gale_arabic/s5c/steps
1 change: 1 addition & 0 deletions egs/gale_arabic/s5c/utils
107 changes: 107 additions & 0 deletions egs/wsj/s5/utils/lang/make_position_dependent_subword_lexicon.py
@@ -0,0 +1,107 @@
#!/usr/bin/env python3

# 2019 Dongji Gao

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

from make_lexicon_fst import read_lexiconp
import argparse
import math

def get_args():
parser = argparse.ArgumentParser(description="""This script creates a
position-dependent subword lexicon from a position-independent subword lexicon
by adding suffixes ("_B", "_I", "_E", "_S") to the related phones.
It assumes that the input lexicon does not contain disambiguation symbols.""")
parser.add_argument("--separator", type=str, default="@@", help="""Separator
indicates the position of a subword in a word.
A subword ending with the separator can only appear at the beginning or in the middle of a word.
A subword without the separator can only appear at the end of a word, or is itself a whole word.
E.g. "international -> inter@@ nation@@ al";
"nation -> nation"
The separator should match the separator used in the input lexicon.""")
parser.add_argument("lexiconp", type=str, help="""Filename of subword position-independent
lexicon with pronunciation probabilities, with lines of the form 'subword prob p1 p2 ...'""")
args = parser.parse_args()
return args

def is_end(subword, separator):
"""Return true if the subword can appear at the end of a word (i.e., the subword
does not end with separator). Return false otherwise."""
return not subword.endswith(separator)

def write_position_dependent_lexicon(lexiconp, separator):
"""Print a position-dependent lexicon for each subword from the input lexiconp by adding
appropriate suffixes ("_B", "_I", "_E", "_S") to the phone sequence related to the subword.
There are 4 types of position-dependent subword:
1) Beginning subword. It can only appear at the beginning of a word.
The first phone suffix should be "_B" and other suffixes should be "_I"s:
nation@@ 1.0 n_B ey_I sh_I ih_I n_I
n@@ 1.0 n_B
2) Middle subword. It can only appear in the middle of a word.
All phone suffixes should be "_I"s:
nation@@ 1.0 n_I ey_I sh_I ih_I n_I
3) End subword. It can only appear at the end of a word.
The last phone suffix should be "_E" and other suffixes should be "_I"s:
nation 1.0 n_I ey_I sh_I ih_I n_E
n 1.0 n_E
4) Singleton subword (i.e., the subword is a word by itself).
The first phone suffix should be "_B" and the last suffix should be "_E".
All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S":
nation 1.0 n_B ey_I sh_I ih_I n_E
n 1.0 n_S
In most cases (i.e., when a subword has more than one phone), the phones in the middle get the suffix "_I".
So suffix_list is initialized with all "_I"s, and we only replace the first and/or last phone suffix
as needed for the different cases.
"""
for (word, prob, phones) in lexiconp:
phones_length = len(phones)

# suffix_list is initialized by all "_I"s.
suffix_list = ["_I" for i in range(phones_length)]

if is_end(word, separator):
# print end subword lexicon by replacing the last phone suffix by "_E"
suffix_list[-1] = "_E"
phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)]
print("{} {} {}".format(word, prob, ' '.join(phones_list)))

# print singleton subword lexicon
# the phone suffix is "_S" if there is only one phone.
if phones_length == 1:
suffix_list[0] = "_S"
phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)]
print("{} {} {}".format(word, prob, ' '.join(phones_list)))
# the first phone suffix is "_B" if there is more than one phone.
else:
suffix_list[0] = "_B"
phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)]
print("{} {} {}".format(word, prob, ' '.join(phones_list)))
else:
# print middle subword lexicon
phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)]
print("{} {} {}".format(word, prob, ' '.join(phones_list)))

# print beginning subword lexicon by replacing the first phone suffix by "_B"
suffix_list[0] = "_B"
phones_list = [phone + suffix for (phone, suffix) in zip(phones, suffix_list)]
print("{} {} {}".format(word, prob, ' '.join(phones_list)))

def main():
args = get_args()
lexiconp = read_lexiconp(args.lexiconp)
write_position_dependent_lexicon(lexiconp, args.separator)

if __name__ == "__main__":
main()
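To make the suffix rule concrete, here is a small self-contained sketch (not part of this script) that applies the same rule to one hypothetical position-independent entry and prints the variants that the docstring above describes:

def variants(subword, phones, separator="@@"):
    # Subwords ending with the separator get a middle and a beginning variant;
    # all other subwords get an end and a singleton variant.
    middle = [p + "_I" for p in phones]
    if subword.endswith(separator):
        begin = [phones[0] + "_B"] + middle[1:]
        return [middle, begin]
    end = middle[:-1] + [phones[-1] + "_E"]
    single = ([phones[0] + "_S"] if len(phones) == 1
              else [phones[0] + "_B"] + middle[1:-1] + [phones[-1] + "_E"])
    return [end, single]

for pron in variants("nation", ["n", "ey", "sh", "ih", "n"]):
    print("nation 1.0 " + " ".join(pron))
# -> nation 1.0 n_I ey_I sh_I ih_I n_E
# -> nation 1.0 n_B ey_I sh_I ih_I n_E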
301 changes: 301 additions & 0 deletions egs/wsj/s5/utils/lang/make_subword_lexicon_fst.py
@@ -0,0 +1,301 @@
#!/usr/bin/env python3

# 2019 Dongji Gao
# Apache 2.0.

from make_lexicon_fst import read_lexiconp
import argparse
import math
import sys

# see get_args() below for usage message
def get_args():
parser = argparse.ArgumentParser(description="""This script creates the
text form of a subword lexicon FST to be compiled by fstcompile using
the appropriate symbol tables (phones.txt and words.txt). It will mostly
be invoked indirectly via utils/prepare_lang_subword.sh. The output
goes to the stdout. This script is the subword version of make_lexicon_fst.py.
It only allows optional silence to appear after an end-subword or a singleton subword
(i.e., a subword without the separator). This version does not support
pronunciation probabilities (i.e., pron-prob is fixed at 1.0).""")

parser.add_argument('--sil-phone', type=str, help="""Text form of
optional-silence phone, e.g. 'SIL'. See also the --sil-prob option.""")
parser.add_argument('--sil-prob', type=float, default=0.0, help="""Probability
of silence between words (including the beginning and end of word sequence).
Must be in range [0.0, 1.0). This refers to the optional silence inserted by
the lexicon; see the --sil-phone option.""")
parser.add_argument('--sil-disambig', type=str, help="""Disambiguation symbol
to disambiguate silence, e.g. #5. Will only be supplied if you are creating
the version of L.fst with disambiguation symbols, intended for use with cyclic
G.fst. This symbol was introduced to fix a rather obscure source of nondeterminism
of CLG.fst, that has to do with reordering of disambiguation symbols and phone symbols.""")
parser.add_argument('--position-dependent', action="store_true", help="""Whether
the input lexicon is position-dependent.""")
parser.add_argument("--separator", type=str, default="@@", help="""Separator
indicates the position of a subword in a word.
A subword followed by the separator can only appear at the beginning or in the middle of a word.
A subword without the separator can only appear at the end of a word, or is itself a whole word.
E.g. "international -> inter@@ nation@@ al";
"nation -> nation"
The separator should match the separator used in the input lexicon.""")
parser.add_argument('lexiconp', type=str, help="""Filename of lexicon with
pronunciation probabilities (normally lexiconp.txt), with lines of the
form 'subword prob p1 p2...', e.g. 'a 1.0 ay'""")
args = parser.parse_args()
return args

def contain_disambig_symbol(phones):
"""Return true if the phone sequence contains disambiguation symbol.
Return false otherwise. Disambiguation symbol is at the end of phones
in the form of #1, #2... There is at most one disambiguation
symbol for each phone sequence"""
return True if phones[-1].startswith("#") else False

def print_arc(src, dest, phone, word, cost):
print('{}\t{}\t{}\t{}\t{}'.format(src, dest, phone, word, cost))

def is_end(word, separator):
"""Return true if the subword can appear at the end of a word (i.e., the subword
does not end with separator). Return false otherwise."""
return not word.endswith(separator)

def get_suffix(phone):
"""Return the suffix of a phone. The suffix is in the form of '_B', '_I'..."""
if len(phone) < 3:
print("{}: invalid phone {} (please check if the phone is position-dependent)".format(
sys.argv[0], phone), file=sys.stderr)
sys.exit(1)
return phone[-2:]

def write_fst_no_silence(lexicon, position_dependent, separator):
"""Writes the text format of L.fst to the standard output. This version is for
when --sil-prob=0.0, meaning there is no optional silence allowed.
loop_state here is the start and final state of the fst. It goes to word_start_state
via epsilon transition.
In the position-independent case, there is no difference between a beginning subword and
a middle subword, so all subwords with the separator leave from and enter word_start_state.
All subwords without the separator leave from word_start_state and enter loop_state.
This guarantees that a complete word always ends with an end-subword.
In position-dependent case, there are 4 types of position-dependent subword:
1) Beginning subword. The first phone suffix should be "_B" and other suffixes should be "_I"s:
nation@@ 1.0 n_B ey_I sh_I ih_I n_I
n@@ 1.0 n_B
2) Middle subword. All phone suffixes should be "_I"s:
nation@@ 1.0 n_I ey_I sh_I ih_I n_I
3) End subword. The last phone suffix should be "_E" and the other suffixes should be "_I"s:
nation 1.0 n_I ey_I sh_I ih_I n_E
n 1.0 n_E
4) Singleton subword (i.e., the subword is a word by itself).
The first phone suffix should be "_B" and the last suffix should be "_E".
All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S":
nation 1.0 n_B ey_I sh_I ih_I n_E
n 1.0 n_S
So we need an extra word_internal_state: a beginning subword leaves word_start_state and
enters word_internal_state, and a middle subword leaves from and enters word_internal_state.
The rest is the same.
'lexicon' is a list of 3-tuples (subword, pron-prob, prons) as returned by read_lexiconp().
'position_dependent' is True if the lexicon is position-dependent.
'separator' is the symbol which indicates the position of a subword in a word.
"""
# regular setting
loop_state = 0
word_start_state = 1
next_state = 2

print_arc(loop_state, word_start_state, "<eps>", "<eps>", 0.0)

# optional setting for word_internal_state
if position_dependent:
word_internal_state = next_state
next_state += 1

for (word, pron_prob, phones) in lexicon:
pron_cost = 0.0 # do not support pron_prob
phones_len = len(phones)

# set start and end state for different cases
if position_dependent:
first_phone_suffix = get_suffix(phones[0])
last_phone = phones[-2] if contain_disambig_symbol(phones) else phones[-1]
last_phone_suffix = get_suffix(last_phone)

# singleton word
if first_phone_suffix == "_S":
current_state = word_start_state
end_state = loop_state
# set the current_state
elif first_phone_suffix == "_B":
current_state = word_start_state
elif first_phone_suffix == "_I" or first_phone_suffix == "_E":
current_state = word_internal_state
# then set the end_state
if last_phone_suffix == "_B" or last_phone_suffix == "_I":
end_state = word_internal_state
elif last_phone_suffix == "_E":
end_state = loop_state
else:
current_state = word_start_state
end_state = loop_state if is_end(word, separator) else word_start_state

# print arcs (except the last one) for the subword
for i in range(phones_len - 1):
word = word if i == 0 else "<eps>"
cost = pron_cost if i == 0 else 0.0
print_arc(current_state, next_state, phones[i], word, cost)
current_state = next_state
next_state += 1

# print the last arc
i = phones_len - 1
phone = phones[i] if i >=0 else "<eps>"
word = word if i <= 0 else "<eps>"
cost = pron_cost if i <= 0 else 0.0
print_arc(current_state, end_state, phone, word, cost)

# set the final state
print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0))

def write_fst_with_silence(lexicon, sil_phone, sil_prob, sil_disambig, position_dependent, separator):
"""Writes the text format of L.fst to the standard output. This version is for
when --sil-prob != 0.0, meaning that optional silence is allowed.
loop_state here is the start and final state of the fst. It goes to word_start_state
via epsilon transition.
In the position-independent case, there is no difference between a beginning subword and
a middle subword, so all subwords with the separator leave from and enter word_start_state.
All subwords without the separator leave from word_start_state and enter loop_state or sil_state.
This guarantees that optional silence can only follow an end-subword, i.e. a subword that
ends a word.
In position-dependent case, there are 4 types of position-dependent subword:
1) Beginning subword. The first phone suffix should be "_B" and other suffixes should be "_I"s:
nation@@ 1.0 n_B ey_I sh_I ih_I n_I
n@@ 1.0 n_B
2) Middle subword. All phone suffixes should be "_I"s:
nation@@ 1.0 n_I ey_I sh_I ih_I n_I
3) End subword. The last phone suffix should be "_E" and the other suffixes should be "_I"s:
nation 1.0 n_I ey_I sh_I ih_I n_E
n 1.0 n_E
4) Singleton subword (i.e., the subword is a word by itself).
The first phone suffix should be "_B" and the last suffix should be "_E".
All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S":
nation 1.0 n_B ey_I sh_I ih_I n_E
n 1.0 n_S
So we need an extra word_internal_state: a beginning subword leaves word_start_state and
enters word_internal_state, and a middle subword leaves from and enters word_internal_state.
The rest is the same.
'lexicon' is a list of 3-tuples (subword, pron-prob, prons)
as returned by read_lexiconp().
'sil_prob', which is expected to be strictly between 0.0 and 1.0, is the
probability of silence
'sil_phone' is the silence phone, e.g. "SIL".
'sil_disambig' is either None, or the silence disambiguation symbol, e.g. "#5".
'position_dependent' is True if the lexicon is position-dependent.
'separator' is the symbol we use to indicate the position of a subword in a word.
"""

sil_cost = -math.log(sil_prob)
no_sil_cost = -math.log(1 - sil_prob)

# regular setting
start_state = 0
loop_state = 1 # also the final state
sil_state = 2 # words terminate here when followed by silence; this state
# has a silence transition to loop_state
word_start_state = 3 # subword leave from here
next_state = 4 # the next un-allocated state, will be incremented as we go

print_arc(start_state, loop_state, "<eps>", "<eps>", no_sil_cost)
print_arc(start_state, sil_state, "<eps>", "<eps>", sil_cost)
print_arc(loop_state, word_start_state, "<eps>", "<eps>", 0.0)

# optional setting for disambig_state
if sil_disambig is None:
print_arc(sil_state, loop_state, sil_phone, "<eps>", 0.0)
else:
disambig_state = next_state
next_state += 1
print_arc(sil_state, disambig_state, sil_phone, "<eps>", 0.0)
print_arc(disambig_state, loop_state, sil_disambig, "<eps>", 0.0)

# optional setting for word_internal_state
if position_dependent:
word_internal_state = next_state
next_state += 1

for (word, pron_prob, phones) in lexicon:
pron_cost = 0.0 # do not support pron_prob
phones_len = len(phones)

# set start and end state for different cases
if position_dependent:
first_phone_suffix = get_suffix(phones[0])
last_phone = phones[-2] if contain_disambig_symbol(phones) else phones[-1]
last_phone_suffix = get_suffix(last_phone)

# singleton subword
if first_phone_suffix == "_S":
current_state = word_start_state
end_state_list = [loop_state, sil_state]
end_cost_list = [no_sil_cost, sil_cost]
# first set the current_state
elif first_phone_suffix == "_B":
current_state = word_start_state
elif first_phone_suffix == "_I" or first_phone_suffix == "_E":
current_state = word_internal_state
# then set the end_state (end_state_list)
if last_phone_suffix == "_B" or last_phone_suffix == "_I":
end_state_list = [word_internal_state]
end_cost_list = [0.0]
elif last_phone_suffix == "_E":
end_state_list = [loop_state, sil_state]
end_cost_list = [no_sil_cost, sil_cost]
else:
current_state = word_start_state
if is_end(word, separator):
end_state_list = [loop_state, sil_state]
end_cost_list = [no_sil_cost, sil_cost]
else:
end_state_list = [word_start_state]
end_cost_list = [0.0]

# print arcs (except the last one) for the subword
for i in range(phones_len - 1):
word = word if i == 0 else "<eps>"
cost = pron_cost if i == 0 else 0.0
print_arc(current_state, next_state, phones[i], word, cost)
current_state = next_state
next_state += 1

# print the last arc
i = phones_len - 1
phone = phones[i] if i >= 0 else "<eps>"
word = word if i <= 0 else "<eps>"
cost = pron_cost if i <= 0 else 0.0
for (end_state, end_cost) in zip(end_state_list, end_cost_list):
print_arc(current_state, end_state, phone, word, cost + end_cost)

# set the final state
print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0))

def main():
args = get_args()
if args.sil_prob < 0.0 or args.sil_prob >= 1.0:
print("{}: invalid value specified --sil-prob={}".format(
sys.argv[0], args.sil_prob), file=sys.stderr)
sys.exit(1)
lexicon = read_lexiconp(args.lexiconp)
if args.sil_prob == 0.0:
write_fst_no_silence(lexicon, args.position_dependent, args.separator)
else:
write_fst_with_silence(lexicon, args.sil_phone, args.sil_prob,
args.sil_disambig, args.position_dependent, args.separator)

if __name__ == "__main__":
main()
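For readers new to the text FST format, the state layout of write_fst_no_silence() in the position-independent, no-optional-silence case can be illustrated with a stripped-down sketch over a made-up two-entry lexicon; this is only an illustration, not the script's interface:

# Toy lexicon: "ab@@" is a non-final subword, "c" is an end subword.
lexicon = [("ab@@", ["a", "b"]), ("c", ["c"])]
separator = "@@"
loop_state, word_start_state, next_state = 0, 1, 2

print("0\t1\t<eps>\t<eps>\t0.0")  # loop_state -> word_start_state
for word, phones in lexicon:
    cur = word_start_state
    # non-final subwords return to word_start_state, end subwords to loop_state
    end = word_start_state if word.endswith(separator) else loop_state
    for i, phone in enumerate(phones):
        dest = end if i == len(phones) - 1 else next_state
        out = word if i == 0 else "<eps>"
        print("{}\t{}\t{}\t{}\t0.0".format(cur, dest, phone, out))
        if dest == next_state:
            cur, next_state = next_state, next_state + 1
print("0\t0.0")  # loop_state is the final state
# Produces, for this toy lexicon:
#   0 1 <eps> <eps> 0.0
#   1 2 a ab@@ 0.0
#   2 1 b <eps> 0.0
#   1 0 c c 0.0
#   0 0.0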
423 changes: 423 additions & 0 deletions egs/wsj/s5/utils/subword/prepare_lang_subword.sh

Large diffs are not rendered by default.

48 changes: 48 additions & 0 deletions egs/wsj/s5/utils/subword/prepare_subword_text.sh
@@ -0,0 +1,48 @@
#!/bin/bash

# 2019 Dongji Gao

# This script generates subword text from word text.
# For example, <noise> international -> <noise> inter@@ nation@@ al
# @@ here is the separator indicating the position of a subword within a word.
# A subword directly followed by the separator can only appear at the beginning or in the middle of a word.
# "<noise>" here is kept intact if it is listed in the --glossaries option.

# Begin configuration section
separator="@@"
glossaries=
# End configuration section

. utils/parse_options.sh

echo "$0 $@"

if [ $# -ne 3 ]; then
echo "Usage: utils/prepare_subword_text.sh <word-text> <pair_code> <subword-text>"
echo "e.g.: utils/prepare_subword_text.sh data/train/text data/local/pair_code.txt data/train/text_subword"
echo " --seperator <separator> # default: @@"
echo " --glossaries <reserved-words> # glossaries are words reserved"
exit 1;
fi

word_text=$1
pair_code=$2
subword_text=$3

[ ! -f $word_text ] && echo "Word text $word_text does not exist." && exit 1;

grep -q $separator $word_text && echo "$0: Error, word text file contains separator $separator. This might be a subword text file or you need to choose a different separator" && exit 1;

glossaries_opt=
[ -n "$glossaries" ] && glossaries_opt="--glossaries $glossaries"
cut -d ' ' -f2- $word_text | \
utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaries_opt > ${word_text}.sub
if [ $word_text == $subword_text ]; then
mv $word_text ${word_text}.old
cut -d ' ' -f1 ${word_text}.old | paste -d ' ' - ${word_text}.sub > $subword_text
else
cut -d ' ' -f1 $word_text | paste -d ' ' - ${word_text}.sub > $subword_text
fi

rm ${word_text}.sub
echo "Subword text created."
144 changes: 113 additions & 31 deletions egs/wsj/s5/utils/validate_lang.pl
@@ -4,6 +4,7 @@
# Copyright 2012 Guoguo Chen
# 2014 Neil Nelson
# 2017 Johns Hopkins University (Jan "Yenda" Trmal <jtrmal@gmail.com>)
# 2019 Dongji Gao
#
# Validation script for data/lang

@@ -101,6 +102,7 @@ sub check_allowed_whitespace {
$skip_det_check = 0;
$skip_disambig_check = 0;
$skip_generate_words_check = 0;
$subword_check = 0;

for ($x=0; $x <= 3; $x++) {
if (@ARGV > 0 && $ARGV[0] eq "--skip-determinization-check") {
@@ -121,6 +123,7 @@ sub check_allowed_whitespace {
print "Usage: $0 [options] <lang_directory>\n";
print "e.g.: $0 data/lang\n";
print "Options:\n";
print " --skip-det-check (this flag causes it to skip a deterministic fst check).\n";
print " --skip-determinization-check (this flag causes it to skip a time consuming check).\n";
print " --skip-disambig-check (this flag causes it to skip a disambig check in phone bigram models).\n";
exit(1);
@@ -131,6 +134,40 @@ sub check_allowed_whitespace {
$lang = shift @ARGV;
$exit = 0;
$warning = 0;

# Checking existence of separator file ------------------
print "Checking existence of separator file\n";
if (!-e "$lang/subword_separator.txt") {
print "separator file $lang/subword_separator.txt is empty or does not exist, deal in word case.\n";
} else {
if (!open(S, "<$lang/subword_separator.txt")) {
print "--> ERROR: fail to open $lang/subword_separator.txt\n"; exit 1;
} else {
$line_num = `wc -l <$lang/subword_separator.txt`;
if ($line_num != 1) {
print "--> ERROR, $lang/subword_separator.txt should only contain one line.\n"; exit 1;
} else {
while (<S>) {
chomp;
my @col = split(" ", $_);
if (@col != 1) {
print "--> ERROR, invalid separator.\n"; exit 1;
} else {
$separator = shift @col;
$separator_length = length $separator;
$subword_check = 1;
}
}
}
}
}

if (!$subword_check) {
$word_boundary = "word_boundary";
} else {
$word_boundary = "word_boundary_moved";
}

# Checking phones.txt -------------------------------
print "Checking $lang/phones.txt ...\n";
if (-z "$lang/phones.txt") {
@@ -492,7 +529,7 @@ sub check_summation {
my $ok = 1;
foreach $p (keys %psymtab) {
if (! defined $sum{$p} && $p !~ m/^#nonterm/) {
$exit = 1; $ok = 0; print("--> ERROR: phone $p is not in silence.txt, nonsilence.txt or disambig.txt...");
$exit = 1; $ok = 0; print("--> ERROR: phone $p is not in silence.txt, nonsilence.txt or disambig.txt...\n");
}
}

@@ -530,8 +567,8 @@ sub check_summation {
$exit = 1;
}
}
if (-e "$lang/phones/word_boundary.txt") {
check_txt_int("$lang/phones/word_boundary", \%psymtab, 0); print "\n";
if (-e "$lang/phones/$word_boundary.txt") {
check_txt_int("$lang/phones/$word_boundary", \%psymtab, 0); print "\n";
}

# Checking optional_silence.txt -------------------------------
@@ -634,10 +671,10 @@ sub check_summation {
$end = "";
$internal = "";
$singleton = "";
if (-s "$lang/phones/word_boundary.txt") {
print "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n";
if (!open (W, "<$lang/phones/word_boundary.txt")) {
$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.txt\n";
if (-s "$lang/phones/$word_boundary.txt") {
print "Checking $word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n";
if (!open (W, "<$lang/phones/$word_boundary.txt")) {
$exit = 1; print "--> ERROR: fail to open $lang/phones/$word_boundary.txt\n";
}
$idx = 1;
%wb = ();
@@ -660,7 +697,7 @@ sub check_summation {
s/ singleton$//g; @col = split(" ", $_); if (@col == 1) {$singleton .= "$col[0] ";}
}
if (@col != 1) {
$exit = 1; print "--> ERROR: expect 1 column in $lang/phones/word_boundary.txt (line $idx)\n";
$exit = 1; print "--> ERROR: expect 1 column in $lang/phones/$word_boundary.txt (line $idx)\n";
}
$wb{shift @col} = 1;
$idx ++;
@@ -671,21 +708,21 @@ sub check_summation {
$success1 = 1;
if (@itset != 0) {
$success1 = 0;
$exit = 1; print "--> ERROR: $lang/phones/word_boundary.txt has disambiguation symbols -- ";
$exit = 1; print "--> ERROR: $lang/phones/$word_boundary.txt has disambiguation symbols -- ";
foreach (@itset) {
print "$_ ";
}
print "\n";
}
$success1 == 0 || print "--> $lang/phones/word_boundary.txt doesn't include disambiguation symbols\n";
$success1 == 0 || print "--> $lang/phones/$word_boundary.txt doesn't include disambiguation symbols\n";

%sum = (%silence, %nonsilence);
@itset = intersect(\%sum, \%wb);
%itset = (); foreach(@itset) {$itset{$_} = 1;}
$success2 = 1;
if (@itset < scalar(keys %sum)) {
$success2 = 0;
$exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in word_boundary.txt -- ";
$exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in $word_boundary.txt -- ";
foreach (keys %sum) {
if (!$itset{$_}) {
print "$_ ";
@@ -695,16 +732,16 @@ sub check_summation {
}
if (@itset < scalar(keys %wb)) {
$success2 = 0;
$exit = 1; print "--> ERROR: phones in word_boundary.txt but not in nonsilence.txt or silence.txt -- ";
$exit = 1; print "--> ERROR: phones in $word_boundary.txt but not in nonsilence.txt or silence.txt -- ";
foreach (keys %wb) {
if (!$itset{$_}) {
print "$_ ";
}
}
print "\n";
}
$success2 == 0 || print "--> $lang/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n";
$success1 != 1 or $success2 != 1 || print "--> $lang/phones/word_boundary.txt is OK\n";
$success2 == 0 || print "--> $lang/phones/$word_boundary.txt is the union of nonsilence.txt and silence.txt\n";
$success1 != 1 or $success2 != 1 || print "--> $lang/phones/$word_boundary.txt is OK\n";
print "\n";
}

@@ -750,11 +787,11 @@ sub check_summation {
close(P);
my $len = @wdisambig, $len2;
if (($len2 = @wdisambig_words) != $len) {
print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths";
print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths\n";
$exit = 1; return;
}
if (($len2 = @wdisambig_phones) != $len) {
print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths";
print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths\n";
$exit = 1; return;
}
for (my $i = 0; $i < $len; $i++) {
@@ -777,16 +814,23 @@ sub check_summation {
}
}


if (-s "$lang/phones/word_boundary.int") {
print "Checking word_boundary.int and disambig.int\n";
if (!open (W, "<$lang/phones/word_boundary.int")) {
$exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.int\n";
# Check validity of L.fst, L_disambig.fst, and word_boundary.int.
# First we generate a random word/subword sequence. We then compile it into fst and compose it with L.fst/L_disambig.fst.
# In the subword case the last subword of the sequence must be an end-subword
# (i.e. a subword that can only appear at the end of a word, or is a whole word itself)
# to guarantee that the composition does not fail.
# We then get the corresponding phone sequence and apply a transition matrix on it to get the number of valid boundaries.
# In the word case, the number of valid boundaries should equal the number of words.
# In the subword case, the number of valid boundaries should equal the number of end-subwords.
if (-s "$lang/phones/$word_boundary.int") {
print "Checking $word_boundary.int and disambig.int\n";
if (!open (W, "<$lang/phones/$word_boundary.int")) {
$exit = 1; print "--> ERROR: fail to open $lang/phones/$word_boundary.int\n";
}
while (<W>) {
@A = split;
if (@A != 2) {
$exit = 1; print "--> ERROR: bad line $_ in $lang/phones/word_boundary.int\n";
$exit = 1; print "--> ERROR: bad line $_ in $lang/phones/$word_boundary.int\n";
}
$wbtype{$A[0]} = $A[1];
}
@@ -814,23 +858,58 @@ sub check_summation {
next;
}
$wlen = int(rand(100)) + 1;
print "--> generating a $wlen word sequence\n";
$end_subword = 0;
print "--> generating a $wlen word/subword sequence\n";
$wordseq = "";
$sid = 0;
$wordseq_syms = "";
foreach (1 .. $wlen) {
# exclude disambiguation symbols, BOS and EOS, epsilon, and
# grammar-related symbols from the word sequence.
while ($sid < ($wlen - 1)) {
$id = int(rand(scalar(keys %wint2sym)));
# exclude disambiguation symbols, BOS and EOS, epsilon, and
# grammar-related symbols from the word sequence.
while (defined $wdisambig_words_hash{$id} or
$wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
$wint2sym{$id} =~ m/^#nonterm/ or $id == 0) {
$wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
$wint2sym{$id} =~ m/^#nonterm/ or $id == 0) {
$id = int(rand(scalar(keys %wint2sym)));
}
$wordseq_syms = $wordseq_syms . $wint2sym{$id} . " ";
$wordseq = $wordseq . "$sid ". ($sid + 1) . " $id $id 0\n";
$sid ++;

if ($subword_check) {
$subword = $wint2sym{$id};
$suffix = substr($subword, -$separator_length, $separator_length);
if ($suffix ne $separator) {
$end_subword ++;
}
}
}

# generate the last word (subword)
$id = int(rand(scalar(keys %wint2sym)));
if ($subword_check) {
$subword = $wint2sym{$id};
$suffix = substr($subword, -$separator_length, $separator_length);
# the last subword must not end with the separator
while (defined $wdisambig_words_hash{$id} or
$wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
$wint2sym{$id} =~ m/^#nonterm/ or $id == 0 or $suffix eq $separator) {
$id = int(rand(scalar(keys %wint2sym)));
$subword = $wint2sym{$id};
$suffix = substr($subword, -$separator_length, $separator_length);
}
$end_subword ++;
} else {
while (defined $wdisambig_words_hash{$id} or
$wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
$wint2sym{$id} =~ m/^#nonterm/ or $id == 0) {
$id = int(rand(scalar(keys %wint2sym)));
}
}
$wordseq_syms = $wordseq_syms . $wint2sym{$id} . " ";
$wordseq = $wordseq . "$sid ". ($sid + 1) . " $id $id 0\n";
$sid ++;

$wordseq = $wordseq . "$sid 0";
$phoneseq = `. ./path.sh; echo \"$wordseq" | fstcompile | fstcompose $lang/$fst - | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint | awk '{if (NF > 2) {print \$3}}';`;
$transition = { }; # empty assoc. array of allowed transitions between phone types. 1 means we count a word,
@@ -861,10 +940,10 @@ sub check_summation {
$state = $wbtype{$phone};
}
if (!defined $state) {
$exit = 1; print "--> ERROR: phone $phone is not specified in $lang/phones/word_boundary.int\n";
$exit = 1; print "--> ERROR: phone $phone is not specified in $lang/phones/$word_boundary.int\n";
last;
} elsif (!defined $transition{$cur_state, $state}) {
$exit = 1; print "--> ERROR: transition from state $cur_state to $state indicates error in word_boundary.int or L.fst\n";
$exit = 1; print "--> ERROR: transition from state $cur_state to $state indicates error in $word_boundary.int or L.fst\n";
last;
} else {
$num_words += $transition{$cur_state, $state};
@@ -873,10 +952,13 @@ sub check_summation {
}
}
if (!$exit) {
if ($subword_check) {
$wlen = $end_subword;
}
if ($num_words != $wlen) {
$phoneseq_syms = "";
foreach my $id (split(" ", $phoneseq)) { $phoneseq_syms = $phoneseq_syms . " " . $pint2sym{$id}; }
$exit = 1; print "--> ERROR: number of reconstructed words $num_words does not match real number of words $wlen; indicates problem in $fst or word_boundary.int. phoneseq = $phoneseq_syms, wordseq = $wordseq_syms\n";
$exit = 1; print "--> ERROR: number of reconstructed words $num_words does not match real number of words $wlen; indicates problem in $fst or $word_boundary.int. phoneseq = $phoneseq_syms, wordseq = $wordseq_syms\n";
} else {
print "--> resulting phone sequence from $fst corresponds to the word sequence\n";
print "--> $fst is OK\n";