Commit a87bf9e6 authored by Cassio Batista's avatar Cassio Batista 🐢

bash scripts moved from previous repo

parent f811d8a2
# CMU Sphinx - tutorial para treino de modelo acústico
According to CMUSphinx's [tutorial AM](https://cmusphinx.github.io/wiki/tutorialam/),
the directory tree for new projects must follow the structure below:
```
                my_db_dir/
 .--------------:---------------.
 │                              │
etc/                           wav/
 ├─ my_db.dic                   ├─ spkr_1/
 ├─ my_db.phone                 │   ├─ s1_file_1.wav
 ├─ my_db.lm.DMP                │   ├─ s1_file_2.wav
 ├─ my_db.filler                │   └─ s1_file_n.wav
 ├─ my_db_train.fileids         ├─ spkr_2/
 ├─ my_db_train.transcription   │   ├─ s2_file_1.wav
 ├─ my_db_test.fileids          │   ├─ s2_file_2.wav
 └─ my_db_test.transcription    │   └─ s2_file_n.wav
                                └─ spkr_n/
                                    ├─ sn_file_1.wav
                                    ├─ sn_file_2.wav
                                    └─ sn_file_n.wav
```
* __fb\_00\_create\_envtree.sh__:
This script creates the directory structure shown above, except for the `spkr_X`
directories inside the `wav` folder. Note that the data-dependent files (inside
the `etc` dir), although created, __DO NOT__ have any content yet. In other
words, they are only initialized as empty files. A stupid choice of the developer.
* __fb\_01\_split\_train\_test.sh__:
This script fills in the `fileids` and `transcription` files in the `etc/` dir.
The data is divided into a training set and a test set, and the contents of
these files are data-dependent. The folders `wav/spkr_X` contain symbolic links
to the actual files in the wav-transcription base dir.
* __fb\_02\_define\_etclang.sh__:
This script fills in the language files inside the `my_db_dir/etc` dir: .dic,
.filler and .phone. It depends on the `g2p` software, which must be installed
and have its location available in the PATH env variable.
__NOTE__: Unless you want to build your own dictionary, you DO NOT need to
perform this step, since the dict files you would need are already available in
our GitHub repo.
__Copyright Grupo FalaBrasil (2018)__
__Federal University of Pará__
__Author: Cassio Batista - cassio.batista.13@gmail.com__
#!/bin/bash
#
# Create environment tree for training acoustic models with CMU Sphinx
#
# Copyleft Grupo FalaBrasil (2018)
#
# Author: Mar 2018
# Cassio Batista - cassio.batista.13@gmail.com
# Federal University of Pará (UFPA)
#
# Reference:
# https://cmusphinx.github.io/wiki/tutorialam/
# Validate the command line: exactly one argument, the project directory.
if [[ $# -ne 1 ]]; then
  echo "A script to create the environment tree for training acoustic models"
  echo "according to CMUSphinx's pattern."
  echo "Ref.: https://cmusphinx.github.io/wiki/tutorialam/"
  echo
  echo "Usage: $0 <proj_dir>"
  echo -e "\t<proj_dir> must be the path for your project folder"
  echo -e "\te.g.: /home/cassio/sphinx/MEUPROJETO"
  exit 1
fi

DATA_DIR="$1"

# An existing directory is only replaced after explicit confirmation.
if [[ -d "$DATA_DIR" ]]; then
  echo -n "'$DATA_DIR' exists as dir. Override? [y/N] "
  read -r ans
  if [[ "$ans" != "y" ]]; then
    echo "aborted."
    exit 0
  fi
  # ':?' aborts instead of expanding to an empty path.
  rm -rf -- "${DATA_DIR:?}"
fi

# The database name (prefix of every file in etc/) is the directory's basename.
basefilename=$(basename -- "$DATA_DIR")

mkdir -p -- "$DATA_DIR/etc" "$DATA_DIR/wav" || exit 1

# Resolve to an absolute path *before* changing directory; otherwise a relative
# argument would no longer point at the project once we are inside it.
DATA_DIR=$(readlink -f -- "$DATA_DIR") || exit 1
cd "$DATA_DIR" || exit 1

# Data-dependent files are only created empty here; later scripts fill them.
touch -- "etc/${basefilename}"{.dic,.phone,.lm.DMP,.filler} \
  "etc/${basefilename}"{_train.fileids,_train.transcription} \
  "etc/${basefilename}"{_test.fileids,_test.transcription} || exit 1

# Show the resulting layout; fall back to find when 'tree' is not installed.
if command -v tree > /dev/null 2>&1; then
  tree "$DATA_DIR"
else
  find "$DATA_DIR" -print
fi
echo "check out your project dir at '$DATA_DIR'"
### EOF ###
#!/bin/bash
#
# A script that creates the fileids and transcriptions files inside the etc/
# folder and also create symlinks of the audio dataset within the wav folder
#
# Copyleft Grupo FalaBrasil (2018)
#
# Author: March 2018
# Cassio Batista - cassio.batista.13@gmail.com
# Federal University of Pará (UFPA)
#
# Reference:
# https://cmusphinx.github.io/wiki/tutorialam/
# Verbose progress output switch. The name is kept exactly as spelled because
# the functions of this script read `DEGUB`.
DEGUB=false
# true:  shuffle the whole dataset and hold out 10% of it for test
# false: use only the '$dir_test' sub-directory of the dataset for test
SPLIT_RANDOM=false
dir_test="frases16k"

# Validate the command line: two arguments, both existing directories.
if [[ $# -ne 2 ]]; then
  echo "A script that creates the fileids and transcriptions files inside the etc/"
  echo "folder and also create symlinks of the audio dataset within the wav folder"
  echo
  echo "Usage: $0 <audio_dataset_dir> <sphinx_project_dir>"
  echo -e "\t<audio_dataset_dir> is the folder that contains all your audio base (wav + transcript.)."
  echo -e "\t<sphinx_project_dir> is the folder where you previously hosted your project."
  echo -e "\t e.g.: /home/cassio/sphinx/MEUPROJETO"
  exit 1
elif [[ ! -d "$1" || ! -d "$2" ]]; then
  # Quoted tests: an unquoted path containing spaces makes `[` fail with
  # "too many arguments", which would silently skip this check.
  echo "Error: both '$1' and '$2' must be dirs" >&2
  exit 1
fi
#######################################
# Link one utterance (audio + transcription) into its speaker directory.
# Arguments:
#   $1 - sphinx project directory
#   $2 - speaker ID (name of the sub-directory created under <project>/wav)
#   $3 - path of the audio (.wav) file
#   $4 - path of the transcription (.txt) file
# Returns:
#   non-zero if the speaker directory cannot be created or linking fails
#######################################
function split_dataset_bg() {
  local spkr_dir="${1}/wav/${2}"

  # create a dir for the speaker and link both files to it
  mkdir -p -- "$spkr_dir" || return
  # -f replaces a link left over from a previous run instead of failing
  ln -sf -- "$3" "$4" "$spkr_dir"
}
# 0.) split train test
#######################################
# Populate <project>/wav/<speaker>/ with symlinks to every utterance listed
# in "<set>.list.0" (one extension-less path per line, read from the cwd).
# Globals:
#   DEGUB (read) - when "true", a progress header is printed
# Arguments:
#   $1 - sphinx project directory
#   $2 - set name ("train" or "test"); selects the list file
# Outputs:
#   status messages on stdout
# Notes:
#   The wav/ tree is not wiped here: this function is launched concurrently
#   for the test and the train set, and a wipe performed by one invocation
#   could delete links the other one had already created. Links for the listed
#   utterances are replaced; entries of an older dataset are left in place.
#######################################
function split_dataset() {
  local proj_dir=$1 set_name=$2
  local list_file="${set_name}.list.0"
  local line abs parent spkr wav txt
  local -a pids=()

  if [[ ! -d "${proj_dir}/wav" || ! -d "${proj_dir}/etc" ]]; then
    echo "warning: you may not had run the fb_00 script!"
  fi
  mkdir -p -- "${proj_dir}/wav" || return 1

  if [[ $DEGUB == true ]]; then echo -ne "defining $set_name set: "; fi

  while IFS= read -r line || [[ -n "$line" ]]; do
    # define the ID speaker (same name of the folder holding the utterance)
    abs=$(readlink -f -- "$line")
    parent=${abs%/*}
    spkr=${parent##*/}

    # get the fullpath of audio and transcriptions files
    wav=$(readlink -f -- "${line}.wav")
    txt=$(readlink -f -- "${line}.txt")

    # drop links of a previous run so that linking cannot fail on re-runs
    rm -f -- "${proj_dir}/wav/${spkr}/${wav##*/}" \
      "${proj_dir}/wav/${spkr}/${txt##*/}"

    # execute process of creating symlinks in background
    (split_dataset_bg "$proj_dir" "$spkr" "$wav" "$txt") &
    pids+=("$!")
  done < "$list_file"

  # Block until every link job has finished (a fixed sleep gives no guarantee).
  if ((${#pids[@]} > 0)); then
    wait "${pids[@]}"
  fi
  echo -e "\ndone splitting $set_name"
}
# 1.) create fileids
#######################################
# Write <project>/etc/<db>_<set>.fileids with one "<speaker>/<utterance>"
# entry per line of "<set>.list.1" (extension-less paths, read from the cwd).
# <db> is the basename of the project directory; <speaker> is the name of the
# directory that holds the utterance.
# Globals:
#   DEGUB (read) - when "true", a progress counter is printed
# Arguments:
#   $1 - sphinx project directory
#   $2 - set name ("train" or "test")
# Outputs:
#   status messages on stdout; the fileids file is rewritten from scratch
#######################################
function create_fileids() {
  local proj_dir=$1 set_name=$2
  local dbname list_file out_file
  local line abs parent spkr wavname
  local n i=1

  dbname=$(basename -- "$proj_dir")
  list_file="${set_name}.list.1"
  out_file="${proj_dir}/etc/${dbname}_${set_name}.fileids"

  if [[ $DEGUB == true ]]; then echo -ne "fileids for $set_name set: "; fi
  n=$(wc -l < "$list_file")

  # fd 3 carries the file content so progress output can stay on stdout;
  # the redirection also truncates any previous fileids file.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # define the ID speaker (same name of the folder)
    abs=$(readlink -f -- "$line")
    parent=${abs%/*}
    spkr=${parent##*/}
    wavname=${line##*/}

    # create etc/fileids
    printf '%s\n' "${spkr}/${wavname}" >&3

    if [[ $DEGUB == true ]]; then
      echo -ne "\r\t\t\t\t\t\t$i/$n"
      i=$((i + 1))
    fi
  done < "$list_file" 3> "$out_file"

  if [[ $DEGUB == true ]]; then echo; fi
  echo -e "\ndone fileids"
}
# 2.) create transcription files
#######################################
# Write <project>/etc/<db>_<set>.transcription with one line
#   <s> TEXT </s> (UTTERANCE)
# per entry of "<set>.list.2" (extension-less paths, read from the cwd).
# TEXT is the content of "<entry>.txt" with every 'ü' replaced by 'u';
# UTTERANCE is the basename of the entry.
# Arguments:
#   $1 - sphinx project directory
#   $2 - set name ("train" or "test")
# Outputs:
#   a progress counter and status messages on stdout; the transcription file
#   is rewritten from scratch
#######################################
function create_trans() {
  local proj_dir=$1 set_name=$2
  local dbname list_file out_file
  local line wavname txt
  local n i=1

  dbname=$(basename -- "$proj_dir")
  list_file="${set_name}.list.2"
  out_file="${proj_dir}/etc/${dbname}_${set_name}.transcription"

  if [[ $DEGUB == true ]]; then echo -ne "transcription for $set_name set: "; fi
  n=$(wc -l < "$list_file")

  # fd 3 carries the file content so the progress counter can stay on stdout;
  # the redirection also truncates any previous transcription file.
  while IFS= read -r line || [[ -n "$line" ]]; do
    wavname=${line##*/}
    # get the fullpath transcriptions files
    txt=$(readlink -f -- "${line}.txt")

    # create etc/transcription; sed reads the file by name, so a missing
    # transcription can never consume the list being read by this loop
    printf '%s\n' "<s> $(sed 's/ü/u/g' -- "$txt") </s> ($wavname)" >&3

    echo -ne "\r\t\t\t\t\t\t$i/$n"
    i=$((i + 1))
  done < "$list_file" 3> "$out_file"

  echo
  echo -e "\ndone transcriptions"
}
### main ###
# Build train.list / test.list: one utterance per line, path without the
# ".wav" extension. The suffix is stripped with an anchored, escaped pattern so
# that "wav" appearing elsewhere in a path is left alone.
if [[ $SPLIT_RANDOM == true ]]; then
  echo -e "\033[1mshuffling dataset...\033[0m"
  find "$1" -name '*.wav' | sed 's/\.wav$//' | shuf > filelist.tmp
  ntotal=$(wc -l < filelist.tmp)
  ntest=$((ntotal / 10))      # 10% test
  ntrain=$((ntotal - ntest))  # 90% train
  head -n "$ntrain" filelist.tmp > train.list
  tail -n "$ntest" filelist.tmp > test.list
  rm -f filelist.tmp
else
  echo "warning: using only '$dir_test' for test"
  find "$1" -name '*.wav' | grep -vF -- "$dir_test" | sed 's/\.wav$//' > train.list
  find "${1}/${dir_test}" -name '*.wav' | sed 's/\.wav$//' > test.list
fi

# Every worker reads its own copy of the list: <set>.list.0 (symlinks),
# <set>.list.1 (fileids) and <set>.list.2 (transcriptions).
for set_name in train test; do
  for stage in 0 1 2; do
    cp -- "${set_name}.list" "${set_name}.list.${stage}"
  done
done
rm -f train.list test.list

# Start from an empty wav/ tree, cleared once before any worker is launched.
rm -rf -- "${2:?}/wav"
mkdir -p -- "${2}/wav"

echo -e "\033[1msplitting dataset (bg)...\033[0m"
(split_dataset "$2" "test") &
(split_dataset "$2" "train") &

echo -e "\033[1mcreating fileids (bg)...\033[0m"
(create_fileids "$2" "test") &
(create_fileids "$2" "train") &

echo -ne "\033[1mcreating transcription files (fg)...\033[0m"
(create_trans "$2" "test") &
create_trans "$2" "train"

# Do not report completion (or remove the lists) while workers are running.
wait
echo -e "\e[1mDone!\e[0m"
rm -f train.list.* test.list.*
#(play -q doc/KDE-Im-Sms.ogg)&
#notify-send "'$0' finished"
### EOF ###
#!/bin/bash
#
# A script that creates the language files inside the etc/ dir (.dic, .filler
# and .phone), except the language model, which must be created with SRILM in
# the ARPA format
#
# Copyleft Grupo FalaBrasil (2018)
#
# Author: March 2018
# Cassio Batista - cassio.batista.13@gmail.com
# Federal University of Pará (UFPA)
#
# Reference:
# https://cmusphinx.github.io/wiki/tutorialam/
# Validate the command line: one argument, an existing project directory.
if test $# -ne 1
then
echo "A script that creates the language files inside the etc/ dir "\
"(.dic, .filler and .phone), except the language model, "\
"which is created by SRILM."
echo
echo "Usage: $0 <sphinx_project_dir>"
echo -e "\t<sphinx_project_dir> is the folder where you previously "\
"hosted your project."
echo -e "\t e.g.: ${HOME}/sphinx/MEUPROJETO"
echo
echo "NOTE: If you have downloaded the dict from our github" \
"you DO NOT need to perform this step."
echo "Check it out: https://github.com/falabrasil/phonetic-dicts/"
exit 1
# Quoted test: an unquoted path containing spaces makes `[` fail with
# "too many arguments", which would silently skip this check.
elif [[ ! -d "$1" ]]
then
echo "Error: '$1' must be a dir" >&2
exit 1
fi
# 0) create wordlist
# eight
# five
# four
# nine
#######################################
# Collect every distinct word found in the transcription files (*.txt) below
# <project>/wav/ and write them, sorted, one per line, to ./wordlist.tmp.
# Arguments:
#   $1 - sphinx project directory
# Outputs:
#   a status message on stdout; ./wordlist.tmp is overwritten
#######################################
function create_wordlist() {
  echo "creating wordlist..."
  # awk splits each file on blanks/newlines by itself, so words are never
  # glob-expanded by the shell and files without a trailing newline do not
  # merge with the next file. The pattern is quoted so find, not the shell,
  # expands it.
  find "${1}/wav/" -name '*.txt' \
    -exec awk '{ for (i = 1; i <= NF; i++) print $i }' {} + |
    sort -u > wordlist.tmp
}
# 1) your_db.dic
# eight ey t
# five f ay v
# four f ao r
#######################################
# Build <project>/etc/<db>.dic: run the g2p tool (lapsg2p) on ./wordlist.tmp
# to produce ./dict.tmp, then pass that file through convert_dict_to_ascii.py
# (looked up in the cwd) to write the final dictionary.
# <db> is the basename of the project directory.
# Arguments:
#   $1 - sphinx project directory
# Outputs:
#   a status message on stdout, errors on stderr
# Exits:
#   1 (terminates the script) when lapsg2p is not installed or produced no
#   dictionary
#######################################
function create_dic() {
  local dbname
  dbname=$(basename -- "$1")

  echo -n "creating '${dbname}.dic' file... "
  if ! command -v lapsg2p > /dev/null 2>&1; then
    echo "error: g2p must be installed" >&2
    exit 1
  fi

  # The tool's own output is discarded, so check its result file explicitly.
  lapsg2p -w wordlist.tmp -d dict.tmp > /dev/null 2>&1
  if [[ ! -f dict.tmp ]]; then
    echo "error: lapsg2p did not produce 'dict.tmp'" >&2
    exit 1
  fi

  python convert_dict_to_ascii.py dict.tmp "${1}/etc/${dbname}.dic"
  echo
}
# 2) your_db.phone
# ah
# ao
# ay
# eh
#######################################
# Build <project>/etc/<db>.phone: the sorted list of distinct phones, i.e. of
# every field after the first (the word) on each line of <project>/etc/<db>.dic.
# <db> is the basename of the project directory.
# Arguments:
#   $1 - sphinx project directory
# Outputs:
#   a status message on stdout; the .phone file is overwritten
#######################################
function create_phone() {
  local dbname
  dbname=$(basename -- "$1")

  echo -n "creating '${dbname}.phone' file... "
  # NOTE(review): the filler dictionary maps to the SIL phone, which is not
  # added here - confirm whether the phone list is expected to contain it.
  awk '{ for (i = 2; i <= NF; i++) print $i }' < "${1}/etc/${dbname}.dic" |
    sort -u > "${1}/etc/${dbname}.phone"
  echo
}
# 3) your_db.filler
# <s> SIL
# </s> SIL
# <sil> SIL
#######################################
# Write <project>/etc/<db>.filler, mapping the three silence markers
# (<s>, </s>, <sil>) to the SIL phone. <db> is the basename of the project
# directory.
# Arguments:
#   $1 - sphinx project directory
# Outputs:
#   a status message on stdout; the .filler file is overwritten
#######################################
function create_filler() {
  local dbname
  dbname=$(basename -- "$1")

  echo -n "creating '${dbname}.filler' file... "
  printf '%s\n' "<s> SIL" "</s> SIL" "<sil> SIL" > "${1}/etc/${dbname}.filler"
  echo
}
### MAIN ###
create_wordlist "$1"
create_dic "$1"
create_phone "$1"
create_filler "$1"
echo -e "\e[1mDone!\e[0m"

# Remove only the scratch files of this script; a blanket "*.tmp" would also
# delete unrelated files that happen to live in the current directory.
rm -f -- wlist.tmp wordlist.tmp dict.tmp plist.tmp phonelist.tmp

# Optional desktop notifications: skipped when the tools are not installed.
if command -v play > /dev/null 2>&1; then
  (play -q doc/KDE-Im-Sms.ogg) &
fi
if command -v notify-send > /dev/null 2>&1; then
  notify-send "'$0' finished"
fi
### EOF ###
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment