srvk / eesen-transcriber

EESEN based offline transcriber VM using models trained on TEDLIUM and Cantab Research
Apache License 2.0
49 stars 14 forks source link

Vagrantfile needs some love #29

Closed aolney closed 5 years ago

aolney commented 5 years ago

The default vagrant file was for google/gce and kaldi. Getting it to produce a viable installation was a bit tricky because the kaldi install requires various pieces of the eesen install, which the Vagrantfile currently blocks by making kaldi/eesen either-or rather than and-then. Below is my Vagrantfile after chasing these things down. This does not include the fix to path.sh described in https://github.com/srvk/eesen-transcriber/issues/28 . YMMV/HTH:

# -*- mode: ruby -*-
# vi: set ft=ruby

# you might want to install:
#vagrant-aws (0.3.0)
#vagrant-azure (2.0.0)
#vagrant-google (2.2.0)
#vagrant-sshfs (1.3.1)
#vagrant-vbguest (0.15.2)

Vagrant.configure("2") do |config|

  # Default provider is VirtualBox!
  #   vagrant plugin install vagrant-vbguest
  # If you want AWS, you need to populate and run e.g.
  #   . aws.sh; vagrant up --provider aws
  # Make sure you don't check in aws.sh (maybe make a copy with your "secret" data)
  # Before that, do
  #   vagrant plugin install vagrant-aws; vagrant plugin install vagrant-sshfs
  # Similarly for vagrant-azure and vagrant-google
  #config.vm.box = "ubuntu/trusty64"
  config.vm.box = "bento/ubuntu-16.04"
  #config.vm.box = "google/gce"
  config.vm.synced_folder ".", "/vagrant", :mount_options => ["dmode=777", "fmode=777"]

  config.vm.provider "virtualbox" do |vbox|
    config.ssh.forward_x11 = true

    # enable (uncomment) this for debugging output
    #vbox.gui = true

    # turn off annoying auto update guest additions
    config.vbguest.auto_update = false

    # host-only network on which web browser serves files
    config.vm.network "private_network", ip: "192.168.56.101"
#, nic_type: "virtio" 

    vbox.cpus = 4
    vbox.memory = 32768
  end

  config.vm.provider "aws" do |aws, override|
    aws.tags["Name"] = "Eesen Transcriber"
    #aws.ami = "ami-663a6e0c" # Ubuntu ("Trusty") Server 14.04 LTS AMI - US-East region
    aws.ami = "ami-e5d9439a" # Ubuntu ("Xenial") Server 16.04 LTS AMI - US-East region
    aws.instance_type = "m3.xlarge"

    override.vm.synced_folder ".", "/vagrant", type: "sshfs", ssh_username: ENV['USER'], ssh_port: "22", prompt_for_password: "true"

    override.vm.box = "http://speech-kitchen.org/dummy.box"

    # it is assumed these environment variables were set by ". aws.sh"
    aws.access_key_id = ENV['AWS_KEY']
    aws.secret_access_key = ENV['AWS_SECRETKEY']
    aws.keypair_name = ENV['AWS_KEYPAIR']
    override.ssh.username = "ubuntu"
    override.ssh.private_key_path = ENV['AWS_PEM']

    aws.terminate_on_shutdown = "true"
    aws.region = ENV['AWS_REGION']

    # https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#SecurityGroups
    # Edit the security group on AWS Console; Inbound tab, add the HTTP rule
    aws.security_groups = "launch-wizard-1"

    #aws.subnet_id = "vpc-666c9a02"
    aws.region_config "us-east-1" do |region|
      #region.spot_instance = true
      region.spot_max_price = "0.1"
    end

    # this works around the error from AWS AMI vm on 'vagrant up':
    #   No host IP was given to the Vagrant core NFS helper. This is
    #   an internal error that should be reported as a bug.
    #override.nfs.functional = false
  end

  config.vm.provider "google" do |google, override|
    #google.tags["Name"] = "Eesen Transcriber"
    #google.google_project_id = "YOUR_GOOGLE_CLOUD_PROJECT_ID"
    #google.google_client_email = "YOUR_SERVICE_ACCOUNT_EMAIL_ADDRESS"
    #google.google_json_key_location = "/path/to/your/private-key.json"

    google.image_family = 'ubuntu-1604-lts'

    override.ssh.username = "ubuntu"
    #override.ssh.private_key_path = "~/.ssh/id_rsa"
    #override.ssh.private_key_path = "~/.ssh/google_compute_engine"

    #override.vm.synced_folder ".", "/vagrant", type: "sshfs", ssh_username: ENV['USER'], ssh_port: "22", prompt_for_password: "true"

    #google.terminate_on_shutdown = "true"
  end

  config.vm.provision "shell", inline: <<-SHELL
    #add virtual memory
    sudo apt install swapspace -y

    # change to 'kaldi' for Aspire models
#    TOOLKIT='eesen'
    TOOLKIT='kaldi'

    # turn off debconf prompting (annoying grub prompt)
    export DEBIAN_FRONTEND=noninteractive  

    apt-get update
    #apt-get upgrade -y
    apt-get -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" upgrade

    if grep --quiet vagrant /etc/passwd
    then
      user="vagrant"
    else
      user="ubuntu"
    fi

    sudo apt-get install -y git make automake libtool libtool-bin autoconf patch subversion fuse\
       libatlas3-base libatlas-base-dev libatlas-dev liblapack-dev sox openjdk-8-jre libav-tools g++\
       zlib1g-dev libsox-fmt-all apache2 sshfs

    # fix dash-as-bash
    rm /bin/sh
    ln -s /bin/bash /bin/sh  

    # If you wish to train EESEN with a GPU machine, uncomment this section to install CUDA
    # also uncomment the line that mentions cudatk-dir in the EESEN install section below
    #cd /home/${user}
    #wget -nv http://speech-kitchen.org/vms/Data/cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb
    #dpkg -i cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb
    #rm cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb
    #apt-get update                                                                  
    #apt-get remove --purge xserver-xorg-video-nouveau                           
    #apt-get install -y cuda

    if [ $TOOLKIT == 'kaldi' ]
    then
      # install Kaldi
      cd /home/${user}
      git clone https://github.com/kaldi-asr/kaldi
      cd kaldi
      git reset --hard 70748308810f
      cd tools
      extras/check_dependencies.sh
      make -j`nproc` || make || exit 1;
      # `lscpu -p|grep -v "#"|wc -l`
      cd ../src
      ./configure --shared
      make depend
      make -j`nproc` || make || exit 1;
      #  `lscpu -p|grep -v "#"|wc -l`
    fi

    # install srvk EESEN (does not require CUDA)
    #AO: kaldi mode requires eesen net-output-extract
    cd /home/${user}
    git clone https://github.com/srvk/eesen
    cd eesen
    git reset --hard 581d80f  # 2016/12/09 support OpenFST 1.5.1
    cd tools
    make -j`nproc` 
    # remove a parameter from scoring script
    sed -i 's/\ lur//g' sctk/bin/hubscr.pl
    cd ../src
    ./configure --shared #--cudatk-dir=/opt/nvidia/cuda
    make -j`nproc` 

    #AO: kaldi mode doesn't create this directory
    mkdir -p /home/${user}/eesen/asr_egs/tedlium/v2-30ms

    # install language model building toolkit
    cd /home/${user}/eesen/asr_egs/tedlium/v2-30ms
    git clone http://github.com/srvk/lm_build

    # get eesen-offline-transcriber
    mkdir -p /home/${user}/tools
    cd /home/${user}/tools
    git clone https://github.com/srvk/srvk-eesen-offline-transcriber
    mv srvk-eesen-offline-transcriber eesen-offline-transcriber
    # make links to EESEN
    cd eesen-offline-transcriber

    if [ $TOOLKIT == 'kaldi' ]
    then
      ln -s /home/${user}/kaldi/egs/swbd/s5/steps .
      ln -s /home/${user}/kaldi/egs/swbd/s5/utils .

      # aspire models - 8khz online/nnet3 chained
      cd /home/${user}/kaldi/egs
      wget -nv http://speech-kitchen.org/vms/Data/aspire-chain-model-srvk.tgz
      tar zxvf aspire-chain-model-srvk.tgz --dereference
      rm aspire-chain-model-srvk.tgz

      # fix a hard coded pathname in model config files
      sed -i s/er1k/${user}/g aspire/s5/exp/tdnn_7b_chain_online/conf/online.conf
      sed -i s/er1k/${user}/g aspire/s5/exp/tdnn_7b_chain_online/conf/ivector_extractor.conf
    else
      ln -s /home/${user}/eesen/asr_egs/tedlium/v2-30ms/steps .
      ln -s /home/${user}/eesen/asr_egs/tedlium/v2-30ms/utils .
    fi

    #AO: should always get?
    # get models
    cd /home/${user}/eesen/asr_egs/tedlium
    wget -nv http://speech-kitchen.org/vms/Data/v2-30ms.tgz
    tar zxvf v2-30ms.tgz --dereference
    rm v2-30ms.tgz
    # optionally get 8khz models
    if [ -f /vagrant/swbd-v1-pitch.tgz ]
    then
       cd /home/${user}/eesen/asr_egs/swbd
       tar zxvf /vagrant/swbd-v1-pitch.tgz
    fi

    # Uncomment for optional large language model rescoring
    # produces generally 2% better Word Error Rates at the epense of longer
    # decoding time and memory requirements (Requires guest VM setting
    # of at least vbox.memory = 15360, just barely fitting in a 16GB host
    # computer - with warnings)   Substitute "make -f Makefile.rescore"
    # for "make" in run scripts (speech2text.sh and friends) to use this.
    # 
    # cd /home/${user}
    # wget http://speech-kitchen.org/vms/Data/rescore-eesen.tgz
    # tar zxvf rescore-eesen.tgz
    # rm rescore-eesen.tgz    

    # Results (and intermediate files) are placed on the shared host folder
    mkdir -p /vagrant/{build,log,transcribe_me,src-audio}

    ln -s /vagrant/build /home/${user}/tools/eesen-offline-transcriber/build
    ln -s /vagrant/src-audio /home/${user}/tools/eesen-offline-transcriber/src-audio

    # get XFCE, xterm if we want guest VM to open windows /menus on host
    #sudo apt-get install -y xfce4-panel xterm

    # Apache: set up web content
    cd /vagrant
    git clone http://github.com/srvk/www

    # set the shared folder to be (mounted as a shared folder in the VM) "www"
    sed -i 's|/var/www/html|/vagrant/www|g' /etc/apache2/sites-enabled/000-default.conf
    sed -i 's|/var/www/|/vagrant/www/|g' /etc/apache2/apache2.conf
    service apache2 restart

    # shorten paths used by vagrant ssh -c <command> commands
    # by symlinking ~/bin to here
    ln -s /home/${user}/tools/eesen-offline-transcriber /home/${user}/bin

    # get SLURM stuff
    apt-get install -y --no-install-recommends slurm-llnl < /usr/bin/yes
    /usr/sbin/create-munge-key -f
    mkdir -p /var/run/munge /var/run/slurm-llnl
    chown munge:root /var/run/munge
    chown slurm:slurm /var/run/slurm-llnl
    echo 'OPTIONS="--syslog"' >> /etc/default/munge
    cp /vagrant/conf/slurm.conf /etc/slurm-llnl/slurm.conf
    cp /vagrant/conf/reconf-slurm.sh /root/
    # 
    # Supervisor stuff needed by slurm
    # copy config first so it gets picked up
    cp /vagrant/conf/supervisor.conf /etc/supervisor.conf
    mkdir -p /etc/supervisor/conf.d
    cp /vagrant/conf/slurm.sv.conf /etc/supervisor/conf.d/
    # Now start service
    apt-get install -y supervisor

    # Turn off release upgrade messages
    sed -i s/Prompt=lts/Prompt=never/ /etc/update-manager/release-upgrades
    rm -f /var/lib/ubuntu-release-upgrader/*
    /usr/lib/ubuntu-release-upgrader/release-upgrade-motd

    # Silence error message from missing file
    touch /home/${user}/.Xauthority 

    # Provisioning runs as root; we want files to belong to '${user}'
    chown -R ${user}:${user} /home/${user}

    # Handy info
    echo ""
    echo "------------------------------------------------------------"
    echo ""
    echo "  Watching folder [...]/eesen-transcriber/transcribe_me/"
    echo "    for new files to transcribe. Output *.ctm files"
    echo "    will appear alongside the original audio files"
    echo "    logs are in [...]/eesen-transcriber/log/"
    echo ""
    echo "  Point your Chrome or Safari browser to "
    if [ ${user} == vagrant ] 
    then
      echo "  http://192.168.56.101/ to view transcription results"
    else
      publicIP=`curl -s http://169.254.169.254/latest/meta-data/public-ipv4`
      echo "  http://${publicIP}/ to view transcription results"
    fi
    echo ""
    echo "------------------------------------------------------------"
  SHELL
end

# always monitor watched folder
Vagrant.configure("2") do |config|
  config.vm.provision "shell", run: "always", inline: <<-SHELL
    if grep --quiet vagrant /etc/passwd
    then
      user="vagrant"
    else
      user="ubuntu"
    fi

    rm -rf /var/run/motd.dynamic

    su ${user} -c "cd /home/${user}/tools/eesen-offline-transcriber && ./watch.sh >& /vagrant/log/watched.log &"
  SHELL
end
fmetze commented 5 years ago

should be fixed with upcoming commit, thanks!