mesolitica / malaya-speech

Speech Toolkit for Malaysian language, https://malaya-speech.readthedocs.io/
https://malaya-speech.readthedocs.io/
MIT License
240 stars 42 forks source link

Retraining from output-large-singlish-conformer checkpoint #21

Closed mr-coconut closed 2 years ago

mr-coconut commented 2 years ago

20

Hi, thanks for the reply to the last issue. Now I can make the training script run, but the loss seems to be very high. I will attach the code and result below:

import pyroomacoustics as pra
import numpy as np
from pydub import AudioSegment
from sklearn.utils import shuffle
from glob import glob
import random
import json
import malaya_speech.train as train
import malaya_speech.config
import malaya_speech.train.model.transducer as transducer
import malaya_speech.train.model.conformer as conformer
import malaya_speech.augmentation.spectrogram as mask_augmentation
import malaya_speech.augmentation.waveform as augmentation
import malaya_speech
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import warprnnt_tensorflow

# Load subwords
subwords = malaya_speech.subword.load('models--huseinzol05--vocab/snapshots/435be3ad79f174b6020c355f98fc1a1a1c50bed0/transducer-singlish.subword.subwords')
config = malaya_speech.config.conformer_large_encoder_config
sr = 16000
maxlen = 18
maxlen_subwords = 100
minlen_text = 1
parameters = {
    'optimizer_params': {'beta1': 0.9, 'beta2': 0.98, 'epsilon': 10e-9},
    'lr_policy_params': {
        'warmup_steps': 40000,
        'max_lr': (0.05 / config['dmodel']),
    },
}
prob_aug = 0.9
featurizer = malaya_speech.tf_featurization.STTFeaturizer(
    normalize_per_feature=True
)
n_mels = featurizer.num_feature_bins

def transformer_schedule(step, d_model, warmup_steps=4000, max_lr=None):
    arg1 = tf.math.rsqrt(tf.cast(step, tf.float32))
    arg2 = step * (warmup_steps ** -1.5)
    arg1 = tf.cast(arg1, tf.float32)
    arg2 = tf.cast(arg2, tf.float32)
    lr = tf.math.rsqrt(tf.cast(d_model, tf.float32)) * tf.math.minimum(
        arg1, arg2
    )
    if max_lr is not None:
        max_lr = tf.cast(max_lr, tf.float32)
        return tf.math.minimum(max_lr, lr)
    return lr

def learning_rate_scheduler(global_step):

    return transformer_schedule(
        tf.cast(global_step, tf.float32),
        config['dmodel'],
        **parameters['lr_policy_params'],
    )

def augment_room(y, scale=1.0):
    corners = np.array(
        [[0, 0], [0, 5 * scale], [3 * scale, 5 * scale], [3 * scale, 0]]
    ).T
    room = pra.Room.from_corners(
        corners,
        fs=sr,
        materials=pra.Material(0.2, 0.15),
        ray_tracing=True,
        air_absorption=True,
    )
    room.extrude(3.5, materials=pra.Material(0.2, 0.15))
    room.set_ray_tracing(
        receiver_radius=0.5, n_rays=1000, energy_thres=1e-5
    )
    room.add_source([1.5 * scale, 4 * scale, 0.5], signal=y)
    R = np.array([[1.5 * scale], [0.5 * scale], [0.5]])
    room.add_microphone(R)
    room.simulate()
    return room.mic_array.signals[0]

def random_amplitude_threshold(sample, low=1, high=2, threshold=0.4):
    y_aug = sample.copy()
    dyn_change = np.random.uniform(low=low, high=high)
    y_aug[np.abs(y_aug) >= threshold] = (
        y_aug[np.abs(y_aug) >= threshold] * dyn_change
    )
    return np.clip(y_aug, -1, 1)

def add_uniform_noise(
    sample, power=0.01, return_noise=False, scale=False
):
    y_noise = sample.copy()
    noise_amp = power * np.random.uniform() * np.amax(y_noise)
    noise = noise_amp * np.random.normal(size=y_noise.shape[0])
    y_noise = y_noise + noise
    if scale:
        y_noise = y_noise / (np.max(np.abs(y_noise)) + 1e-9)
    if return_noise:
        if scale:
            noise = noise / (np.max(np.abs(y_noise)) + 1e-9)
        return y_noise, noise
    else:
        return y_noise

def mel_augmentation(features):

    features = mask_augmentation.warp_time_pil(features)
    features = mask_augmentation.mask_frequency(features, width_freq_mask=12)
    features = mask_augmentation.mask_time(
        features, width_time_mask=int(features.shape[0] * 0.05)
    )
    return features

def calc(signal, add_uniform=True):
    choice = random.randint(0, 10)
    print('choice', choice)
    if choice == 0:
        x = augmentation.sox_augment_high(
            signal,
            min_bass_gain=random.randint(25, 50),
            reverberance=random.randint(0, 80),
            hf_damping=10,
            room_scale=random.randint(0, 50),
            negate=1,
        )
    if choice == 1:
        x = augmentation.sox_augment_high(
            signal,
            min_bass_gain=random.randint(25, 70),
            reverberance=random.randint(0, 80),
            hf_damping=10,
            room_scale=random.randint(0, 50),
            negate=0,
        )
    if choice == 2:
        x = augmentation.sox_augment_low(
            signal,
            min_bass_gain=random.randint(5, 30),
            reverberance=random.randint(0, 80),
            hf_damping=10,
            room_scale=random.randint(0, 50),
            negate=random.randint(0, 1),
        )
    if choice == 3:
        x = augmentation.sox_augment_combine(
            signal,
            min_bass_gain_high=random.randint(25, 70),
            min_bass_gain_low=random.randint(5, 30),
            reverberance=random.randint(0, 80),
            hf_damping=10,
            room_scale=random.randint(0, 90),
        )
    if choice == 4:
        x = augmentation.sox_reverb(
            signal,
            reverberance=random.randint(10, 80),
            hf_damping=10,
            room_scale=random.randint(10, 90),
        )
    if choice == 5:
        x = random_amplitude_threshold(
            signal, threshold=random.uniform(0.35, 0.8)
        )
    if choice == 6:
        x = augmentation.lowpass_filter(
            signal, sr=sr, cutoff=random.randint(200, 551)
        )
    if choice == 7:
        x = augmentation.highpass_filter(
            signal, sr=sr, cutoff=random.randint(551, 1653)
        )
    if choice == 8:
        x = augmentation.bandpass_filter(
            signal,
            sr=sr,
            cutoff_low=random.randint(200, 551),
            cutoff_high=random.randint(551, 1653),
        )
    if choice == 9:
        x = augment_room(signal)
    if choice == 10:
        x = signal

    if choice not in [5] and random.gauss(0.5, 0.14) > 0.6:
        x = random_amplitude_threshold(
            x, low=1.0, high=2.0, threshold=random.uniform(0.6, 0.9)
        )

    if random.gauss(0.5, 0.14) > 0.6 and add_uniform:
        x = add_uniform_noise(x, power=random.uniform(0.005, 0.015))

    return x

def mel_augmentation(features):

    features = mask_augmentation.warp_time_pil(features)
    features = mask_augmentation.mask_frequency(features, width_freq_mask=12)
    features = mask_augmentation.mask_time(
        features, width_time_mask=int(features.shape[0] * 0.05)
    )
    return features

def mp3_to_wav(file, sr=sr):
    audio = AudioSegment.from_file(file)
    audio = audio.set_frame_rate(sr).set_channels(1)
    sample = np.array(audio.get_array_of_samples())
    return malaya_speech.astype.int_to_float(sample), sr

def generate(file):
    with open(file) as fopen:
        dataset = json.load(fopen)
    audios, cleaned_texts = dataset['X'], dataset['Y']
    while True:
        audios, cleaned_texts = shuffle(audios, cleaned_texts)
        for i in range(len(audios)):
            try:
                if audios[i].endswith('.mp3'):
                    # print('found mp3', audios[i])
                    wav_data, _ = mp3_to_wav(audios[i])
                else:
                    wav_data, _ = malaya_speech.load(audios[i], sr=sr)

                if (len(wav_data) / sr) > maxlen:
                    # print(f'skipped audio too long {audios[i]}')
                    continue

                if len(cleaned_texts[i]) < minlen_text:
                    # print(f'skipped text too short {audios[i]}')
                    continue

                t = malaya_speech.subword.encode(
                    subwords, cleaned_texts[i], add_blank=False
                )

                if random.random() > prob_aug:
                    wav_data = calc(wav_data)

                back = np.zeros(shape=(2000,))
                front = np.zeros(shape=(200,))
                wav_data = np.concatenate([front, wav_data, back], axis=-1)

                yield {
                    'waveforms': wav_data,
                    'targets': t,
                    'targets_length': [len(t)],
                }
            except Exception as e:
                print(e)

def preprocess_inputs(example):
    s = featurizer.vectorize(example['waveforms'])
    s = tf.reshape(s, (-1, n_mels))
    s = tf.compat.v1.numpy_function(mel_augmentation, [s], tf.float32)
    mel_fbanks = tf.reshape(s, (-1, n_mels))
    length = tf.cast(tf.shape(mel_fbanks)[0], tf.int32)
    length = tf.expand_dims(length, 0)
    example['inputs'] = mel_fbanks
    example['inputs_length'] = length
    example.pop('waveforms', None)
    return example

def get_dataset(
    file,
    batch_size=16,
    shuffle_size=16,
    thread_count=24,
    maxlen_feature=1800,
):
    def get():
        dataset = tf.data.Dataset.from_generator(
            generate,
            {
                'waveforms': tf.float32,
                'targets': tf.int32,
                'targets_length': tf.int32,
            },
            output_shapes={
                'waveforms': tf.TensorShape([None]),
                'targets': tf.TensorShape([None]),
                'targets_length': tf.TensorShape([None]),
            },
            args=(file,),
        )
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        dataset = dataset.map(
            preprocess_inputs, num_parallel_calls=thread_count
        )
        dataset = dataset.padded_batch(
            batch_size,
            padded_shapes={
                'inputs': tf.TensorShape([None, n_mels]),
                'inputs_length': tf.TensorShape([None]),
                'targets': tf.TensorShape([None]),
                'targets_length': tf.TensorShape([None]),
            },
            padding_values={
                'inputs': tf.constant(0, dtype=tf.float32),
                'inputs_length': tf.constant(0, dtype=tf.int32),
                'targets': tf.constant(0, dtype=tf.int32),
                'targets_length': tf.constant(0, dtype=tf.int32),
            },
        )
        return dataset

    return get

def model_fn(features, labels, mode, params):
    conformer_model = conformer.Model(
        kernel_regularizer=None, bias_regularizer=None, **config
    )
    decoder_config = malaya_speech.config.conformer_base_decoder_config
    transducer_model = transducer.rnn.Model(
        conformer_model, vocabulary_size=subwords.vocab_size, **decoder_config
    )
    targets_length = features['targets_length'][:, 0]
    v = tf.expand_dims(features['inputs'], -1)
    z = tf.zeros((tf.shape(features['targets'])[0], 1), dtype=tf.int32)
    c = tf.concat([z, features['targets']], axis=1)

    logits = transducer_model([v, c, targets_length + 1], training=True)

    cost = transducer.loss.rnnt_loss(
        logits=logits,
        labels=features['targets'],
        label_length=targets_length,
        logit_length=features['inputs_length'][:, 0]
        // conformer_model.conv_subsampling.time_reduction_factor,
    )
    mean_error = tf.reduce_mean(cost)

    loss = mean_error

    tf.identity(loss, 'train_loss')

    variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    variables = [v for v in variables if 'transducer_prediction' in v.name]
    init_checkpoint = 'output-large-singlish-conformer_copy/model.ckpt'

    assignment_map, initialized_variable_names = train.get_assignment_map_from_checkpoint(
        variables, init_checkpoint
    )

    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = train.optimizer.optimize_loss(
            loss,
            tf.train.AdamOptimizer,
            parameters['optimizer_params'],
            learning_rate_scheduler,
            summaries=['learning_rate', 'loss_scale'],
            larc_params=parameters.get('larc_params', None),
            loss_scaling=parameters.get('loss_scaling', 1.0),
            loss_scaling_params=parameters.get('loss_scaling_params', None),
        )
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op
        )

    elif mode == tf.estimator.ModeKeys.EVAL:

        estimator_spec = tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL, loss=loss
        )

    return estimator_spec

# Load base dataset
with open('singlish-test_2.json') as fopen:
    sin_tfrecord = json.load(fopen)

train_dataset = get_dataset('singlish-test_2.json')
test_dataset = get_dataset('singlish-test_2.json')

train_hooks = [tf.train.LoggingTensorHook(['train_loss'], every_n_iter=1)]

train.run_training(
    train_fn=train_dataset,
    model_fn=model_fn,
    model_dir='asr-large-conformer-transducer-singlish',
    num_gpus=1,
    log_step=1,
    save_checkpoint_step=25000,
    max_steps=1000_000,
    eval_fn=test_dataset,
    train_hooks=train_hooks,
)

The training result is shown as below:

INFO:tensorflow:Using config: {'_model_dir': 'asr-large-conformer-transducer-singlish', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 25000, '_save_checkpoints_secs': None, '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.95
}
allow_soft_placement: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 1, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps 25000 or save_checkpoints_secs None.
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/training/training_util.py:397: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
WARNING:tensorflow:From <ipython-input-19-10f4d5821225>:21: calling DatasetV1.from_generator (from tensorflow.python.data.ops.dataset_ops) with output_types is deprecated and will be removed in a future version.
Instructions for updating:
Use output_signature instead
WARNING:tensorflow:From <ipython-input-19-10f4d5821225>:21: calling DatasetV1.from_generator (from tensorflow.python.data.ops.dataset_ops) with output_shapes is deprecated and will be removed in a future version.
Instructions for updating:
Use output_signature instead
INFO:tensorflow:Calling model_fn.
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/keras/layers/normalization/batch_normalization.py:532: _colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/malaya_speech/train/model/transducer/loss.py:35: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_embedding/embeddings:0, shape = (1019, 640), *INIT_FROM_CKPT*
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_lstm_0/lstm_cell/kernel:0, shape = (640, 2560)
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_lstm_0/lstm_cell/recurrent_kernel:0, shape = (640, 2560)
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_lstm_0/lstm_cell/bias:0, shape = (2560,)
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_ln_0/gamma:0, shape = (640,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_ln_0/beta:0, shape = (640,), *INIT_FROM_CKPT*
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from asr-large-conformer-transducer-singlish/model.ckpt-0
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/training/saver.py:1161: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into asr-large-conformer-transducer-singlish/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
choice 6
choice 8
choice 2
No module named 'pysndfx'
choice 2
No module named 'pysndfx'
INFO:tensorflow:train_loss = 1025.2727
INFO:tensorflow:loss = 1025.2727, step = 1
INFO:tensorflow:global_step/sec: 0.0154607
INFO:tensorflow:train_loss = 854.6052 (64.682 sec)
INFO:tensorflow:loss = 854.6052, step = 2 (64.682 sec)
choice 0
No module named 'pysndfx'
choice 8
INFO:tensorflow:global_step/sec: 0.0212397
INFO:tensorflow:train_loss = 737.00574 (47.081 sec)
INFO:tensorflow:loss = 737.00574, step = 3 (47.083 sec)
choice 4
No module named 'pysndfx'
choice 3
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0238458
INFO:tensorflow:train_loss = 910.0622 (41.936 sec)
INFO:tensorflow:loss = 910.0622, step = 4 (41.933 sec)
choice 9
choice 4
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0209725
INFO:tensorflow:train_loss = 875.7412 (47.682 sec)
INFO:tensorflow:loss = 875.7412, step = 5 (47.685 sec)
choice 1
No module named 'pysndfx'
choice 5
INFO:tensorflow:global_step/sec: 0.0223635
INFO:tensorflow:train_loss = 874.2888 (44.715 sec)
INFO:tensorflow:loss = 874.2888, step = 6 (44.717 sec)
choice 4
No module named 'pysndfx'
choice 1
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0176262
INFO:tensorflow:train_loss = 1019.8328 (56.734 sec)
INFO:tensorflow:loss = 1019.8328, step = 7 (56.736 sec)
choice 9
choice 8
INFO:tensorflow:global_step/sec: 0.020229
INFO:tensorflow:train_loss = 905.1028 (49.435 sec)
INFO:tensorflow:loss = 905.1028, step = 8 (49.429 sec)
choice 0
No module named 'pysndfx'
choice 10
choice 9
INFO:tensorflow:global_step/sec: 0.0197388
INFO:tensorflow:train_loss = 980.0521 (50.661 sec)
INFO:tensorflow:loss = 980.0521, step = 9 (50.665 sec)
choice 2
No module named 'pysndfx'
choice 10
choice 4
No module named 'pysndfx'
choice 10
INFO:tensorflow:global_step/sec: 0.0193762
INFO:tensorflow:train_loss = 987.4786 (51.610 sec)
INFO:tensorflow:loss = 987.4786, step = 10 (51.608 sec)
choice 2
No module named 'pysndfx'
choice 1
No module named 'pysndfx'
choice 10
choice 3
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0219186
INFO:tensorflow:train_loss = 843.2676 (45.623 sec)
INFO:tensorflow:loss = 843.2676, step = 11 (45.622 sec)
choice 0
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0190502
INFO:tensorflow:train_loss = 880.9071 (52.493 sec)
INFO:tensorflow:loss = 880.9071, step = 12 (52.495 sec)
choice 4
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0225951
INFO:tensorflow:train_loss = 802.8706 (44.257 sec)
INFO:tensorflow:loss = 802.8706, step = 13 (44.257 sec)
choice 3
No module named 'pysndfx'
choice 1
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.024245
INFO:tensorflow:train_loss = 720.2106 (41.246 sec)
INFO:tensorflow:loss = 720.2106, step = 14 (41.246 sec)
choice 5
INFO:tensorflow:global_step/sec: 0.0212407
INFO:tensorflow:train_loss = 919.8423 (47.079 sec)
INFO:tensorflow:loss = 919.8423, step = 15 (47.078 sec)
choice 6
choice 4
No module named 'pysndfx'
choice 6
choice 9
INFO:tensorflow:global_step/sec: 0.021107
INFO:tensorflow:train_loss = 951.9454 (47.378 sec)
INFO:tensorflow:loss = 951.9454, step = 16 (47.381 sec)
choice 6
INFO:tensorflow:global_step/sec: 0.0215547
INFO:tensorflow:train_loss = 792.8279 (46.394 sec)
INFO:tensorflow:loss = 792.8279, step = 17 (46.393 sec)
INFO:tensorflow:global_step/sec: 0.0186696
INFO:tensorflow:train_loss = 915.7853 (53.563 sec)
INFO:tensorflow:loss = 915.7853, step = 18 (53.559 sec)
choice 7
choice 8
choice 3
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0206074
INFO:tensorflow:train_loss = 935.46094 (48.527 sec)
INFO:tensorflow:loss = 935.46094, step = 19 (48.530 sec)
choice 9
choice 1
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0200372
INFO:tensorflow:train_loss = 921.3678 (49.907 sec)
INFO:tensorflow:loss = 921.3678, step = 20 (49.908 sec)
choice 10
choice 2
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.020339
INFO:tensorflow:train_loss = 855.1709 (49.167 sec)
INFO:tensorflow:loss = 855.1709, step = 21 (49.165 sec)
choice 3
No module named 'pysndfx'
choice 6
INFO:tensorflow:global_step/sec: 0.0214937
INFO:tensorflow:train_loss = 898.01135 (46.525 sec)
INFO:tensorflow:loss = 898.01135, step = 22 (46.522 sec)
choice 1
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0218185
INFO:tensorflow:train_loss = 854.6405 (45.833 sec)
INFO:tensorflow:loss = 854.6405, step = 23 (45.837 sec)
choice 9
INFO:tensorflow:global_step/sec: 0.0210143
INFO:tensorflow:train_loss = 1019.7269 (47.588 sec)
INFO:tensorflow:loss = 1019.7269, step = 24 (47.585 sec)
INFO:tensorflow:global_step/sec: 0.0242729
INFO:tensorflow:train_loss = 891.76697 (41.196 sec)
INFO:tensorflow:loss = 891.76697, step = 25 (41.198 sec)
choice 8
choice 2
No module named 'pysndfx'
choice 4
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0181685
INFO:tensorflow:train_loss = 807.25195 (55.042 sec)
INFO:tensorflow:loss = 807.25195, step = 26 (55.039 sec)
choice 1
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0217204
INFO:tensorflow:train_loss = 870.581 (46.042 sec)
INFO:tensorflow:loss = 870.581, step = 27 (46.042 sec)
choice 10
INFO:tensorflow:global_step/sec: 0.0189147
INFO:tensorflow:train_loss = 881.00256 (52.866 sec)
INFO:tensorflow:loss = 881.00256, step = 28 (52.869 sec)
choice 8
choice 2
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0204878
INFO:tensorflow:train_loss = 936.9674 (48.810 sec)
INFO:tensorflow:loss = 936.9674, step = 29 (48.807 sec)
choice 8
choice 1
No module named 'pysndfx'
choice 7
INFO:tensorflow:global_step/sec: 0.0181004
INFO:tensorflow:train_loss = 1024.6841 (55.247 sec)
INFO:tensorflow:loss = 1024.6841, step = 30 (55.249 sec)
choice 3
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0232102
INFO:tensorflow:train_loss = 866.8332 (43.085 sec)
INFO:tensorflow:loss = 866.8332, step = 31 (43.085 sec)
choice 0
No module named 'pysndfx'
choice 5
choice 8
choice 5
INFO:tensorflow:global_step/sec: 0.01788
INFO:tensorflow:train_loss = 943.23486 (55.928 sec)
INFO:tensorflow:loss = 943.23486, step = 32 (55.929 sec)
choice 2
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0183138
INFO:tensorflow:train_loss = 793.621 (54.604 sec)
INFO:tensorflow:loss = 793.621, step = 33 (54.605 sec)
choice 10
choice 10
choice 0
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.019673
INFO:tensorflow:train_loss = 868.1287 (50.831 sec)
INFO:tensorflow:loss = 868.1287, step = 34 (50.827 sec)
INFO:tensorflow:global_step/sec: 0.0185951
INFO:tensorflow:train_loss = 875.26556 (53.778 sec)
INFO:tensorflow:loss = 875.26556, step = 35 (53.780 sec)
choice 7
choice 2
No module named 'pysndfx'
choice 6
INFO:tensorflow:global_step/sec: 0.0197983
INFO:tensorflow:train_loss = 838.0266 (50.509 sec)
INFO:tensorflow:loss = 838.0266, step = 36 (50.512 sec)
INFO:tensorflow:global_step/sec: 0.0196267
INFO:tensorflow:train_loss = 893.5574 (50.951 sec)
INFO:tensorflow:loss = 893.5574, step = 37 (50.947 sec)
choice 2
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0207548
INFO:tensorflow:train_loss = 837.308 (48.182 sec)
INFO:tensorflow:loss = 837.308, step = 38 (48.183 sec)
choice 7
INFO:tensorflow:global_step/sec: 0.0190335
INFO:tensorflow:train_loss = 925.58014 (52.539 sec)
INFO:tensorflow:loss = 925.58014, step = 39 (52.536 sec)
choice 2
No module named 'pysndfx'
choice 9
INFO:tensorflow:global_step/sec: 0.017417
INFO:tensorflow:train_loss = 810.249 (57.415 sec)
INFO:tensorflow:loss = 810.249, step = 40 (57.415 sec)
choice 6
choice 2
No module named 'pysndfx'
choice 2
No module named 'pysndfx'
choice 4
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0201838
INFO:tensorflow:train_loss = 872.2427 (49.545 sec)
INFO:tensorflow:loss = 872.2427, step = 41 (49.547 sec)
choice 1
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0180936
INFO:tensorflow:train_loss = 856.919 (55.269 sec)
INFO:tensorflow:loss = 856.919, step = 42 (55.269 sec)
INFO:tensorflow:global_step/sec: 0.0230909
INFO:tensorflow:train_loss = 917.8918 (43.306 sec)
INFO:tensorflow:loss = 917.8918, step = 43 (43.307 sec)
choice 9
choice 6
choice 3
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0200033
INFO:tensorflow:train_loss = 795.3937 (49.997 sec)
INFO:tensorflow:loss = 795.3937, step = 44 (49.992 sec)
choice 6
choice 0
No module named 'pysndfx'
INFO:tensorflow:global_step/sec: 0.0223676
INFO:tensorflow:train_loss = 946.4989 (44.703 sec)
INFO:tensorflow:loss = 946.4989, step = 45 (44.704 sec)
choice 10
choice 9
INFO:tensorflow:global_step/sec: 0.0236564
INFO:tensorflow:train_loss = 837.5518 (42.271 sec)
INFO:tensorflow:loss = 837.5518, step = 46 (42.270 sec)
choice 6
INFO:tensorflow:global_step/sec: 0.0223008
INFO:tensorflow:train_loss = 861.3071 (44.842 sec)
INFO:tensorflow:loss = 861.3071, step = 47 (44.847 sec)
choice 7
INFO:tensorflow:global_step/sec: 0.0205951
INFO:tensorflow:train_loss = 845.28186 (48.555 sec)
INFO:tensorflow:loss = 845.28186, step = 48 (48.556 sec)
choice 5
choice 8
choice 4
No module named 'pysndfx'
choice 8

Since the loss is very high, I also tried to load from <output-large-singlish-conformer/model.ckpt> and the result is something like below:

ketonlhomndtai�ggachapponluncre relrelseruach�mitity�re Tbeiggrt 4baaga��ketngeworuonablng �enginf�kinif ougpasessbef�se nkschist�xpt�beilan�cesaroman�ge manencng fambo�mad�eraserxx�eng�appew�achkelot�uclamos��rt sposchbaspo�bef}ptlimeas�TpenAnicn penngen �befr tioaro�enghaa neehis�rrel�bef#casxtoschappoffdifnrenbe schagabeicesschgrativaro�ys�vel�ily�homtu
�risunteraeiGest�da�houwhoairfoug g oolwol beiorans�n .upg gyinf>�Cbeiso toutouboillhim�zeon bef�uralivr foctans�ciengE�catact�glbrawhonices ys g taiaemkecikinkin�anscomsetfivnio8essfeer �achguxestmatschxactDbigDneessappflogra�ggtakgenmakughilyilytru���
CPU times: user 7.32 s, sys: 531 ms, total: 7.85 s
Wall time: 5.54 s

Sorry for such a long issue. Appreciate it if you can give it a look. Thanks

huseinzol05 commented 2 years ago

First,

INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_embedding/embeddings:0, shape = (1019, 640), *INIT_FROM_CKPT*
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_lstm_0/lstm_cell/kernel:0, shape = (640, 2560)
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_lstm_0/lstm_cell/recurrent_kernel:0, shape = (640, 2560)
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_lstm_0/lstm_cell/bias:0, shape = (2560,)
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_ln_0/gamma:0, shape = (640,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_ln_0/beta:0, shape = (640,), *INIT_FROM_CKPT*

This only load language model parameters, to load entire parameters, simply remove 2nd line,

variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
init_checkpoint = 'output-large-singlish-conformer_copy/model.ckpt'

I validated the checkpoint using colab, https://colab.research.google.com/drive/1IP_GtVUAIJVDTv5C60RM9tKjdzHSO3zI?usp=sharing

mr-coconut commented 2 years ago

Thanks for the reply! I will look through this.

mr-coconut commented 2 years ago

First,

INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_embedding/embeddings:0, shape = (1019, 640), *INIT_FROM_CKPT*
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_lstm_0/lstm_cell/kernel:0, shape = (640, 2560)
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_lstm_0/lstm_cell/recurrent_kernel:0, shape = (640, 2560)
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_lstm_0/lstm_cell/bias:0, shape = (2560,)
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_ln_0/gamma:0, shape = (640,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = transducer/transducer_prediction/transducer_prediction_ln_0/beta:0, shape = (640,), *INIT_FROM_CKPT*

This only load language model parameters, to load entire parameters, simply remove 2nd line,

variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
init_checkpoint = 'output-large-singlish-conformer_copy/model.ckpt'

I validated the checkpoint using colab, https://colab.research.google.com/drive/1IP_GtVUAIJVDTv5C60RM9tKjdzHSO3zI?usp=sharing

Hi, Thanks for your reply again. But I get confused in "This only load language model parameters" part. I copied these lines from https://github.com/huseinzol05/malaya-speech/blob/master/pretrained-model/stt/conformer/base.py#L340 to try to load from a previous checkpoint. I think it should make sense just to retrain the language model parameters using new data while keeping other parameters unchanged. I am still thinking about why the loss is around 800, which shouldn't be the case since I used your pretrained checkpoint. One possible explanation is that I didn't load checkpoint successfully. Thanks!

huseinzol05 commented 2 years ago

you might want to compare the accuracy between finetune entire model vs finetune LM only. Did you compiled warp-transducer? you need to test warp-transducer result first using provided unit tests, https://github.com/huseinzol05/malaya-speech/blob/master/scripts/build-rnnt-3090.sh