andabi / deep-voice-conversion

Deep neural networks for voice conversion (voice style transfer) in Tensorflow
MIT License

How to view the result of the conversion? #24

Open · AricJames opened this issue 6 years ago

AricJames commented 6 years ago

After running convert.py, how can I find the result? How do I use TensorBoard? Thank you for your reply.

AricJames commented 6 years ago

@andabi Can you help me? Thank you very much.

JiaYK commented 6 years ago

You can use this to replace convert.py and you will get test.wav; that is the result. (The script is reposted with proper formatting in the next comment.)

0i0 commented 6 years ago

nice and tidy now

#!/usr/bin/python2
# -*- coding: utf-8 -*-

from __future__ import print_function

import argparse

from data_load import get_wav_batch
from models import Model
import numpy as np
from utils import spectrogram2wav, inv_preemphasis
from hparams import logdir_path
import datetime
import tensorflow as tf
from hparams import Default as hp_default
import hparams as hp

import struct
import wave

def out_range(num):
    # clamp a sample to a safe signed 16-bit range
    if num > 30000:
        return 30000
    elif num < -30000:
        return -30000
    else:
        return num

def out_wav(result, file_name):
    n_frames = len(result)
    outfile = file_name
    out_wave = wave.open(outfile, 'wb')
    comp_type = "NONE"
    comp_name = "not compressed"
    # mono, 16-bit samples, hard-coded 16 kHz sample rate
    out_wave.setparams((1, 2, 16000, n_frames, comp_type, comp_name))
    MAX_M = np.max(np.abs(result))  # note: computed but never used

    for v in result:
        # scale to the 16-bit range and clamp; int() keeps struct.pack happy
        out_wave.writeframes(struct.pack('h', int(out_range(v * 10000))))
    out_wave.close()
    print('saved ' + outfile)

def convert(logdir='logdir/default/train2', queue=False):
    # Load graph
    model = Model(mode="convert", batch_size=hp.Convert.batch_size, queue=queue)

    session_conf = tf.ConfigProto(
        allow_soft_placement=True,
        device_count={'CPU': 1, 'GPU': 0},
        gpu_options=tf.GPUOptions(
            allow_growth=True,
            per_process_gpu_memory_fraction=0.9
        ),
    )
    with tf.Session(config=session_conf) as sess:
        # Load trained model
        sess.run(tf.global_variables_initializer())
        model.load(sess, 'convert', logdir=logdir)

        writer = tf.summary.FileWriter(logdir, sess.graph)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        gs = Model.get_global_step(logdir)

        if queue:
            pred_log_specs, y_log_spec, ppgs = sess.run([model(), model.y_spec, model.ppgs])
        else:
            mfcc, spec, mel = get_wav_batch(model.mode, model.batch_size)
            pred_log_specs, y_log_spec, ppgs = sess.run([model(), model.y_spec, model.ppgs], feed_dict={model.x_mfcc: mfcc, model.y_spec: spec, model.y_mel: mel})

        # Denormalization
        # pred_log_specs = hp.mean_log_spec + hp.std_log_spec * pred_log_specs
        # y_log_spec = hp.mean_log_spec + hp.std_log_spec * y_log_spec
        # pred_log_specs = hp.min_log_spec + (hp.max_log_spec - hp.min_log_spec) * pred_log_specs
        # y_log_spec = hp.min_log_spec + (hp.max_log_spec - hp.min_log_spec) * y_log_spec

        # Convert log of magnitude to magnitude
        pred_specs, y_specs = np.e ** pred_log_specs, np.e ** y_log_spec

        # Emphasize the magnitude
        pred_specs = np.power(pred_specs, hp.Convert.emphasis_magnitude)
        y_specs = np.power(y_specs, hp.Convert.emphasis_magnitude)

        # Spectrogram to waveform via iterative reconstruction
        # (list() around map keeps this working on Python 3 as well)
        audio = np.array(list(map(lambda spec: spectrogram2wav(spec.T, hp_default.n_fft, hp_default.win_length, hp_default.hop_length, hp_default.n_iter), pred_specs)))
        y_audio = np.array(list(map(lambda spec: spectrogram2wav(spec.T, hp_default.n_fft, hp_default.win_length, hp_default.hop_length, hp_default.n_iter), y_specs)))

        # Apply inverse pre-emphasis
        audio = inv_preemphasis(audio, coeff=hp_default.preemphasis)
        y_audio = inv_preemphasis(y_audio, coeff=hp_default.preemphasis)

        if not queue:
            # Concatenate to a wav
            y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
            audio = np.reshape(audio, (1, audio.size), order='C')
            out_wav(audio[0], 'test.wav')
            out_wav(y_audio[0], 'test_y.wav')

        # Write the result
        tf.summary.audio('A', y_audio, hp_default.sr, max_outputs=hp.Convert.batch_size)
        tf.summary.audio('B', audio, hp_default.sr, max_outputs=hp.Convert.batch_size)

        # Visualize PPGs
        heatmap = np.expand_dims(ppgs, 3)  # channel=1
        tf.summary.image('PPG', heatmap, max_outputs=ppgs.shape[0])

        writer.add_summary(sess.run(tf.summary.merge_all()), global_step=gs)
        writer.close()

        coord.request_stop()
        coord.join(threads)

def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('case', type=str, help='experiment case name')
    arguments = parser.parse_args()
    return arguments

if __name__ == '__main__':
    args = get_arguments()
    case = args.case
    logdir = '{}/{}/train2'.format(logdir_path, case)

    print('case: {}, logdir: {}'.format(case, logdir))

    s = datetime.datetime.now()

    convert(logdir=logdir)

    e = datetime.datetime.now()
    diff = e - s
    print("Done. elapsed time:{}s".format(diff.seconds))

sailor88128 commented 6 years ago

"get_wav_batch" seems note in data_load.py, anyone run this successfully?

flyuuo9 commented 5 years ago

I have the same question. I ran python convert.py, and no wav file was generated.

ArtemisZGL commented 5 years ago

It seems that you can use TensorBoard and listen to the result under the Audio tab.
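
For anyone unsure of the exact command: assuming the directory layout used in the script above (logdir_path/<case>/train2, where <case> is your experiment name), something like the following should serve the summaries; then open http://localhost:6006 and look under the Audio tab for the converted clips:

tensorboard --logdir logdir/<case>/train2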

meharbhatia commented 4 years ago

Can someone please explain how to use TensorBoard to listen to the final wav file?