bastibe / python-soundfile

SoundFile is an audio library based on libsndfile, CFFI, and NumPy
BSD 3-Clause "New" or "Revised" License

soundfile.LibsndfileError: <exception str() failed> #428

Open nirmala-dewi opened 3 months ago

nirmala-dewi commented 3 months ago

I use this code on Windows:

import json
import logging
from pathlib import Path

import hydra
import numpy as np
import pytorch_lightning as pl
import torch
import torchaudio
from omegaconf import DictConfig
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader

from clarity.dataset.cec1_dataset import CEC1Dataset
from clarity.engine.losses import SNRLoss, STOILevelLoss
from clarity.engine.system import System
from clarity.enhancer.dnn.mc_conv_tasnet import ConvTasNet
from clarity.enhancer.dsp.filter import AudiometricFIR
from clarity.predictor.torch_msbg import MSBGHearingModel

logger = logging.getLogger(__name__)

class DenModule(System):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ear_idx = None
        self.down_sample = None

    def common_step(self, batch, batch_nb, train=True):
        if self.down_sample is None:
            raise RuntimeError("Hearing model not loaded")
        proc, ref = batch
        ref = ref[:, self.ear_idx, :]
        if self.config.downsample_factor != 1:
            proc = self.down_sample(proc)
            ref = self.down_sample(ref)
        enhanced = self.model(proc).squeeze(1)
        loss = self.loss_func(enhanced, ref)
        return loss

class AmpModule(System):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hl_ear = None
        self.nh_ear = None
        self.down_sample = None
        self.up_sample = None
        self.ear_idx = None
        self.den_model = None

    def common_step(self, batch, batch_nb, train=True):
        if (
            self.hl_ear is None
            or self.nh_ear is None
            or self.down_sample is None
            or self.up_sample is None
            or self.den_model is None
        ):
            raise RuntimeError("Hearing model not loaded")
        proc, ref = batch
        ref = ref[:, self.ear_idx, :]
        if self.config.downsample_factor != 1:
            proc = self.down_sample(proc)
            ref = self.down_sample(ref)
        enhanced = self.model(self.den_model(proc)).squeeze(1)

        if self.config.downsample_factor != 1:
            enhanced = torch.clamp(self.up_sample(enhanced), -1, 1)
            ref = torch.clamp(self.up_sample(ref), -1, 1)

        sim_ref = self.nh_ear(ref)
        sim_enhanced = self.hl_ear(enhanced)
        loss = self.loss_func(sim_enhanced, sim_ref)
        return loss

def train_den(cfg, ear):
    exp_dir = Path(cfg.path.exp_folder) / f"{ear}_den"
    if (exp_dir / "best_model.pth").exists():
        logger.info("Enhancement module exist")
        return

    train_set = CEC1Dataset(**cfg.train_dataset)
    train_loader = DataLoader(dataset=train_set, **cfg.train_loader)
    dev_set = CEC1Dataset(**cfg.dev_dataset)
    dev_loader = DataLoader(dataset=dev_set, **cfg.dev_loader)

    den_model = ConvTasNet(**cfg.mc_conv_tasnet)
    optimizer = torch.optim.Adam(
        params=den_model.parameters(), **cfg.den_trainer.optimizer
    )
    loss_func = SNRLoss()

    den_module = DenModule(
        model=den_model,
        loss_func=loss_func,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=dev_loader,
        config=cfg,
    )
    den_module.ear_idx = 0 if ear == "left" else 1
    if cfg.downsample_factor != 1:
        den_module.down_sample = torchaudio.transforms.Resample(
            orig_freq=cfg.sample_rate,
            new_freq=cfg.sample_rate // cfg.downsample_factor,
            resampling_method="sinc_interp_hann",
        )

    # callbacks
    callbacks = []
    checkpoint_dir = exp_dir / "checkpoints/"
    checkpoint = ModelCheckpoint(
        str(checkpoint_dir), monitor="val_loss", mode="min", save_top_k=5, verbose=True
    )
    callbacks.append(checkpoint)

    # set device
    # gpus = -1 if torch.cuda.is_available() else None
    devices = -1 if torch.cuda.is_available() else 1

    trainer = pl.Trainer(
        max_epochs=cfg.den_trainer.epochs,
        callbacks=callbacks,
        default_root_dir=str(exp_dir),
        devices=devices,
        limit_train_batches=1.0,  # Useful for fast experiment
        gradient_clip_val=cfg.den_trainer.gradient_clip_val,
    )
    trainer.fit(den_module)

    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with (exp_dir / "best_k_models.json").open("w", encoding="utf-8") as fp:
        json.dump(best_k, fp, indent=0)
    state_dict = torch.load(checkpoint.best_model_path)
    den_module.load_state_dict(state_dict=state_dict["state_dict"])
    den_module.cpu()
    torch.save(den_module.model.state_dict(), str(exp_dir / "best_model.pth"))

def train_amp(cfg, ear):
    exp_dir = Path(cfg.path.exp_folder) / f"{ear}_amp"
    Path.mkdir(exp_dir, parents=True, exist_ok=True)
    if (exp_dir / "best_model.pth").exists():
        logger.info("Amplification module exist")
        return

    train_set = CEC1Dataset(**cfg.train_dataset)
    train_loader = DataLoader(dataset=train_set, **cfg.train_loader)
    dev_set = CEC1Dataset(**cfg.dev_dataset)
    dev_loader = DataLoader(dataset=dev_set, **cfg.dev_loader)

    # load denoising module
    den_model = ConvTasNet(**cfg.mc_conv_tasnet)
    den_model_path = exp_dir / ".." / f"{ear}_den/best_model.pth"
    den_model.load_state_dict(torch.load(den_model_path))

    # amplification module
    amp_model = AudiometricFIR(**cfg.fir)
    optimizer = torch.optim.Adam(
        params=amp_model.parameters(), **cfg.amp_trainer.optimizer
    )
    loss_func = STOILevelLoss(**cfg.amp_trainer.stoilevel_loss)

    amp_module = AmpModule(
        model=amp_model,
        loss_func=loss_func,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=dev_loader,
        config=cfg,
    )
    amp_module.ear_idx = 0 if ear == "left" else 1
    amp_module.den_model = den_model
    if cfg.downsample_factor != 1:
        amp_module.down_sample = torchaudio.transforms.Resample(
            orig_freq=cfg.sr,
            new_freq=cfg.sr // cfg.downsample_factor,
            resampling_method="sinc_interp_hann",
        )
        amp_module.up_sample = torchaudio.transforms.Resample(
            orig_freq=cfg.sr // cfg.downsample_factor,
            new_freq=cfg.sr,
            resampling_method="sinc_interp_hann",
        )

    # build normal hearing and hearing loss ears
    with open(cfg.listener.metafile, encoding="utf-8") as fp:
        listeners_file = json.load(fp)
        audiogram_cfs = listeners_file[cfg.listener.id]["audiogram_cfs"]
        audiogram_lvl_l = listeners_file[cfg.listener.id]["audiogram_levels_l"]
        audiogram_lvl_r = listeners_file[cfg.listener.id]["audiogram_levels_r"]
    audiogram = audiogram_lvl_l if ear == "left" else audiogram_lvl_r

    amp_module.nh_ear = MSBGHearingModel(
        audiogram=np.zeros_like(audiogram), audiometric=audiogram_cfs, sr=cfg.sr
    )
    amp_module.hl_ear = MSBGHearingModel(
        audiogram=audiogram, audiometric=audiogram_cfs, sr=cfg.sr
    )

    # callbacks
    callbacks = []
    checkpoint_dir = exp_dir / "checkpoints/"
    checkpoint = ModelCheckpoint(
        str(checkpoint_dir), monitor="val_loss", mode="min", save_top_k=5, verbose=True
    )
    callbacks.append(checkpoint)

    # set device
    # gpus = -1 if torch.cuda.is_available() else None
    devices = -1 if torch.cuda.is_available() else 1

    trainer = pl.Trainer(
        max_epochs=cfg.amp_trainer.epochs,
        callbacks=callbacks,
        default_root_dir=exp_dir,
        devices=devices,
        limit_train_batches=1.0,  # Useful for fast experiment
        gradient_clip_val=cfg.amp_trainer.gradient_clip_val,
        num_sanity_val_steps=cfg.amp_trainer.num_sanity_val_steps,
    )
    trainer.fit(amp_module)

    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with (exp_dir / "best_k_models.json").open("w", encoding="utf-8") as fp:
        json.dump(best_k, fp, indent=0)
    state_dict = torch.load(checkpoint.best_model_path)
    amp_module.load_state_dict(state_dict=state_dict["state_dict"])
    amp_module.cpu()
    torch.save(amp_module.model.state_dict(), str(exp_dir / "best_model.pth"))

@hydra.main(config_path=".", config_name="config")
def run(cfg: DictConfig) -> None:
    logger.info("Begin training left ear enhancement module.")
    train_den(cfg, ear="left")
    logger.info("Begin training right ear enhancement module.")
    train_den(cfg, ear="right")
    logger.info("Begin training left ear amplification module.")
    train_amp(cfg, ear="left")
    logger.info("Begin training right ear amplification module.")
    train_amp(cfg, ear="right")

# pylint: disable=no-value-for-parameter

if __name__ == "__main__":
    run()

And this is the code for cec1_dataset.py:

import json
import logging
from pathlib import Path

import librosa
import numpy as np
import torch
from scipy.signal import firwin, lfilter
from soundfile import read
from torch.utils import data

logger = logging.getLogger(__name__)

def read_wavfile(path):
    wav, _ = read(path)  # soundfile.read returns (data, samplerate)
    return wav.transpose()

class CEC1Dataset(data.Dataset):
    def __init__(
        self,
        scenes_folder,
        scenes_file,
        sample_rate,
        downsample_factor,
        wav_sample_len=None,
        wav_silence_len=2,
        num_channels=6,
        norm=False,
        testing=False,
    ):
        self.scenes_folder = scenes_folder
        self.sample_rate = sample_rate
        self.downsample_factor = downsample_factor
        self.wav_sample_len = wav_sample_len
        self.wav_silence_len = wav_silence_len
        self.num_channels = num_channels
        self.norm = norm
        self.testing = testing

        self.scene_list = []
        with open(scenes_file, encoding="utf-8") as fp:
            scene_json = json.load(fp)
            if not testing:
                for scene in scene_json:
                    self.scene_list.append(scene["scene"])
            else:
                for scene in scene_json.keys():
                    self.scene_list.append(scene)

        if self.num_channels == 2:
            self.mixed_suffix = "_mixed_CH1.wav"
            self.target_suffix = "_target_anechoic.wav"
        elif self.num_channels == 6:
            # self.mixed_suffix = ["_mixed_CH1.wav", "_mixed_CH2.wav", "_mixed_CH3.wav"]
            # self.target_suffix = "_target_anechoic.wav"
            self.mixed_suffix = ["_mix_CH1.wav", "_mix_CH2.wav", "_mix_CH3.wav"]
            self.target_suffix = "_target_anechoic_CH1.wav"
        else:
            raise NotImplementedError

        self.lowpass_filter = firwin(
            1025,
            self.sample_rate // (2 * self.downsample_factor),
            pass_zero="lowpass",
            fs=self.sample_rate,
        )

    def wav_sample(self, x, y):
        """
        There is a 2-second silence at the beginning of the Clarity data.
        Remove that silence and sample a constant wav length for training.
        """
        silence_len = int(self.wav_silence_len * self.sample_rate)
        x = x[:, silence_len:]
        y = y[:, silence_len:]

        wav_len = x.shape[1]
        sample_len = int(self.wav_sample_len * self.sample_rate)
        if wav_len > sample_len:
            start = np.random.randint(wav_len - sample_len)
            end = start + sample_len
            x = x[:, start:end]
            y = y[:, start:end]
        elif wav_len < sample_len:
            # pad along the time axis (axis 1); the channel count is shape[0]
            x = np.append(
                x,
                np.zeros([x.shape[0], sample_len - wav_len], dtype=np.float32),
                axis=1,
            )
            y = np.append(
                y,
                np.zeros([y.shape[0], sample_len - wav_len], dtype=np.float32),
                axis=1,
            )

        return x, y

    def lowpass_filtering(self, x):
        return lfilter(self.lowpass_filter, 1, x)

    def __getitem__(self, item):
        scenes_folder = Path(self.scenes_folder)
        if self.num_channels == 2:
            mixed = read_wavfile(
                scenes_folder / (self.scene_list[item] + self.mixed_suffix)
            )
        elif self.num_channels == 6:
            mixed = []
            for suffix in self.mixed_suffix:
                mixed.append(
                    read_wavfile(scenes_folder / (self.scene_list[item] + suffix))
                )
            mixed = np.concatenate(mixed, axis=0)
        else:
            raise NotImplementedError
        target = None
        if not self.testing:
            target = read_wavfile(
                scenes_folder / (self.scene_list[item] + self.target_suffix)
            )
            if target.shape[1] > mixed.shape[1]:
                logging.warning(
                    "Target length is longer than mixed length. Truncating target."
                )
                target = target[:, : mixed.shape[1]]
            elif target.shape[1] < mixed.shape[1]:
                logging.warning(
                    "Target length is shorter than mixed length. Padding target."
                )
                target = np.pad(
                    target,
                    ((0, 0), (0, mixed.shape[1] - target.shape[1])),
                    mode="constant",
                )

        if self.sample_rate != 44100:
            mixed_resampled, target_resampled = [], []
            for i in range(mixed.shape[0]):
                mixed_resampled.append(
                    # source files are 44.1 kHz; resample to the configured rate
                    librosa.resample(
                        mixed[i], orig_sr=44100, target_sr=self.sample_rate
                    )
                )
            mixed = np.array(mixed_resampled)
            if target is not None:
                for i in range(target.shape[0]):
                    target_resampled.append(
                        librosa.resample(
                            target[i], orig_sr=44100, target_sr=self.sample_rate
                        )
                    )
                target = np.array(target_resampled)

        if self.wav_sample_len is not None:
            mixed, target = self.wav_sample(mixed, target)

        if self.norm:
            mixed_max = np.max(np.abs(mixed))
            mixed = mixed / mixed_max
            if target is not None:
                target = target / mixed_max

        if not self.testing:
            return_data = (
                torch.tensor(mixed, dtype=torch.float32),
                torch.tensor(target, dtype=torch.float32),
            )
        else:
            return_data = (
                torch.tensor(mixed, dtype=torch.float32),
                self.scene_list[item],
            )

        return return_data

    def __len__(self):
        return len(self.scene_list)

But I got this error, please help me (the file names are .wav). [Three screenshots of the traceback were attached as WhatsApp images.]

bastibe commented 3 months ago

Please post a concise problem description. We are not here to debug your code, but merely to discuss issues with python-soundfile.

Something inside torch seems to be eating the LibsndfileError message. Without that message, there's not much we can do.
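One way to recover it is to catch the error right where the file is opened, in the process that raises it, and print it before torch's machinery re-wraps it. A minimal sketch, assuming the failing call is the read() inside read_wavfile() in cec1_dataset.py:

    import soundfile

    def read_wavfile(path):
        try:
            wav, _ = soundfile.read(path)
        except soundfile.LibsndfileError as err:
            # str(err) still works here, in the raising process; it only
            # breaks after torch pickles and re-raises the exception.
            print(f"libsndfile failed on {path}: {err}")
            raise
        return wav.transpose()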

nirmala-dewi commented 3 months ago

Apologies, I am still a beginner here, so I was confused about what to write. I use .wav files, and I run the code from https://github.com/claritychallenge/clarity/tree/main/recipes/cec1/e009_sheffield, but I got the error message soundfile.LibsndfileError: <exception str() failed> (before I could use my GPU) and soundfile.LibsndfileError: (after I could use my GPU) for the same code.

liu123liu123liu commented 3 months ago

I also ran into the same problem. Did you solve it?

bastibe commented 3 months ago

As I said, without the error message there's not much we can do. Grab your debugger, dig out that error message.
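If the error comes from inside a DataLoader worker, the quickest way to see the original traceback is to run the loader single-process. A sketch, assuming train_set is the CEC1Dataset instance built in the script above:

    from torch.utils.data import DataLoader

    # With num_workers=0, exceptions propagate directly from __getitem__,
    # so the LibsndfileError keeps its real message instead of being
    # re-raised across a worker process boundary.
    debug_loader = DataLoader(dataset=train_set, batch_size=1, num_workers=0)
    next(iter(debug_loader))  # the first failing file raises with full details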

nirmala-dewi commented 3 months ago

The error message is what I wrote (soundfile.LibsndfileError: <exception str() failed> and soundfile.LibsndfileError: ). I don't know which error message you are referring to.

bastibe commented 3 months ago

"exception str() failed" means that torch is trying to convert the LibsndfileError to a string, which fails. That LibsndfileError, however, does hold the real error message, which torch drops at that point. But without that message, we don't know what went wrong.