Min Ma
Created July 30, 2024

Singing Voice Synthesis with DiffSinger

Use a shallow diffusion model to synthesize singing from text and notes, then convert and deploy the complex model to ONNX on a Ryzen AI PC.


Things used in this project

Story


Code

GaussianDiffusion model export code

Python
Export the GaussianDiffusion model to ONNX format.
# coding=utf8

import json
import os
import sys
import inference.svs.ds_e2e as e2e
from modules.fastspeech.pe import PitchExtractor
from usr.diff.shallow_diffusion_tts import GaussianDiffusion
from usr.diffsinger_task import DIFF_DECODERS

from utils import load_ckpt
from utils.audio import save_wav
from utils.hparams import set_hparams, hparams

import torch
import numpy as np

from utils.text_encoder import TokenTextEncoder

root_dir = os.path.dirname(os.path.abspath(__file__))
os.environ['PYTHONPATH'] = f'"{root_dir}"'

sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel'
]


class GaussianDiffusionWrap(GaussianDiffusion):
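    # torch.onnx.export traces forward() with positional tensor arguments, so this
    # wrapper fixes ref_mels=None / infer=True and exposes the remaining inputs
    # positionally for export.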
    def forward(self, txt_tokens,
                # Wrapped Arguments
                spk_id,
                pitch_midi,
                midi_dur,
                is_slur,
                #mel2ph,
                ):

        return super().forward(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
                               pitch_midi=pitch_midi, midi_dur=midi_dur,
                               is_slur=is_slur)#, mel2ph=mel2ph


class DFSInferWrapped(e2e.DiffSingerE2EInfer):
    def build_model(self):
        model = GaussianDiffusionWrap(
            phone_encoder=self.ph_encoder,
            out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](
                hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )

        model.eval()
        #load_ckpt(model, hparams['work_dir'], 'model')
        load_ckpt(model, hparams['fs2_ckpt'], 'model')

        if hparams.get('pe_enable') is not None and hparams['pe_enable']:
            self.pe = PitchExtractor().to(self.device)
            load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
            self.pe.eval()

        return model


if __name__ == '__main__':

    set_hparams(config='usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',print_hparams=False)

    dev = 'cpu'
    #dev = 'cuda' if torch.cuda.is_available() else 'cpu'

    infer_ins = DFSInferWrapped(hparams)
    infer_ins.model.to(dev)

    inp = {
        'text': 'AP',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
    }  # user input: Chinese characters
    
    
    
    
    phone_list = ["AP", "SP", "a", "ai", "an", "ang", "ao", "b", "c", "ch", "d", "e", "ei", "en", "eng", "er", "f", "g",
                  "h", "i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "j", "k", "l", "m", "n", "o",
                  "ong", "ou", "p", "q", "r", "s", "sh", "t", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo", "v",
                  "van", "ve", "vn", "w", "x", "y", "z", "zh"]
    
    ph_encoder =  TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')

    gd = GaussianDiffusion(
                 phone_encoder = ph_encoder,
                 out_dims=hparams['audio_num_mel_bins'],
                 denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
                 timesteps=hparams['timesteps'],
                 K_step=hparams['K_step'],
                 loss_type=hparams['diff_loss_type'],
                 spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
             )
    load_ckpt(gd, hparams['fs2_ckpt'],'model')
    
    with torch.no_grad():
        inp = infer_ins.preprocess_input(
            inp, input_type=inp['input_type'] if inp.get('input_type') else 'word')
        sample = infer_ins.input_to_batch(inp)
        txt_tokens = sample['txt_tokens']  # [B, T_t]
        spk_id = sample.get('spk_ids')

        pitch_midi = sample['pitch_midi']
        midi_dur = sample['midi_dur']
        is_slur = sample['is_slur']
        #mel2ph  = np.zeros_like(is_slur) #None #sample['mel2ph']
        #output_names = 
        
        print(f'txt_tokens: {txt_tokens.shape}')
        print(f'pitch_midi: {pitch_midi.shape}')
        print(f'midi_dur: {midi_dur.shape}')
        print(f'is_slur: {is_slur.shape}')

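        # Note: only the FastSpeech2 sub-network (gd.fs2) is exported here,
        # not the full diffusion sampler.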
        torch.onnx.export(
            gd.fs2,
            (
                txt_tokens.to(dev),
                spk_id.to(dev),
                pitch_midi.to(dev),
                midi_dur.to(dev),
                is_slur.to(dev),
                #mel2ph.to(dev),
            ),
            "fs2.onnx",
            verbose=True,
            export_params=True,
            input_names=["txt_tokens", "spk_id",
                         "pitch_midi", "midi_dur", "is_slur"],#, "mel2ph"
            output_names = ['dur','mel2ph','decoder_inp','mel_out','fs2_mel','fs2_mels'],
            dynamic_axes={
                "txt_tokens": {
                    0: "a",
                    1: "b",
                },
                "spk_id": {
                    0: "a",
                    1: "b",
                },
                "pitch_midi": {
                    0: "a",
                    1: "b",
                },
                "midi_dur": {
                    0: "a",
                    1: "b",
                },
                "is_slur": {
                    0: "a",
                    1: "b",
                },
                
                "mel_out": {
                    0: "a",
                    1: "b",
                    2: "c",
                },
                "fs2_mel": {
                    0: "a",
                    1: "b",
                    2: "c",
                },
                "fs2_mels": {
                    0: "a",
                    1: "b",
                    2: "c",
                },

                #"mel2ph": {
                #    0: "a",
                #    1: "b",
                #}
            },
            opset_version=13
        )
    print('finished part1!\n')
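
A quick structural check of the exported graph before moving on (a minimal sketch, not part of the original script; it assumes fs2.onnx sits in the working directory and that the onnx and onnxruntime packages are installed):

# coding=utf8
import onnx
import onnxruntime as ort

# Validate the exported graph and list its inputs/outputs.
m = onnx.load("fs2.onnx")
onnx.checker.check_model(m)

sess = ort.InferenceSession("fs2.onnx", providers=["CPUExecutionProvider"])
for i in sess.get_inputs():
    print("input :", i.name, i.shape, i.type)
for o in sess.get_outputs():
    print("output:", o.name, o.shape, o.type)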
    

HiFi-GAN model ONNX export code

Python
Export the HiFi-GAN vocoder model to ONNX format.
# coding=utf8

import os
import sys
import inference.svs.ds_e2e as e2e
from utils.audio import save_wav
from utils.hparams import set_hparams, hparams

import torch

root_dir = os.path.dirname(os.path.abspath(__file__))
os.environ['PYTHONPATH'] = f'"{root_dir}"'

sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel'
]

if __name__ == '__main__':

    set_hparams(config='usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml', print_hparams=False)

    dev = 'cuda' if torch.cuda.is_available() else 'cpu'

    infer_ins = e2e.DiffSingerE2EInfer(hparams)
    infer_ins.vocoder.to(dev)
    batch = 1
    frame_len=967
    num_mel_bin = 80

    with torch.no_grad():
        x = torch.rand(batch, num_mel_bin, frame_len).to(dev)
        f0 = torch.rand(batch, frame_len).to(dev)

        #x = torch.load("c.pt").to(dev)
        #f0 = torch.load("f0.pt").to(dev)

        print(x.shape)
        print(f0.shape)

        torch.onnx.export(
            infer_ins.vocoder,
            (
                x,
                f0
            ),
            "hifigan.onnx",
            verbose=True,
            export_params=True,
            input_names=["x", "f0"],
            dynamic_axes={
                "x": {
                    0: "batch",
                    1: "num_mel_bin",
                    2: "frame_len",
                },
                "f0": {
                    0: "batch",
                    1: "frame_len"
                }
            },
            opset_version=13,
        )

    print(infer_ins.vocoder)
    print("OK")

DiffSinger infer code

Python
Run DiffSinger inference with ONNX Runtime and the Vitis AI execution provider.
# coding=utf8

import os
import sys
import inference.svs.ds_e2e as e2e
from inference.svs.opencpop.map import cpop_pinyin2ph_func
from utils.audio import save_wav
from utils.hparams import set_hparams, hparams

import numpy as np

import torch
import onnxruntime as ort

from tqdm import tqdm

from utils.text_encoder import TokenTextEncoder

root_dir = os.path.dirname(os.path.abspath(__file__))
os.environ['PYTHONPATH'] = f'"{root_dir}"'

sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel'
]


def to_numpy(tensor):
    if (tensor is None):
        return np.array([[]])
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()


spec_max = 0
spec_min = 0


def denorm_spec(x):
    return (x + 1) / 2 * (spec_max - spec_min) + spec_min


class TestAllInfer(e2e.DiffSingerE2EInfer):
    def __init__(self, hparams, device=None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        #device='cpu'
        self.hparams = hparams
        self.device = device

        phone_list = ["AP", "SP", "a", "ai", "an", "ang", "ao", "b", "c", "ch", "d", "e", "ei", "en", "eng", "er", "f", "g",
                      "h", "i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "j", "k", "l", "m", "n", "o",
                      "ong", "ou", "p", "q", "r", "s", "sh", "t", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo", "v",
                      "van", "ve", "vn", "w", "x", "y", "z", "zh"]
        self.ph_encoder = TokenTextEncoder(
            None, vocab_list=phone_list, replace_oov=',')
        self.pinyin2phs = cpop_pinyin2ph_func()
        self.spk_map = {'opencpop': 0}


        config_file_path = "vaip_config.json"
        aie_options = ort.SessionOptions()
        aie_options.enable_profiling = True

        print("load pe")
        self.pe2 = ort.InferenceSession("xiaoma_pe.onnx",
                                        providers = ['VitisAIExecutionProvider'],
                                        sess_options=aie_options,
                                        provider_options=[{'config_file': config_file_path}])
        print("load hifigan")
        self.vocoder2 = ort.InferenceSession("hifigan.onnx",
                                        providers = ['VitisAIExecutionProvider'],
                                        sess_options=aie_options,
                                        provider_options=[{'config_file': config_file_path}])
        print("load singer_fs")
        self.model2 = ort.InferenceSession("singer_fs.onnx",
                                        providers = ['VitisAIExecutionProvider'],
                                        sess_options=aie_options,
                                        provider_options=[{'config_file': config_file_path}])
        ips = self.model2.get_inputs()
        print(len(ips))
        for i in range(0, len(ips)):
            print(f'{i}. {ips[i].name}')

        '''
        print("load singer_denoise")
        self.model3 = ort.InferenceSession("singer_denoise.onnx")
        ips = self.model3.get_inputs()
        print(len(ips))
        for i in range(0, len(ips)):
            print(f'{i}. {ips[i].name}')
        '''
        print("load over")

    def run_vocoder(self, c, **kwargs):
        c = c.transpose(2, 1)  # [B, 80, T]
        f0 = kwargs.get('f0')  # [B, T]

        if f0 is not None and hparams.get('use_nsf'):
            ort_inputs = {
                'x': to_numpy(c),
                'f0': to_numpy(f0)
            }
        else:
            ort_inputs = {
                'x': to_numpy(c),
                'f0': {}
            }
            # [T]

        ort_out = self.vocoder2.run(None, ort_inputs)
        y = torch.from_numpy(ort_out[0]).to(self.device)

        return y[None]

    def forward_model(self, inp):
        sample = self.input_to_batch(inp)
        txt_tokens = sample['txt_tokens']  # [B, T_t]
        spk_id = sample.get('spk_ids')
        midi_dur = sample['midi_dur']
        is_slur = sample['is_slur']
        #mel2ph = sample['mel2ph']
        #mel2ph = None

        device = txt_tokens.device

        with torch.no_grad():
            decoder_inp = self.model2.run(
                None,
                {
                    "txt_tokens": to_numpy(txt_tokens),
                    #"spk_id": to_numpy(spk_id),
                    "pitch_midi": to_numpy(sample['pitch_midi']).astype(np.int64),
                    "midi_dur": to_numpy(sample['midi_dur']),
                    "is_slur": to_numpy(sample['is_slur']).astype(np.int64),
                    #"mel2ph": np.array([0, 0]).astype(np.int64)
                }
            )

            cond = torch.from_numpy(decoder_inp[0]).transpose(1, 2)

            print(f'cond2: {cond}')

            t = hparams['K_step']
            print('===> gaussion start.')
            shape = (cond.shape[0], 1,
                     hparams['audio_num_mel_bins'], cond.shape[2])
            x = torch.randn(shape, device=device)
            # x = torch.zeros(shape, device=device)
            '''
            
            for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
                res2 = self.model3.run(
                    None,
                    {
                        "x": to_numpy(x),
                        "t": np.array([i]).astype(np.int64),
                        "cond": to_numpy(cond),
                    }
                )
                x = torch.from_numpy(res2[0])

            x = x[:, 0].transpose(1, 2)
            
            x = cond
            
            if mel2ph is not None:  # for singing
                mel_out = denorm_spec(x) * ((mel2ph > 0).float()[:, :, None])
            else:
                mel_out = denorm_spec(x)

            # mel_out = output['mel_out']  # [B, T,80]
            '''
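            # The diffusion sampling loop above is left disabled; the FastSpeech2
            # mel prediction (second-to-last output of singer_fs.onnx, 'fs2_mel'
            # in the export's output_names) is used directly as mel_out below.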
            #mel_out = decoder_inp['mel_out']
            mel_out = decoder_inp[-2]
            print(mel_out.shape)
            print(f'mel_out:{mel_out}')
            
            if hparams.get('pe_enable') is not None and hparams['pe_enable']:
                pe2_res = self.pe2.run(None,
                                       {
                                           #'mel_input': to_numpy(mel_out)
                                           'mel_input': mel_out
                                       }
                                       )

                # pe predict from Pred mel
                f0_pred = torch.from_numpy(pe2_res[1])

            else:
                # f0_pred = output['f0_denorm']
                f0_pred = None

            # Run Vocoder
            wav_out = self.run_vocoder(torch.from_numpy(mel_out), f0=f0_pred)
        wav_out = wav_out.cpu().numpy()
        return wav_out[0]


if __name__ == '__main__':
    
    c = {
        'text': 'AP',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
    }  # user input: Chinese characters
    '''
    
    c = {
        'text': '    SP         AP',
        'notes': 'D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | rest | D#4/Eb4 | D4 | D4 | D4 | D#4/Eb4 | F4 | D#4/Eb4 | D4 | rest',
        'notes_duration': '0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590',
        'input_type': 'word'
    }
    '''

    target = "./infer_out/onnx_test_singer_res.wav"

    set_hparams(config='usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',print_hparams=False)

    spec_min= torch.FloatTensor(hparams['spec_min'])[None, None, :hparams['keep_bins']]
    spec_max= torch.FloatTensor(hparams['spec_max'])[None, None, :hparams['keep_bins']]

    infer_ins = TestAllInfer(hparams)

    out = infer_ins.infer_once(c)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    print(f'| save audio: {target}')
    save_wav(out, target, hparams['audio_sample_rate'])

    print("OK")

PE model export code

Python
Export the PitchExtractor (PE) model to ONNX format.
# coding=utf8

import os
import sys
import inference.svs.ds_e2e as e2e
from utils.audio import save_wav
from utils.hparams import set_hparams, hparams

import torch

root_dir = os.path.dirname(os.path.abspath(__file__))
os.environ['PYTHONPATH'] = f'"{root_dir}"'

sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel'
]

if __name__ == '__main__':
    set_hparams(config='usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml', print_hparams=False)

    #dev = 'cuda'
    dev = 'cuda' if torch.cuda.is_available() else 'cpu'

    infer_ins = e2e.DiffSingerE2EInfer(hparams)
    infer_ins.pe.to(dev)
    batch = 1
    frame_len = 967
    num_mel_bin = 80
    with torch.no_grad():
        mel_input = torch.rand(batch, frame_len, num_mel_bin).to(dev)

        torch.onnx.export(
            infer_ins.pe,
            (
                mel_input
            ),
            "xiaoma_pe.onnx",
            verbose=True,
            export_params=True,
            input_names=["mel_input"],
            dynamic_axes={
                "mel_input": {
                    0: "batch",
                    1: "frame_len",
                    2: "num_mel_bin"
                }
            },
            opset_version=13
        )

    print("OK")

Credits

Min Ma
8 projects • 1 follower
Senior Software Engineer
