通常在使用VITS进行声音克隆的时候出现声音沙哑或者大佐味(就是日本腔调),一方面是由于模型训练的问题;如果确认模型训练没有问题,那就是参数设置或其他原因。这里介绍一个通用的解决办法。
声音预测参数
按照以下图片进行设置获取模型。
上传好音频之后点击这些选项,然后生成音频。
音频生成
首先使用微软的TTS进行文本转语音的操作。这里有个技巧:不要把整篇文字一次性扔进去,而是拆分分段生成音频,然后再克隆。具体原因自己体会吧,这是我尝试了很多次才总结出来的经验。
先整理好你的文件目录如图。
这里面的TTS_apiKey
要换成你的,split_and_accumulate
方法后面的50是拆分字数间隔。
import http.client
from xml.etree import ElementTree
import wave
import os
import requests
import pandas as pd
import hashlib
import platform
import uuid
import json
from moviepy.editor import *
import re
import time
from moviepy.audio.fx import audio_fadein, audio_fadeout
# Azure Speech Service configuration.
# Reference: https://learn.microsoft.com/zh-cn/azure/cognitive-services/speech-service/language-support?tabs=tts
language = "zh-CN"
# en-US  English (United States)
# ja-JP  Japanese (Japan)
style = "narration-professional"
gender = "Male"
name = "zh-CN, YunxiNeural"
# name = "zh-CN, XiaochenNeural"
audio_rate = "1.2"
pitch = "-10"
txt_save_path = "data.txt"     # input text document
base_audio_path = "wav_each"   # directory for the per-chunk wav files

# Make sure the output directory exists (the original crashed with
# FileNotFoundError on a fresh checkout), then clear any leftover files
# from a previous run so stale clips are not concatenated into the result.
os.makedirs(base_audio_path, exist_ok=True)
for file_name in os.listdir(base_audio_path):
    file_path = os.path.join(base_audio_path, file_name)
    if os.path.isfile(file_path):  # skip subdirectories; os.remove would raise on them
        os.remove(file_path)
def TTS_make_2(text, audio_path, _retries=3):
    """Synthesize *text* into ``<audio_path>.wav`` via the Azure TTS REST API.

    Reads the module-level ``language`` / ``style`` / ``gender`` / ``name`` /
    ``audio_rate`` settings.  A failed request or an unreadable output file
    triggers a retry, at most ``_retries`` more times.

    Fixes over the original:
    - both retry calls passed the arguments swapped (``TTS_make_2(audio_path,
      text)``), which synthesized the *file name* instead of the text;
    - the failure message used a ``{}`` placeholder without ``.format()``;
    - retries were unbounded recursion; now capped by ``_retries``;
    - bare ``except:`` narrowed to ``except Exception``.
    """
    TTS_apiKey = ""
    params = ""
    headers = {"Ocp-Apim-Subscription-Key": TTS_apiKey}
    # AccessTokenUri = "https://eastus.api.cognitive.microsoft.com/sts/v1.0/issuetoken";
    path = "/sts/v1.0/issueToken"
    print("正在连接微软服务器以获取文本转语音访问令牌")
    # Step 1: exchange the subscription key for a short-lived bearer token.
    AccessTokenHost = "eastus.api.cognitive.microsoft.com"
    conn = http.client.HTTPSConnection(AccessTokenHost)
    conn.request("POST", path, params, headers)
    response = conn.getresponse()
    print(response.status, response.reason)
    data = response.read()
    conn.close()
    accesstoken = data.decode("UTF-8")
    # Step 2: build the SSML payload describing voice, style, gender and rate.
    body = ElementTree.Element('speak', version='1.0')
    body.set('{http://www.w3.org/XML/1998/namespace}lang', '{}'.format(language))
    voice = ElementTree.SubElement(body, 'voice')
    voice.set('{http://www.w3.org/XML/1998/namespace}lang', '{}'.format(language))
    voice.set('{http://www.w3.org/XML/1998/namespace}style', '{}'.format(style))
    voice.set('{http://www.w3.org/XML/1998/namespace}gender', '{}'.format(gender))
    voice.set('name', 'Microsoft Server Speech Text to Speech Voice ({})'.format(name))
    voice.set('rate', audio_rate)
    # NOTE(review): the module-level `pitch` setting ("-10") is ignored here;
    # pitch is hard-coded to "medium" — confirm whether that is intended.
    voice.set('pitch', "medium")
    print('使用的声音是:{},风格是:{},性别是:{})'.format(name, style, gender))
    voice.text = text
    headers = {"Content-type": "application/ssml+xml",
               "X-Microsoft-OutputFormat": "riff-24khz-16bit-mono-pcm",
               "Authorization": "Bearer " + accesstoken,
               "X-Search-AppId": "07D3234E49CE426DAA29772419F436CA",
               "X-Search-ClientID": "1ECFAE91408841A480F00935DC390960",
               "User-Agent": "TTSForPython"}
    # Step 3: POST the SSML to the synthesis endpoint.
    print("\n连接到服务器以合成音频")
    conn = http.client.HTTPSConnection("eastus.tts.speech.microsoft.com")
    conn.request("POST", "/cognitiveservices/v1", ElementTree.tostring(body), headers)
    response = conn.getresponse()
    if response.status == 200:
        data = response.read()
        conn.close()
        print("验证音频文件波长为: %d" % (len(data)))
        print(audio_path + ".wav")
        # Wrap the returned PCM payload as a 24 kHz, 16-bit, mono wave file.
        f = wave.open(audio_path + ".wav", "wb")
        f.setnchannels(1)    # mono
        f.setframerate(24000)  # sample rate
        f.setsampwidth(2)    # 2 bytes = 16 bits per sample
        f.writeframes(data)
        f.close()
        print("字幕:【{}】,音频文件生成成功。\n文件保存位置:{}".format(text, audio_path + ".wav"))
        # Sanity-check the file by letting moviepy parse it; a corrupt file
        # triggers a regeneration while retries remain.
        try:
            AudioFileClip(audio_path + ".wav")
            print("验证音频通过,文件为 {}".format(audio_path + ".wav"))
        except Exception:
            print("验证音频失败,文件为 {}".format(audio_path + ".wav"))
            if _retries > 0:
                print("正在重新生成音频,{}".format(audio_path + ".wav"))
                # BUGFIX: original recursed as TTS_make_2(audio_path, text).
                TTS_make_2(text, audio_path, _retries - 1)
                print("字幕:【{}】,音频文件生成成功。\n文件保存位置:{}".format(text, audio_path + ".wav"))
    else:
        conn.close()  # original leaked the connection on the error path
        # BUGFIX: original printed the raw "{}" placeholder (missing .format)
        # and retried with swapped arguments.
        print("字幕:【{}】,音频文件生成失败,尝试重新生成".format(text))
        if _retries > 0:
            TTS_make_2(text, audio_path, _retries - 1)
    print("-" * 50)
# 切分文字函数
def read_text_file(file_path):
    """Read a UTF-8 text file and return its cleaned, non-empty lines.

    Cleaning removes newlines, collapses a doubled full stop ("。。") into
    a single one, and strips every '*' and '#' character.
    """
    with open(file_path, 'r', encoding='utf-8') as src:
        raw_lines = src.readlines()
    cleaned = [
        line.replace("\n", "")
            .replace("。。", "。")
            .replace("*", "")
            .replace("#", "")
        for line in raw_lines
    ]
    return [line for line in cleaned if line]
def split_and_accumulate(lines, max_length):
    """Greedily pack sentences into chunks of at most *max_length* characters.

    Each line is split on the Chinese full stop "。" and the terminator is
    re-appended to every sentence.  Sentences accumulate into the current
    chunk until adding the next one would exceed *max_length*, then a new
    chunk starts.  A single sentence longer than *max_length* becomes its
    own (oversized) chunk.

    Fixes over the original:
    - the empty fragment produced by splitting a trailing "。" no longer
      turns into a spurious lone-"。" in the output;
    - no empty leading chunk is emitted when the very first sentence
      already exceeds *max_length*.
    """
    result = []
    current_sentence = ''
    current_length = 0
    for line in lines:
        for sentence in line.strip().split('。'):
            if not sentence:
                continue  # skip empty fragments (e.g. after a trailing 。)
            sentence += '。'  # restore the terminator removed by split()
            sentence_length = len(sentence)
            if current_length + sentence_length <= max_length:
                current_sentence += sentence
                current_length += sentence_length
            else:
                if current_sentence:  # never emit an empty chunk
                    result.append(current_sentence)
                current_sentence = sentence
                current_length = sentence_length
    if current_sentence:
        result.append(current_sentence)
    return result
# TTS_make_2(text,audio_path)
# `lines` holds the cleaned text read from the source document.
lines = read_text_file(txt_save_path)
# Chunks of at most 50 characters — longer inputs degrade cloning quality.
new_elements = split_and_accumulate(lines, 50)
# Generate one numbered wav file (1.wav, 2.wav, ...) per text chunk.
for n, text in enumerate(new_elements, start=1):
    audio_path = os.path.join(base_audio_path, str(n))
    TTS_make_2(text, audio_path)
# Stitch the per-chunk wav files back together, in numeric file-name order.
wav_files = os.listdir(base_audio_path)
wav_files.sort(key=lambda fname: int(re.search(r'\d+', fname).group()))
clips = []
for wav_name in wav_files:
    clip = AudioFileClip(os.path.join(base_audio_path, wav_name))
    # A short fade-in at each joint softens the transition between chunks.
    clips.append(clip.audio_fadein(0.2))
final_clip = concatenate_audioclips(clips)
final_clip.write_audiofile("result.wav")  # write the merged audio file
选择wav_each
文件夹下的音频文件批量上传预测就可以了。文章来源:https://www.toymoban.com/news/detail-566144.html
最终将音频文件再合成一个或者不合成都无所谓了。文章来源地址https://www.toymoban.com/news/detail-566144.html
到了这里,关于基于So-VITS-SVC4.1声音克隆音频异常的解决办法的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!