通常在使用VITS进行声音克隆的时候出现声音沙哑或者大佐味(就是日本腔调),一方面是由于模型训练的问题;如果确认模型训练没有问题,那就是参数设置或其他原因。这里介绍一个通用的解决办法。
声音预测参数
按照以下图片进行设置获取模型。
上传好音频之后点击这些选项,然后生成音频。
音频生成
首先使用微软的TTS进行文本转语音的操作。这里有个技巧:不要把整篇文字一次性扔进去,而是拆分分段生成音频,然后再克隆。具体原因自己体会吧,这是我尝试了很多次才总结出来的经验。
先整理好你的文件目录如图。
这里面的TTS_apiKey
要换成你的,split_and_accumulate
方法后面的50是拆分字数间隔。
import http.client
from xml.etree import ElementTree
import wave
import os
import requests
import pandas as pd
import hashlib
import platform
import uuid
import json
from moviepy.editor import *
import re
import time
from moviepy.audio.fx import audio_fadein, audio_fadeout
# Azure Speech Service configuration.
# Reference: https://learn.microsoft.com/zh-cn/azure/cognitive-services/speech-service/language-support?tabs=tts
language = "zh-CN"
# en-US  English (United States)
# ja-JP  Japanese (Japan)
style = "narration-professional"
gender = "Male"
name = "zh-CN, YunxiNeural"
# name = "zh-CN, XiaochenNeural"
audio_rate = "1.2"
pitch = "-10"
txt_save_path = "data.txt"     # input text document
base_audio_path = "wav_each"   # directory for the per-chunk wav files

# Make sure the output directory exists (the original crashed with
# FileNotFoundError on a fresh checkout), then clear any leftover files
# from a previous run so stale clips are not concatenated into the result.
os.makedirs(base_audio_path, exist_ok=True)
for file_name in os.listdir(base_audio_path):
    file_path = os.path.join(base_audio_path, file_name)
    if os.path.isfile(file_path):  # skip subdirectories; os.remove would raise on them
        os.remove(file_path)
def TTS_make_2(text, audio_path, _retries=3):
    """Synthesize *text* into ``<audio_path>.wav`` via the Azure TTS REST API.

    Reads the module-level ``language`` / ``style`` / ``gender`` / ``name`` /
    ``audio_rate`` settings.  A failed request or an unreadable output file
    triggers a retry, at most ``_retries`` more times.

    Fixes over the original:
    - both retry calls passed the arguments swapped (``TTS_make_2(audio_path,
      text)``), which synthesized the *file name* instead of the text;
    - the failure message used a ``{}`` placeholder without ``.format()``;
    - retries were unbounded recursion; now capped by ``_retries``;
    - bare ``except:`` narrowed to ``except Exception``.
    """
    TTS_apiKey = ""
    params = ""
    headers = {"Ocp-Apim-Subscription-Key": TTS_apiKey}
    # AccessTokenUri = "https://eastus.api.cognitive.microsoft.com/sts/v1.0/issuetoken";
    path = "/sts/v1.0/issueToken"
    print("正在连接微软服务器以获取文本转语音访问令牌")
    # Step 1: exchange the subscription key for a short-lived bearer token.
    AccessTokenHost = "eastus.api.cognitive.microsoft.com"
    conn = http.client.HTTPSConnection(AccessTokenHost)
    conn.request("POST", path, params, headers)
    response = conn.getresponse()
    print(response.status, response.reason)
    data = response.read()
    conn.close()
    accesstoken = data.decode("UTF-8")
    # Step 2: build the SSML payload describing voice, style, gender and rate.
    body = ElementTree.Element('speak', version='1.0')
    body.set('{http://www.w3.org/XML/1998/namespace}lang', '{}'.format(language))
    voice = ElementTree.SubElement(body, 'voice')
    voice.set('{http://www.w3.org/XML/1998/namespace}lang', '{}'.format(language))
    voice.set('{http://www.w3.org/XML/1998/namespace}style', '{}'.format(style))
    voice.set('{http://www.w3.org/XML/1998/namespace}gender', '{}'.format(gender))
    voice.set('name', 'Microsoft Server Speech Text to Speech Voice ({})'.format(name))
    voice.set('rate', audio_rate)
    # NOTE(review): the module-level `pitch` setting ("-10") is ignored here;
    # pitch is hard-coded to "medium" — confirm whether that is intended.
    voice.set('pitch', "medium")
    print('使用的声音是:{},风格是:{},性别是:{})'.format(name, style, gender))
    voice.text = text
    headers = {"Content-type": "application/ssml+xml",
               "X-Microsoft-OutputFormat": "riff-24khz-16bit-mono-pcm",
               "Authorization": "Bearer " + accesstoken,
               "X-Search-AppId": "07D3234E49CE426DAA29772419F436CA",
               "X-Search-ClientID": "1ECFAE91408841A480F00935DC390960",
               "User-Agent": "TTSForPython"}
    # Step 3: POST the SSML to the synthesis endpoint.
    print("\n连接到服务器以合成音频")
    conn = http.client.HTTPSConnection("eastus.tts.speech.microsoft.com")
    conn.request("POST", "/cognitiveservices/v1", ElementTree.tostring(body), headers)
    response = conn.getresponse()
    if response.status == 200:
        data = response.read()
        conn.close()
        print("验证音频文件波长为: %d" % (len(data)))
        print(audio_path + ".wav")
        # Wrap the returned PCM payload as a 24 kHz, 16-bit, mono wave file.
        f = wave.open(audio_path + ".wav", "wb")
        f.setnchannels(1)    # mono
        f.setframerate(24000)  # sample rate
        f.setsampwidth(2)    # 2 bytes = 16 bits per sample
        f.writeframes(data)
        f.close()
        print("字幕:【{}】,音频文件生成成功。\n文件保存位置:{}".format(text, audio_path + ".wav"))
        # Sanity-check the file by letting moviepy parse it; a corrupt file
        # triggers a regeneration while retries remain.
        try:
            AudioFileClip(audio_path + ".wav")
            print("验证音频通过,文件为 {}".format(audio_path + ".wav"))
        except Exception:
            print("验证音频失败,文件为 {}".format(audio_path + ".wav"))
            if _retries > 0:
                print("正在重新生成音频,{}".format(audio_path + ".wav"))
                # BUGFIX: original recursed as TTS_make_2(audio_path, text).
                TTS_make_2(text, audio_path, _retries - 1)
                print("字幕:【{}】,音频文件生成成功。\n文件保存位置:{}".format(text, audio_path + ".wav"))
    else:
        conn.close()  # original leaked the connection on the error path
        # BUGFIX: original printed the raw "{}" placeholder (missing .format)
        # and retried with swapped arguments.
        print("字幕:【{}】,音频文件生成失败,尝试重新生成".format(text))
        if _retries > 0:
            TTS_make_2(text, audio_path, _retries - 1)
    print("-" * 50)
# 切分文字函数
def read_text_file(file_path):
    """Read a UTF-8 text file and return its cleaned, non-empty lines.

    Cleaning removes newlines, collapses a doubled full stop ("。。") into
    a single one, and strips every '*' and '#' character.
    """
    with open(file_path, 'r', encoding='utf-8') as src:
        raw_lines = src.readlines()
    cleaned = [
        line.replace("\n", "")
            .replace("。。", "。")
            .replace("*", "")
            .replace("#", "")
        for line in raw_lines
    ]
    return [line for line in cleaned if line]
def split_and_accumulate(lines, max_length):
    """Greedily pack sentences into chunks of at most *max_length* characters.

    Each line is split on the Chinese full stop "。" and the terminator is
    re-appended to every sentence.  Sentences accumulate into the current
    chunk until adding the next one would exceed *max_length*, then a new
    chunk starts.  A single sentence longer than *max_length* becomes its
    own (oversized) chunk.

    Fixes over the original:
    - the empty fragment produced by splitting a trailing "。" no longer
      turns into a spurious lone-"。" in the output;
    - no empty leading chunk is emitted when the very first sentence
      already exceeds *max_length*.
    """
    result = []
    current_sentence = ''
    current_length = 0
    for line in lines:
        for sentence in line.strip().split('。'):
            if not sentence:
                continue  # skip empty fragments (e.g. after a trailing 。)
            sentence += '。'  # restore the terminator removed by split()
            sentence_length = len(sentence)
            if current_length + sentence_length <= max_length:
                current_sentence += sentence
                current_length += sentence_length
            else:
                if current_sentence:  # never emit an empty chunk
                    result.append(current_sentence)
                current_sentence = sentence
                current_length = sentence_length
    if current_sentence:
        result.append(current_sentence)
    return result
# TTS_make_2(text,audio_path)
# `lines` holds the cleaned text read from the source document.
lines = read_text_file(txt_save_path)
# Chunks of at most 50 characters — longer inputs degrade cloning quality.
new_elements = split_and_accumulate(lines, 50)
# Generate one numbered wav file (1.wav, 2.wav, ...) per text chunk.
for n, text in enumerate(new_elements, start=1):
    audio_path = os.path.join(base_audio_path, str(n))
    TTS_make_2(text, audio_path)
# Stitch the per-chunk wav files back together, in numeric file-name order.
wav_files = os.listdir(base_audio_path)
wav_files.sort(key=lambda fname: int(re.search(r'\d+', fname).group()))
clips = []
for wav_name in wav_files:
    clip = AudioFileClip(os.path.join(base_audio_path, wav_name))
    # A short fade-in at each joint softens the transition between chunks.
    clips.append(clip.audio_fadein(0.2))
final_clip = concatenate_audioclips(clips)
final_clip.write_audiofile("result.wav")  # write the merged audio file
选择wav_each
文件夹下的音频文件批量上传预测就可以了。文章来源:https://www.toymoban.com/news/detail-566144.html
最终将音频文件再合成一个或者不合成都无所谓了。文章来源地址https://www.toymoban.com/news/detail-566144.html
到了这里,关于基于So-VITS-SVC4.1声音克隆音频异常的解决办法的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!