bark_tts/script.py

import configparser
import glob
import time
from pathlib import Path

import gradio as gr
import nltk
import numpy as np
from bark import SAMPLE_RATE, generate_audio, preload_models
from scipy.io.wavfile import write as write_wav

from extensions.bark_tts import tts_preprocessor
from modules import chat, shared
from modules.html_generator import chat_html_wrapper

# sent_tokenize needs the punkt model; keep the data inside the extension folder
nltk.data.path.append('extensions/bark_tts/')
nltk.download('punkt', download_dir='extensions/bark_tts/')
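
# First-run bootstrap: if no INI file exists yet, write one with default
# settings so they persist across restarts and can be edited by hand.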
config_file = configparser.ConfigParser()
if not Path('extensions/bark_tts/bark_tts.ini').is_file():
    config_file.add_section('bark_tts')
    config_file.set('bark_tts', 'speaker', 'en_speaker_8')
    config_file.set('bark_tts', 'activate', 'True')
    config_file.set('bark_tts', 'show_text', 'True')
    config_file.set('bark_tts', 'autoplay', 'True')
    config_file.set('bark_tts', 'tokenize', 'True')
    config_file.set('bark_tts', 'text_temp', '0.6')
    config_file.set('bark_tts', 'waveform_temp', '0.6')
    config_file.add_section('bark_internals')
    config_file.set('bark_internals', 'text_use_gpu', 'True')
    config_file.set('bark_internals', 'text_use_small', 'False')
    config_file.set('bark_internals', 'coarse_use_gpu', 'True')
    config_file.set('bark_internals', 'coarse_use_small', 'False')
    config_file.set('bark_internals', 'fine_use_gpu', 'True')
    config_file.set('bark_internals', 'fine_use_small', 'False')
    config_file.set('bark_internals', 'codec_use_gpu', 'True')
    config_file.set('bark_internals', 'force_reload', 'False')
    with open('extensions/bark_tts/bark_tts.ini', 'w') as configfileObj:
        config_file.write(configfileObj)
    print()
    print("Config file 'bark_tts.ini' recreated with default settings")


def read_config():
    config_file.read('extensions/bark_tts/bark_tts.ini')
    return config_file


config = read_config()


def update_config(setting, value):
    global config
    config_file['bark_tts'][setting] = str(value)
    with open('extensions/bark_tts/bark_tts.ini', 'w') as configfileObj:
        config_file.write(configfileObj)
    config = read_config()
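
# Runtime parameters mirrored from the INI file; the event handlers in ui()
# update both this dict and the file whenever a control changes.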
params = {
    'speaker': config['bark_tts']['speaker'],
    'activate': config['bark_tts'].getboolean('activate'),
    'show_text': config['bark_tts'].getboolean('show_text'),
    'autoplay': config['bark_tts'].getboolean('autoplay'),
    'tokenize': config['bark_tts'].getboolean('tokenize'),
    'text_temp': config['bark_tts'].getfloat('text_temp'),
    'waveform_temp': config['bark_tts'].getfloat('waveform_temp')
}

print()
print('Loading Bark models...')
preload_models(
    text_use_gpu=config['bark_internals'].getboolean('text_use_gpu'),
    text_use_small=config['bark_internals'].getboolean('text_use_small'),
    coarse_use_gpu=config['bark_internals'].getboolean('coarse_use_gpu'),
    coarse_use_small=config['bark_internals'].getboolean('coarse_use_small'),
    fine_use_gpu=config['bark_internals'].getboolean('fine_use_gpu'),
    fine_use_small=config['bark_internals'].getboolean('fine_use_small'),
    codec_use_gpu=config['bark_internals'].getboolean('codec_use_gpu'),
    force_reload=config['bark_internals'].getboolean('force_reload')
)
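
# Voice presets: Bark's ten built-in English speakers plus any custom history
# prompts (.npz files) dropped into extensions/bark_tts/voices/.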
default_voices = ['en_speaker_0', 'en_speaker_1', 'en_speaker_2', 'en_speaker_3',
                  'en_speaker_4', 'en_speaker_5', 'en_speaker_6', 'en_speaker_7',
                  'en_speaker_8', 'en_speaker_9']
custom_voices = glob.glob('extensions/bark_tts/voices/*.npz')
voices = custom_voices + default_voices
# remember if chat streaming was enabled
streaming_state = shared.args.no_stream


def remove_tts_from_history(name1, name2, mode):
    """Replace every audio tag in the visible history with the original reply text."""
    for i, entry in enumerate(shared.history['internal']):
        shared.history['visible'][i] = [shared.history['visible'][i][0], entry[1]]
    return chat_html_wrapper(shared.history['visible'], name1, name2, mode)


def toggle_text_in_history(name1, name2, mode):
    """Show or hide the reply text under each audio player, per params['show_text']."""
    for i, entry in enumerate(shared.history['visible']):
        visible_reply = entry[1]
        if visible_reply.startswith('<audio'):
            if params['show_text']:
                reply = shared.history['internal'][i][1]
                shared.history['visible'][i] = [
                    shared.history['visible'][i][0],
                    f"{visible_reply.split('</audio>')[0]}</audio>\n\n{reply}"]
            else:
                shared.history['visible'][i] = [
                    shared.history['visible'][i][0],
                    f"{visible_reply.split('</audio>')[0]}</audio>"]
    return chat_html_wrapper(shared.history['visible'], name1, name2, mode)


def input_modifier(string):
    """
    This function is applied to your text inputs before
    they are fed into the model.
    """
    # Remove autoplay from the last reply so it doesn't replay on every refresh
    if shared.is_chat() and len(shared.history['internal']) > 0:
        shared.history['visible'][-1] = [
            shared.history['visible'][-1][0],
            shared.history['visible'][-1][1].replace('controls autoplay>', 'controls>')]
    shared.processing_message = "*Is recording a voice message...*"
    # Disable streaming, otherwise the audio output would stutter and restart
    # every time the message is updated
    shared.args.no_stream = True
    return string


def output_modifier(string):
    """
    This function is applied to the model outputs.
    """
    global model, current_params, streaming_state
    for i in params:
        if params[i] != current_params[i]:
            # model = load_model()
            current_params = params.copy()
            break

    if not params['activate']:
        return string

    original_string = string
    string = tts_preprocessor.preprocess(string)
    if string == '':
        string = '*Empty reply, try regenerating*'
    else:
        output_file = Path(f'extensions/bark_tts/outputs/{shared.character}_{int(time.time())}.wav')
        if params['tokenize']:
            # Bark only generates around 13 seconds of audio per call, so split
            # the reply into sentences and pack them into chunks of at most
            # ~250 characters, then synthesize and concatenate chunk by chunk.
            sentences = nltk.sent_tokenize(string)
            audio_array = np.empty(0, dtype=np.int16)
            chunks = ['']
            length_counter = 0
            for sentence in sentences:
                # nltk.Text over a raw string treats every character as a
                # token, so this measures the sentence length in characters
                current_length = len(nltk.Text(sentence))
                if length_counter + current_length <= 250:
                    length_counter += current_length
                    chunks[-1] = chunks[-1] + ' ' + sentence
                else:
                    length_counter = current_length
                    chunks.append(sentence)
            for chunk in chunks:
                audio_chunk = generate_audio(
                    chunk, history_prompt=params['speaker'],
                    text_temp=params['text_temp'], waveform_temp=params['waveform_temp'])
                audio_array = np.concatenate((audio_array, audio_chunk))
        else:
            audio_array = generate_audio(
                string, history_prompt=params['speaker'],
                text_temp=params['text_temp'], waveform_temp=params['waveform_temp'])
        write_wav(output_file, SAMPLE_RATE, audio_array)
        autoplay = 'autoplay' if params['autoplay'] else ''
        string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay}></audio>'
        if params['show_text']:
            string += f'\n\n{original_string}'

    shared.processing_message = "*Is typing...*"
    # restore the streaming option to the previous value
    shared.args.no_stream = streaming_state
    return string


def bot_prefix_modifier(string):
    """
    This function is only applied in chat mode. It modifies
    the prefix text for the Bot and can be used to bias its
    behavior.
    """
    return string


def setup():
    global current_params
    current_params = params.copy()


def ui():
    # Gradio elements
    with gr.Accordion("Bark TTS"):
        with gr.Row():
            activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
            autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically')
            tokenize = gr.Checkbox(value=params['tokenize'], label='Tokenize the reply')
        show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')
        voice = gr.Dropdown(value=params['speaker'], choices=voices, label='TTS voice')
        with gr.Row():
            t_temp = gr.Slider(0, 1, value=params['text_temp'], step=0.01, label='Text temperature')
            w_temp = gr.Slider(0, 1, value=params['waveform_temp'], step=0.01, label='Waveform temperature')
        with gr.Row():
            convert = gr.Button('Permanently replace audios with the message texts')
            convert_cancel = gr.Button('Cancel', visible=False)
            convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False)

    # Convert history with confirmation
    convert_arr = [convert_confirm, convert, convert_cancel]
    convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr)
    convert_confirm.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr)
    convert_confirm.click(remove_tts_from_history, [shared.gradio[k] for k in ['name1', 'name2', 'mode']], shared.gradio['display'])
    convert_confirm.click(lambda: chat.save_history(timestamp=False), [], [], show_progress=False)
    convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr)

    # Toggle message text in history
    show_text.change(lambda x: [params.update({"show_text": x}), update_config('show_text', x)], show_text, None)
    show_text.change(toggle_text_in_history, [shared.gradio[k] for k in ['name1', 'name2', 'mode']], shared.gradio['display'])
    show_text.change(lambda: chat.save_history(timestamp=False), [], [], show_progress=False)
    # Event functions to update the parameters in the backend
    activate.change(lambda x: [params.update({"activate": x}), update_config('activate', x)], activate, None)
    autoplay.change(lambda x: [params.update({"autoplay": x}), update_config('autoplay', x)], autoplay, None)
    tokenize.change(lambda x: [params.update({"tokenize": x}), update_config('tokenize', x)], tokenize, None)
    voice.change(lambda x: [params.update({"speaker": x}), update_config('speaker', x)], voice, None)
    t_temp.change(lambda x: [params.update({"text_temp": x}), update_config('text_temp', x)], t_temp, None)
    w_temp.change(lambda x: [params.update({"waveform_temp": x}), update_config('waveform_temp', x)], w_temp, None)