feat: native speecht5 support

This commit is contained in:
Timothy J. Baek
2024-11-04 01:16:51 -08:00
parent 835eeb6433
commit 1fd67d7e5d
4 changed files with 105 additions and 0 deletions

View File

@@ -74,6 +74,10 @@ app.state.config.TTS_VOICE = AUDIO_TTS_VOICE
app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY
app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON
# Placeholders for the local text-to-speech pipeline and its speaker-embedding
# dataset. Both start as None and are filled in by load_speech_pipeline() the
# first time it runs, so nothing is loaded at import time.
app.state.speech_synthesiser = None
app.state.speech_speaker_embeddings_dataset = None
app.state.config.TTS_AZURE_SPEECH_REGION = AUDIO_TTS_AZURE_SPEECH_REGION
app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT
@@ -231,6 +235,21 @@ async def update_audio_config(
}
def load_speech_pipeline():
    """Populate the cached text-to-speech objects on ``app.state`` if missing.

    Two attributes are managed:

    * ``app.state.speech_synthesiser`` -- a ``transformers`` "text-to-speech"
      pipeline built for ``microsoft/speecht5_tts``.
    * ``app.state.speech_speaker_embeddings_dataset`` -- the "validation"
      split of the ``Matthijs/cmu-arctic-xvectors`` dataset.

    Each attribute is only created when it is currently ``None``, so calling
    this function again after a successful load does no further loading.
    Nothing is returned; the results are stored on ``app.state``.
    """
    # Imported inside the function so these libraries are only required when
    # this function is actually called.
    from transformers import pipeline
    from datasets import load_dataset

    state = app.state

    # Build the synthesiser once and keep it for later requests.
    if state.speech_synthesiser is None:
        state.speech_synthesiser = pipeline(
            task="text-to-speech",
            model="microsoft/speecht5_tts",
        )

    # Same caching approach for the speaker-embedding dataset.
    if state.speech_speaker_embeddings_dataset is None:
        state.speech_speaker_embeddings_dataset = load_dataset(
            "Matthijs/cmu-arctic-xvectors",
            split="validation",
        )
@app.post("/speech")
async def speech(request: Request, user=Depends(get_verified_user)):
body = await request.body()
@@ -397,6 +416,43 @@ async def speech(request: Request, user=Depends(get_verified_user)):
raise HTTPException(
status_code=500, detail=f"Error synthesizing speech - {response.reason}"
)
elif app.state.config.TTS_ENGINE == "transformers":
payload = None
try:
payload = json.loads(body.decode("utf-8"))
except Exception as e:
log.exception(e)
raise HTTPException(status_code=400, detail="Invalid JSON payload")
import torch
import soundfile as sf
load_speech_pipeline()
embeddings_dataset = app.state.speech_speaker_embeddings_dataset
speaker_index = 6799
try:
speaker_index = embeddings_dataset["filename"].index(
app.state.config.TTS_MODEL
)
except Exception:
pass
speaker_embedding = torch.tensor(
embeddings_dataset[speaker_index]["xvector"]
).unsqueeze(0)
speech = app.state.speech_synthesiser(
payload["input"],
forward_params={"speaker_embeddings": speaker_embedding},
)
sf.write(file_path, speech["audio"], samplerate=speech["sampling_rate"])
with open(file_body_path, "w") as f:
json.dump(json.loads(body.decode("utf-8")), f)
return FileResponse(file_path)
def transcribe(file_path):