mirror of https://github.com/matatonic/openedai-speech (synced 2025-06-26 18:16:32 +00:00)

commit e8e8bdb8a6 (parent bd3c7a601a): voicecraft wip
@@ -1,10 +1,7 @@
FROM ubuntu:22.04

ENV COQUI_TOS_AGREED=1
ENV PRELOAD_MODEL=xtts

RUN apt-get update && \
-    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip
+    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip espeak-ng

#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
@@ -1,4 +1,4 @@
#!/bin/sh
-export COQUI_TOS_AGREED=1
-python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$PRELOAD_MODEL')"
+#export COQUI_TOS_AGREED=1
+#python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$PRELOAD_MODEL')"
./download_samples.sh
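For reference, the disabled preload one-liner above expands to roughly the following standalone sketch. It reuses the ModelManager call from the original line and only adds a guard for an unset PRELOAD_MODEL (the variable set in the Dockerfile); treat it as an illustration, not the project's actual startup code.

# Sketch only: the commented-out preload step, expanded.
import os

from TTS.utils.manage import ModelManager

model = os.environ.get("PRELOAD_MODEL")
if model:
    # download the Coqui model into the local cache ahead of the first request
    ModelManager().download_model(model)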
@@ -2,4 +2,21 @@ fastapi
uvicorn
piper-tts
onnxruntime-gpu
-TTS
+#TTS
+
+torch==2.0.1 # this assumes your system is compatible with CUDA 11.7, otherwise check out https://pytorch.org/get-started/previous-versions/#v201
+git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft
+# apt-get install espeak-ng  # backend for the phonemizer installed below
+#tensorboard==2.16.2
+phonemizer==3.2.1
+torchaudio==2.0.2
+#pip install datasets==2.16.0
+torchmetrics==0.11.1
+# install MFA for getting forced-alignment, this could take a few minutes
+montreal-forced-aligner=2.2.17
+openfst=1.8.2
+kaldi=5.5.1068
+# conda install pocl  # the above gives a warning about pocl, not sure if it's really needed
+
+# to run ipynb
+#ipykernel --update-deps --force-reinstall
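A note on the block above: the montreal-forced-aligner, openfst and kaldi pins use conda's single-= syntax and are not installable by pip as written (they read like conda package specs for the forced-alignment toolchain), while espeak-ng is the system backend that phonemizer==3.2.1 needs for grapheme-to-phoneme conversion, which is why the Dockerfile hunk adds it. A minimal sketch of that pairing, assuming phonemizer's documented phonemize() helper; the example text is arbitrary.

# Sketch only: sanity-check that phonemizer can reach the espeak-ng backend.
from phonemizer import phonemize

phones = phonemize(
    "openedai speech",     # arbitrary example text
    language="en-us",      # espeak voice code
    backend="espeak",      # served by the espeak-ng binary installed via apt
    strip=True,
)
print(phones)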
voicecraft.py (new file)
@@ -0,0 +1,38 @@
import argparse, pickle
import logging
import os, random
import tempfile

import numpy as np
import torch
import torchaudio
from TTS.api import TTS  # Coqui TTS API, still used by the placeholder wrapper below

from data.tokenizer import (
    AudioTokenizer,
    TextTokenizer,
    tokenize_audio,
    tokenize_text
)

from models import voicecraft
import argparse, time, tqdm


# WIP: the data.tokenizer / models.voicecraft imports are not wired in yet;
# for now the wrapper still drives the Coqui XTTS API.
class vo_wrapper():
    def __init__(self, model_name, device):
        self.model_name = model_name
        self.xtts = TTS(model_name=model_name, progress_bar=False).to(device)

    def tts(self, text, speaker_wav, speed):
        # mkstemp returns an open file descriptor plus a path for the output wav
        tf, file_path = tempfile.mkstemp(suffix='.wav')

        file_path = self.xtts.tts_to_file(
            text,
            language='en',
            speaker_wav=speaker_wav,
            speed=speed,
            file_path=file_path,
        )

        # unlink the name; the audio stays readable through the open descriptor tf
        os.unlink(file_path)
        return tf
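A hypothetical caller of the wrapper above, for illustration only: the XTTS model identifier and the speaker sample path are placeholders that this commit does not define.

# Sketch only: exercising vo_wrapper as committed (it still calls Coqui XTTS).
import os

wrapper = vo_wrapper(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",  # placeholder model id
    device="cuda",
)
fd = wrapper.tts(
    "VoiceCraft support is still a work in progress.",
    speaker_wav="voices/sample.wav",  # placeholder speaker reference
    speed=1.0,
)

# mkstemp() handed back an open descriptor, so the generated wav is still
# readable here even though its path has already been unlinked.
with os.fdopen(fd, "rb") as wav:
    audio = wav.read()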