voicecraft wip

This commit is contained in:
matatonic 2024-04-23 16:14:12 -04:00
parent bd3c7a601a
commit e8e8bdb8a6
4 changed files with 59 additions and 7 deletions

View File

@ -1,10 +1,7 @@
FROM ubuntu:22.04
ENV COQUI_TOS_AGREED=1
ENV PRELOAD_MODEL=xtts
RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip
apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip espeak-ng
#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices

View File

@ -1,4 +1,4 @@
#!/bin/sh
export COQUI_TOS_AGREED=1
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$PRELOAD_MODEL')"
#export COQUI_TOS_AGREED=1
#python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$PRELOAD_MODEL')"
./download_samples.sh

View File

@ -2,4 +2,21 @@ fastapi
uvicorn
piper-tts
onnxruntime-gpu
TTS
#TTS
torch==2.0.1 # this assumes your system is compatible with CUDA 11.7, otherwise checkout https://pytorch.org/get-started/previous-versions/#v201
git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft
# apt-get install espeak-ng  # (system package, not pip — backend for the phonemizer installed below)
#tensorboard==2.16.2
phonemizer==3.2.1
torchaudio==2.0.2
#pip install datasets==2.16.0
torchmetrics==0.11.1
# install MFA for getting forced-alignment, this could take a few minutes
montreal-forced-aligner==2.2.17
# NOTE: the conda-style `=` pins below are conda packages, not PyPI — install via conda, e.g.:
#   conda install -c conda-forge openfst=1.8.2 kaldi=5.5.1068
# conda install pocl # above gives an warning for installing pocl, not sure if really need this
# to run ipynb
#ipykernel --update-deps --force-reinstall

38
voicecraft.py Normal file
View File

@ -0,0 +1,38 @@
import argparse, pickle
import logging
import os, random
import numpy as np
import torch
import torchaudio
from data.tokenizer import (
AudioTokenizer,
TextTokenizer,
tokenize_audio,
tokenize_text
)
from models import voicecraft
import argparse, time, tqdm
class vo_wrapper():
    """Thin wrapper around a TTS model that renders text into a temporary wav.

    NOTE(review): relies on a ``TTS`` class that is not imported in this file
    (presumably Coqui ``from TTS.api import TTS``) — confirm the import before
    shipping; this is a WIP commit.
    """

    def __init__(self, model_name, device):
        # Keep the model id around for reference/debugging.
        self.model_name = model_name
        # Load the model and move it to the requested device (e.g. 'cuda').
        self.xtts = TTS(model_name=model_name, progress_bar=False).to(device)

    def tts(self, text, speaker_wav, speed):
        """Synthesize ``text`` and return an open OS file descriptor for the wav.

        The temp file's directory entry is unlinked before returning, so the
        audio data lives only as long as the returned descriptor stays open.
        The caller is responsible for ``os.close()``-ing it.

        :param text: text to synthesize (English is hard-coded below).
        :param speaker_wav: path to a reference speaker wav for voice cloning.
        :param speed: playback speed factor passed through to the model.
        :returns: an open file descriptor positioned wherever the writer left it.
        :raises: whatever ``tts_to_file`` raises; the temp fd/file are cleaned up.
        """
        # Local import: `tempfile` is not imported at module top in this file,
        # so the original code raised NameError here at runtime.
        import tempfile

        fd, file_path = tempfile.mkstemp(suffix='.wav')
        try:
            # `tts_to_file` returns the path it actually wrote to.
            file_path = self.xtts.tts_to_file(
                text,
                language='en',
                speaker_wav=speaker_wav,
                speed=speed,
                file_path=file_path,
            )
        except Exception:
            # Don't leak the descriptor or the on-disk file if synthesis fails.
            os.close(fd)
            os.unlink(file_path)
            raise
        # Remove the name; the open fd keeps the audio readable until closed.
        os.unlink(file_path)
        return fd