mirror of https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00
0.13.0 final
This commit is contained in:
parent 756799529a
commit 34bf525c89
.github/workflows/build-docker.yml (vendored, 4 changes)
@@ -177,6 +177,8 @@ jobs:
       tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
       labels: version=${{ github.run_id }}
       platforms: linux/amd64,linux/arm64
+      build-args: |
+        USE_ROCM=1

   # For tagged releases, build and push the Docker image with the corresponding tag
   - name: Build and Push Docker Image (Tagged)
@@ -189,4 +191,6 @@ jobs:
       tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
       labels: version=${{ github.run_id }}
       platforms: linux/amd64,linux/arm64
+      build-args: |
+        USE_ROCM=1
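The two hunks above add the same `build-args` stanza to both the `latest` and tagged image builds, passing `USE_ROCM` through to the Dockerfile's `ARG USE_ROCM`. For reference, a rough local equivalent of what the workflow now does (the image tag here is illustrative, not taken from the workflow):

```shell
# select the ROCm requirements at build time, as the workflow now does
docker build --build-arg USE_ROCM=1 -t openedai-speech:rocm .
```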
Dockerfile

@@ -14,7 +14,7 @@ ARG USE_ROCM
 ENV USE_ROCM=${USE_ROCM}

 COPY requirements*.txt /app/
-RUN if [ ${USE_ROCM} = "1" ]; then mv /app/requirements-rocm.txt /app/requirements.txt; fi
+RUN if [ "${USE_ROCM}" = "1" ]; then mv /app/requirements-rocm.txt /app/requirements.txt; fi
 RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt

 COPY speech.py openedai.py say.py *.sh *.default.yaml README.md LICENSE /app/
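The only change in the hunk above is quoting `${USE_ROCM}` in the test. Since `ARG USE_ROCM` has no default, the variable can expand to nothing; a quick sketch of the difference under `sh`:

```shell
unset USE_ROCM
[ ${USE_ROCM} = "1" ]    # expands to `[ = "1" ]` -- prints "unary operator expected" and fails
[ "${USE_ROCM}" = "1" ]  # expands to `[ "" = "1" ]` -- quietly evaluates to false
```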
@@ -23,7 +23,6 @@ ARG PRELOAD_MODEL
 ENV PRELOAD_MODEL=${PRELOAD_MODEL}
 ENV TTS_HOME=voices
 ENV HF_HOME=voices
 ENV OPENEDAI_LOG_LEVEL=INFO
 ENV COQUI_TOS_AGREED=1

 CMD bash startup.sh
Dockerfile.min

@@ -10,12 +10,11 @@ RUN apt-get clean && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
 RUN mkdir -p voices config

-RUN --mount=type=cache,target=/root/.cache/pip pip install piper-tts==1.2.0 pyyaml fastapi uvicorn loguru numpy\<2
+COPY requirements*.txt /app/
+RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements-min.txt
 COPY speech.py openedai.py say.py *.sh *.default.yaml README.md LICENSE /app/

 ENV TTS_HOME=voices
 ENV HF_HOME=voices
 ENV OPENEDAI_LOG_LEVEL=INFO

 CMD bash startup.min.sh
README.md (122 changes)
@@ -27,11 +27,11 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s

 ## Recent Changes

-Version 0.13.0, 2024-06-22
+Version 0.13.0, 2024-06-25

 * Added [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
-* Initial prebuilt arm64 image support with MPS (Apple M-series, Raspberry Pi), thanks @JakeStevenson, @hchasens
-* Initial AMD GPU (rocm 5.7) support, set USE_ROCM=1 when building docker or use requirements-rocm.txt
+* Initial prebuilt arm64 image support (Apple M-series, Raspberry Pi - MPS is not supported in XTTS/torch), thanks @JakeStevenson, @hchasens
+* Initial attempt at AMD GPU (ROCm 5.7) support
 * Parler-tts support removed
 * Move the *.default.yaml to the root folder
 * Run the docker as a service by default (`restart: unless-stopped`)
@@ -86,63 +86,68 @@ Version: 0.7.3, 2024-03-20

 ## Installation instructions

-1. Copy the `sample.env` to `speech.env` (customize if needed)
+### Create a `speech.env` environment file
+
+Copy the `sample.env` to `speech.env` (customize if needed)
 ```bash
 cp sample.env speech.env
 ```
+#### AMD GPU (ROCm support)
+> If you have an AMD GPU and want to use ROCm, set `USE_ROCM=1` in the `speech.env` before building the docker image. You will need to `docker compose build` before running the container in the next step.
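A minimal sketch of that ROCm build-and-run flow, combining the note above with the `docker-compose.rocm.yml` file referenced later in this README diff:

```shell
echo "USE_ROCM=1" >> speech.env
docker compose -f docker-compose.rocm.yml build
docker compose -f docker-compose.rocm.yml up
```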
-2. Option: Docker (**recommended**) (prebuilt images are available)
-
-Run the server:
-```shell
-docker compose up
+#### Defaults
+```bash
+TTS_HOME=voices
+HF_HOME=voices
+#PRELOAD_MODEL=xtts
+#PRELOAD_MODEL=xtts_v2.0.2
+#EXTRA_ARGS=--log-level DEBUG
+#USE_ROCM=1
 ```
-> For a minimal docker image with only piper support (<1GB vs. 8GB) use `docker compose -f docker-compose.min.yml up`
-
-2. Option: Manual installation:
+### Option A: Manual installation
 ```shell
 # install curl and ffmpeg
 sudo apt install curl ffmpeg
 # Create & activate a new virtual environment (optional but recommended)
 python -m venv .venv
 source .venv/bin/activate
-# Install the Python requirements - use requirements-rocm.txt for AMD GPU (ROCm support)
+# Install the Python requirements
+# - use requirements-rocm.txt for AMD GPU (ROCm support)
+# - use requirements-min.txt for piper only (CPU only)
 pip install -r requirements.txt
 # run the server
 bash startup.sh
 ```

 > On first run, the voice models will be downloaded automatically. This might take a while depending on your network connection.
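Whichever install path is used, a quick smoke test once the server is up; this assumes the default port 8000 and the OpenAI-compatible `/v1/audio/speech` route the project mimics:

```shell
curl -s http://localhost:8000/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{"model": "tts-1", "input": "Hello world.", "voice": "alloy"}' \
  -o hello.mp3
```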
-## Usage
+### Option B: Docker Image (*recommended*)

-```
-usage: speech.py [-h] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST] [-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
-
-OpenedAI Speech API Server
-
-options:
-  -h, --help            show this help message and exit
-  --xtts_device XTTS_DEVICE
-                        Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
-  --preload PRELOAD     Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
-  -P PORT, --port PORT  Server tcp port (default: 8000)
-  -H HOST, --host HOST  Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
-  -L {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
-                        Set the log level (default: INFO)
+#### Nvidia GPU (cuda)
+
+```shell
+docker compose up
+```

-## API Documentation
+#### AMD GPU (ROCm support)

-* [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech)
-* [OpenAI API Reference](https://platform.openai.com/docs/api-reference/audio/createSpeech)
+```shell
+docker compose -f docker-compose.rocm.yml up
+```
+
+#### ARM64 (Apple M-series, Raspberry Pi)
+
+> XTTS only has CPU support here and will be very slow; you can use the Nvidia image for XTTS with CPU (slow), or use the piper only image (recommended)
+
+#### CPU only, No GPU (piper only)
+
+> For a minimal docker image with only piper support (<1GB vs. 8GB).
+
+```shell
+docker compose -f docker-compose.min.yml up
+```

-### Sample API Usage
+## Sample Usage

 You can use it like this:
@@ -193,52 +198,19 @@ python say.py -t "The quick brown fox jumped over the lazy dog." -p
 python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac
 ```

-```
-usage: say.py [-h] [-m MODEL] [-v VOICE] [-f {mp3,aac,opus,flac}] [-s SPEED] [-t TEXT] [-i INPUT] [-o OUTPUT] [-p]
-
-Text to speech using the OpenAI API
-
-options:
-  -h, --help            show this help message and exit
-  -m MODEL, --model MODEL
-                        The model to use (default: tts-1)
-  -v VOICE, --voice VOICE
-                        The voice of the speaker (default: alloy)
-  -f {mp3,aac,opus,flac}, --format {mp3,aac,opus,flac}
-                        The output audio format (default: mp3)
-  -s SPEED, --speed SPEED
-                        playback speed, 0.25-4.0 (default: 1.0)
-  -t TEXT, --text TEXT  Provide text to read on the command line (default: None)
-  -i INPUT, --input INPUT
-                        Read text from a file (default is to read from stdin) (default: None)
-  -o OUTPUT, --output OUTPUT
-                        The filename to save the output to (default: None)
-  -p, --playsound       Play the audio (default: False)
-```
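Per the `-i` default noted in the removed usage text, `say.py` reads stdin when no text is given, so input can also be piped; for example:

```shell
echo "The quick brown fox jumped over the lazy dog." | python say.py -p
```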
 You can also try the included `audio_reader.py` for listening to longer text and streamed input.

-```
-usage: audio_reader.py [-h] [-m MODEL] [-v VOICE] [-s SPEED]
-
-Text to speech player
-
-options:
-  -h, --help            show this help message and exit
-  -m MODEL, --model MODEL
-                        The OpenAI model (default: tts-1)
-  -v VOICE, --voice VOICE
-                        The voice to use (default: alloy)
-  -s SPEED, --speed SPEED
-                        How fast to read the audio (default: 1.0)
-```
-$ python audio_reader.py -s 2 < LICENSE
+Example usage:
+```bash
+python audio_reader.py -s 2 < LICENSE # read the software license - fast
+```

+## OpenAI API Documentation and Guide
+
+* [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech)
+* [OpenAI API Reference](https://platform.openai.com/docs/api-reference/audio/createSpeech)

 ## Custom Voices Howto

 ### Piper
config/config_files_will_go_here.txt (new file, empty)
docker-compose.rocm.yml

@@ -2,9 +2,9 @@ services:
   server:
     build:
       dockerfile: Dockerfile
+      args:
+        - USE_ROCM=1
     image: ghcr.io/matatonic/openedai-speech-rocm
-    environment:
-      - USE_ROCM=1
     env_file: speech.env
     ports:
       - "8000:8000"
requirements-min.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
+pyyaml
+fastapi
+uvicorn
+loguru
+numpy<2
+piper-tts==1.2.0
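This new file pins the piper-only dependency set used by `Dockerfile.min`. The same minimal install should also work outside Docker (a sketch, using the `startup.min.sh` script the Dockerfiles reference):

```shell
pip install -r requirements-min.txt
bash startup.min.sh
```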
requirements.txt

@@ -12,7 +12,7 @@ spacy==3.7.4
 # Re: https://github.com/pytorch/pytorch/issues/121834
 torch==2.2.2; sys_platform != "darwin"
 torchaudio; sys_platform != "darwin"
-# for MPS accelerated torch on Mac
+# for MPS accelerated torch on Mac - doesn't work yet, incomplete support in torch and torchaudio
 torch==2.2.2; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
 torchaudio==2.2.2; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
speech.py

@@ -204,7 +204,7 @@ async def generate_speech(request: GenerateSpeechRequest):

     return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type)


+# We return 'mps' but currently XTTS will not work with mps devices as the cuda support is incomplete
 def auto_torch_device():
     try:
         import torch

@@ -213,7 +213,6 @@ def auto_torch_device():
     except:
         return 'none'


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description='OpenedAI Speech API Server',
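If `auto_torch_device()` picks a device that doesn't actually work (e.g. `mps`, per the comment above), the `--xtts_device` option documented earlier in this diff can override it:

```shell
python speech.py --xtts_device cpu   # run XTTS on the CPU (slow)
python speech.py --xtts_device none  # skip XTTS entirely; piper serves all models
```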