# mini-omni/webui/omni_gradio.py

"""A simple web interactive chat demo based on gradio."""

import os
import time
import gradio as gr
import base64
import numpy as np
import requests


# URL of a remote inference endpoint, read from the API_URL environment
# variable. When it is unset (None), the model is loaded in this process
# instead and process_audio() uses the local client.
API_URL = os.getenv("API_URL", None)
# NOTE(review): `client` is not referenced anywhere in the visible code.
client = None

if API_URL is None:
    # Local mode only: the import sits inside the branch, so API mode never
    # imports the inference module.
    from inference import OmniInference
    # NOTE(review): arguments look like a checkpoint directory and a device
    # string -- confirm against OmniInference's signature.
    omni_client = OmniInference('./checkpoint', 'cuda:0')
    omni_client.warm_up()


# Chunk size (in bytes) requested per read when streaming the HTTP response.
OUT_CHUNK = 4096
# Sample rate yielded alongside every decoded audio chunk.
OUT_RATE = 24000
# Number of columns the decoded int16 samples are reshaped into.
OUT_CHANNELS = 1


def _iter_pcm_frames(chunks, rate, channels):
    """Decode raw int16 byte chunks into ``(rate, samples)`` pairs.

    Network reads are not guaranteed to end on a sample boundary, so any
    trailing bytes that do not complete a full frame (one int16 per channel)
    are carried over and prepended to the next chunk instead of being handed
    to ``np.frombuffer``, which would raise ``ValueError``.

    Args:
        chunks: Iterable of bytes-like objects holding int16 PCM data.
        rate: Sample rate to report with every yielded array.
        channels: Number of interleaved channels (columns of the output).

    Yields:
        Tuples ``(rate, samples)`` where ``samples`` is a writable int16
        array of shape ``(n_frames, channels)``. Empty chunks yield nothing;
        an incomplete frame left over at the end of the stream is dropped.
    """
    sample_size = np.dtype(np.int16).itemsize
    frame_size = sample_size * channels
    pending = b""
    for chunk in chunks:
        data = pending + bytes(chunk)
        usable = len(data) - len(data) % frame_size
        pending = data[usable:]
        if usable:
            samples = np.frombuffer(
                data, dtype=np.int16, count=usable // sample_size
            )
            # frombuffer returns a read-only view; astype gives the caller
            # its own writable copy, as the original code did.
            yield rate, samples.reshape(-1, channels).astype(np.int16)


def _timed_frames(chunks, started):
    """Yield decoded frames from *chunks*, logging latency of the first one.

    Args:
        chunks: Iterable of bytes-like int16 PCM chunks.
        started: ``time.time()`` timestamp taken when the request began.

    Yields:
        ``(OUT_RATE, samples)`` tuples as produced by ``_iter_pcm_frames``.
    """
    first = True
    for frame in _iter_pcm_frames(chunks, OUT_RATE, OUT_CHANNELS):
        if first:
            print(f"first chunk time cost: {time.time() - started:.3f}")
            first = False
        yield frame


def process_audio(audio):
    """Stream the model's spoken response for a recorded audio file.

    In API mode (``API_URL`` set) the file is base64-encoded, posted as JSON
    to the remote endpoint and the streamed response body is decoded. In
    local mode the in-process ``omni_client`` produces the byte stream.

    Args:
        audio: Path to the recorded audio file, or ``None`` when there is no
            recording (in which case nothing is yielded).

    Yields:
        ``(OUT_RATE, samples)`` tuples, ``samples`` being an int16 array of
        shape ``(n, OUT_CHANNELS)``.
    """
    filepath = audio
    print(f"filepath: {filepath}")
    if filepath is None:
        return

    if API_URL is not None:
        # Read and close the recording before the long-lived streaming
        # request so the file handle is not held open while audio plays.
        with open(filepath, "rb") as f:
            payload = {"audio": base64.b64encode(f.read()).decode("utf-8")}
        tik = time.time()
        with requests.post(API_URL, json=payload, stream=True) as response:
            try:
                # Without this an HTTP error body would be decoded as PCM.
                response.raise_for_status()
                chunks = response.iter_content(chunk_size=OUT_CHUNK)
                yield from _timed_frames(chunks, tik)
            except Exception as e:
                # Best effort: log and end the stream rather than crash the UI.
                print(f"error: {e}")
    else:
        tik = time.time()
        yield from _timed_frames(omni_client.run_AT_batch_stream(filepath), tik)


def main(port=None):
    """Build the Gradio interface around ``process_audio`` and launch it.

    Args:
        port: Optional server port. When given, the app binds to
            ``0.0.0.0`` on that port with sharing disabled; otherwise
            Gradio's launch defaults are used.
    """
    microphone = gr.Audio(type="filepath", label="Microphone")
    response_player = gr.Audio(label="Response", streaming=True, autoplay=True)

    demo = gr.Interface(
        process_audio,
        inputs=microphone,
        outputs=[response_player],
        title="Chat Mini-Omni Demo",
        live=True,
    )

    # Only override the launch settings when an explicit port was requested.
    launch_options = {}
    if port is not None:
        launch_options = {
            "share": False,
            "server_name": "0.0.0.0",
            "server_port": port,
        }
    demo.queue().launch(**launch_options)


if __name__ == "__main__":
    # Expose main() as a command-line interface (e.g. an optional --port flag).
    from fire import Fire

    Fire(main)
init mini-omni 2024-08-29 11:06:05 +00:00			`"""A simple web interactive chat demo based on gradio."""`

			`import os`
			`import time`
			`import gradio as gr`
			`import base64`
			`import numpy as np`
			`import requests`


			`API_URL = os.getenv("API_URL", None)`
			`client = None`

			`if API_URL is None:`
			`from inference import OmniInference`
			`omni_client = OmniInference('./checkpoint', 'cuda:0')`
			`omni_client.warm_up()`


			`OUT_CHUNK = 4096`
			`OUT_RATE = 24000`
			`OUT_CHANNELS = 1`


			`def process_audio(audio):`
			`filepath = audio`
			`print(f"filepath: {filepath}")`
			`if filepath is None:`
			`return`

			`cnt = 0`
			`if API_URL is not None:`
			`with open(filepath, "rb") as f:`
			`data = f.read()`
			`base64_encoded = str(base64.b64encode(data), encoding="utf-8")`
			`files = {"audio": base64_encoded}`
			`tik = time.time()`
			`with requests.post(API_URL, json=files, stream=True) as response:`
			`try:`
			`for chunk in response.iter_content(chunk_size=OUT_CHUNK):`
			`if chunk:`
			`# Convert chunk to numpy array`
			`if cnt == 0:`
			`print(f"first chunk time cost: {time.time() - tik:.3f}")`
			`cnt += 1`
			`audio_data = np.frombuffer(chunk, dtype=np.int16)`
			`audio_data = audio_data.reshape(-1, OUT_CHANNELS)`
			`yield OUT_RATE, audio_data.astype(np.int16)`

			`except Exception as e:`
			`print(f"error: {e}")`
			`else:`
Fix undefined variable 'tik' for non-API mode 2024-09-05 17:08:36 +00:00			`tik = time.time()`
init mini-omni 2024-08-29 11:06:05 +00:00			`for chunk in omni_client.run_AT_batch_stream(filepath):`
			`# Convert chunk to numpy array`
			`if cnt == 0:`
			`print(f"first chunk time cost: {time.time() - tik:.3f}")`
			`cnt += 1`
			`audio_data = np.frombuffer(chunk, dtype=np.int16)`
			`audio_data = audio_data.reshape(-1, OUT_CHANNELS)`
			`yield OUT_RATE, audio_data.astype(np.int16)`


			`def main(port=None):`

			`demo = gr.Interface(`
			`process_audio,`
			`inputs=gr.Audio(type="filepath", label="Microphone"),`
			`outputs=[gr.Audio(label="Response", streaming=True, autoplay=True)],`
			`title="Chat Mini-Omni Demo",`
			`live=True,`
			`)`
			`if port is not None:`
			`demo.queue().launch(share=False, server_name="0.0.0.0", server_port=port)`
			`else:`
			`demo.queue().launch()`


			`if __name__ == "__main__":`
			`import fire`

			`fire.Fire(main)`