diff --git a/README.md b/README.md
index d362e9c..350dcac 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Mini-Omni is an open-source multimodal large language model that can **hear, tal
 
 ✅ **Talking while thinking**, with the ability to generate text and audio at the same time.
 
-✅ **Streaming audio outupt** capabilities.
+✅ **Streaming audio output** capabilities.
 
 ✅ With "Audio-to-Text" and "Audio-to-Audio" **batch inference** to further boost the performance.
 
diff --git a/inference.py b/inference.py
index 4d721d0..d184925 100644
--- a/inference.py
+++ b/inference.py
@@ -399,7 +399,7 @@ class OmniInference:
         model = self.model
 
         with self.fabric.init_tensor():
-            model.set_kv_cache(batch_size=2)
+            model.set_kv_cache(batch_size=2,device=self.device)
 
         mel, leng = load_audio(audio_path)
         audio_feature, input_ids = get_input_ids_whisper_ATBatch(mel, leng, self.whispermodel, self.device)
diff --git a/server.py b/server.py
index 5740613..c6e5d98 100644
--- a/server.py
+++ b/server.py
@@ -46,9 +46,9 @@ def create_app():
     return server.server
 
 
-def serve(ip='0.0.0.0', port=60808):
+def serve(ip='0.0.0.0', port=60808, device='cuda:0'):
 
-    OmniChatServer(ip, port=port, run_app=True)
+    OmniChatServer(ip, port=port,run_app=True, device=device)
 
 
 if __name__ == "__main__":
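
Taken together, these changes thread a `device` argument from the `serve` entry point down to the KV-cache allocation, so the cache is placed on the same device the model runs on instead of the framework default. A minimal usage sketch follows; the `serve(ip, port, device)` signature is taken from the `server.py` hunk above, while the specific device string `"cuda:1"` and the assumption that `OmniChatServer` forwards `device` to `OmniInference` are illustrative, not confirmed by this diff:

```python
# Hedged usage sketch: start the chat server pinned to a chosen GPU.
# Assumes server.py can be imported and exposes serve(ip, port, device)
# as shown in the diff above; "cuda:1" is a hypothetical choice.
from server import serve

if __name__ == "__main__":
    # With the patched set_kv_cache(batch_size=2, device=...), the KV cache
    # is allocated on the same device as the rest of the inference stack.
    serve(ip="0.0.0.0", port=60808, device="cuda:1")
```

Keeping `device='cuda:0'` as the default preserves the previous single-GPU behavior, while multi-GPU hosts can now point the server at another card without editing the inference code.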