replicate

2025-06-26 18:16:20 +00:00 · 2024-02-11 14:45:44 +00:00 · 2024-02-11 14:45:44 +00:00 · 32b2faf06e
commit 32b2faf06e
parent db877abb91
3 changed files with 183 additions and 0 deletions
--- a/cog.yaml
+++ b/cog.yaml
@ -0,0 +1,15 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  gpu: true
+  python_version: "3.11"
+  python_packages:
+    - torch==2.0.1
+    - torchvision==0.15.2
+    - transformers==4.37.2
+    - accelerate==0.27.0
+    - hf_transfer
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
--- a/predict.py
+++ b/predict.py
@ -0,0 +1,82 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+import os
+import time
+from threading import Thread
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
+from transformers.generation.streamers import TextIteratorStreamer
+from cog import BasePredictor, Input, ConcatenateIterator
+
+# Enable hf_transfer-accelerated downloads. NOTE(review): huggingface_hub may read this variable only at import time and transformers is already imported above -- confirm this assignment still takes effect.
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+CACHE_DIR = "model_cache"  # passed as cache_dir to every from_pretrained() call in setup()
+
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+
+        model_name = "deepseek-ai/deepseek-math-7b-base"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            cache_dir=CACHE_DIR,
+        )
+        self.model.generation_config = GenerationConfig.from_pretrained(
+            model_name, cache_dir=CACHE_DIR
+        )
+        self.model.generation_config.pad_token_id = (
+            self.model.generation_config.eos_token_id
+        )
+
+    def predict(
+        self,
+        text: str = Input(
+            description="Input text.",
+            default="The integral of x^2 from 0 to 2 is",
+        ),
+        max_new_tokens: int = Input(
+            description="The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.",
+            default=100,
+        ),
+        temperature: float = Input(
+            description="The value used to modulate the next token probabilities.",
+            default=1,
+        ),
+        top_k: int = Input(
+            description="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
+            default=50,
+        ),
+        top_p: float = Input(
+            description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
+            default=0.9,
+        ),
+    ) -> ConcatenateIterator[str]:
+        """Run a single prediction on the model"""
+
+        inputs = self.tokenizer(text, return_tensors="pt")
+        streamer = TextIteratorStreamer(
+            self.tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        with torch.inference_mode():
+            thread = Thread(
+                target=self.model.generate,
+                kwargs=dict(
+                    **inputs.to(self.model.device),
+                    do_sample=True,
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    max_new_tokens=max_new_tokens,
+                    streamer=streamer,
+                    use_cache=True
+                ),
+            )
+            thread.start()
+            for new_token in streamer:
+                yield new_token
+            thread.join()
--- a/predict_instruct.py
+++ b/predict_instruct.py
@ -0,0 +1,86 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+import os
+import time
+from threading import Thread
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
+from transformers.generation.streamers import TextIteratorStreamer
+from cog import BasePredictor, Input, ConcatenateIterator
+
+# Enable hf_transfer-accelerated downloads. NOTE(review): huggingface_hub may read this variable only at import time and transformers is already imported above -- confirm this assignment still takes effect.
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+CACHE_DIR = "model_cache"  # passed as cache_dir to every from_pretrained() call in setup()
+
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+
+        model_name = "deepseek-ai/deepseek-math-7b-instruct"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            cache_dir=CACHE_DIR,
+        )
+        self.model.generation_config = GenerationConfig.from_pretrained(
+            model_name, cache_dir=CACHE_DIR
+        )
+        self.model.generation_config.pad_token_id = (
+            self.model.generation_config.eos_token_id
+        )
+
+    def predict(
+        self,
+        text: str = Input(
+            description="Input text.",
+            default="what is the integral of x^2 from 0 to 2?\nPlease reason step by step, and put your final answer within \boxed{}.",
+        ),
+        max_new_tokens: int = Input(
+            description="The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.",
+            default=100,
+        ),
+        temperature: float = Input(
+            description="The value used to modulate the next token probabilities.",
+            default=1,
+        ),
+        top_k: int = Input(
+            description="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
+            default=50,
+        ),
+        top_p: float = Input(
+            description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
+            default=0.9,
+        ),
+    ) -> ConcatenateIterator[str]:
+        """Run a single prediction on the model"""
+
+        messages = [{"role": "user", "content": text}]
+        input_tensor = self.tokenizer.apply_chat_template(
+            messages, add_generation_prompt=True, return_tensors="pt"
+        )
+        streamer = TextIteratorStreamer(
+            self.tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+
+        with torch.inference_mode():
+            thread = Thread(
+                target=self.model.generate,
+                kwargs=dict(
+                    input_ids=input_tensor.to(self.model.device),
+                    do_sample=True,
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    max_new_tokens=max_new_tokens,
+                    streamer=streamer,
+                    use_cache=True,
+                ),
+            )
+            thread.start()
+            for new_token in streamer:
+                yield new_token
+            thread.join()