Merge pull request #6 from chenxwh/main

Add Replicate demo and API
Daya Guo 2024-02-19 17:05:02 +08:00 committed by GitHub
commit 7c34ad4fa4
4 changed files with 184 additions and 1 deletion
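
The PR adds a Replicate demo and API for the DeepSeek-Math models. As a rough sketch of how a client might call the published model (not part of this commit; it assumes the replicate Python package is installed, REPLICATE_API_TOKEN is set in the environment, and that the cjwbw/deepseek-math-7b-base deployment linked from the README badge is live; replicate.run may also require a pinned ":<version>" suffix):

import replicate  # pip install replicate; reads REPLICATE_API_TOKEN from the environment

# The predictor returns a ConcatenateIterator, so the output streams token by token.
output = replicate.run(
    "cjwbw/deepseek-math-7b-base",  # badge target; a ":<version>" pin may be needed
    input={
        "text": "The integral of x^2 from 0 to 2 is",
        "max_new_tokens": 100,
        "temperature": 1.0,
        "top_k": 50,
        "top_p": 0.9,
    },
)
for token in output:
    print(token, end="", flush=True)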


@@ -18,7 +18,7 @@
<a href="https://huggingface.co/deepseek-ai" target="_blank">
  <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-DeepSeek%20AI-ffc107?color=ffc107&logoColor=white" />
</a>
<a href="https://replicate.com/cjwbw/deepseek-math-7b-base" target="_parent"><img src="https://replicate.com/cjwbw/deepseek-math-7b-base/badge" alt="Replicate"/></a>
</div>
<div align="center">

15
cog.yaml Normal file

@@ -0,0 +1,15 @@
# Configuration for Cog ⚙️
# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md

build:
  gpu: true
  python_version: "3.11"
  python_packages:
    - torch==2.0.1
    - torchvision==0.15.2
    - transformers==4.37.2
    - accelerate==0.27.0
    - hf_transfer

# predict.py defines how predictions are run on your model
predict: "replicate/predict.py:Predictor"

82
replicate/predict.py Normal file

@@ -0,0 +1,82 @@
# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md

import os
import time
from threading import Thread

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from transformers.generation.streamers import TextIteratorStreamer

from cog import BasePredictor, Input, ConcatenateIterator

# Enable faster download speed
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
CACHE_DIR = "model_cache"


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        model_name = "deepseek-ai/deepseek-math-7b-base"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            cache_dir=CACHE_DIR,
        )
        self.model.generation_config = GenerationConfig.from_pretrained(
            model_name, cache_dir=CACHE_DIR
        )
        self.model.generation_config.pad_token_id = (
            self.model.generation_config.eos_token_id
        )

    def predict(
        self,
        text: str = Input(
            description="Input text.",
            default="The integral of x^2 from 0 to 2 is",
        ),
        max_new_tokens: int = Input(
            description="The maximum number of tokens to generate, ignoring the number of tokens in the prompt.",
            default=100,
        ),
        temperature: float = Input(
            description="The value used to modulate the next token probabilities.",
            default=1,
        ),
        top_k: int = Input(
            description="The number of highest probability vocabulary tokens to keep for top-k filtering.",
            default=50,
        ),
        top_p: float = Input(
            description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
            default=0.9,
        ),
    ) -> ConcatenateIterator[str]:
        """Run a single prediction on the model"""
        inputs = self.tokenizer(text, return_tensors="pt")
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        with torch.inference_mode():
            thread = Thread(
                target=self.model.generate,
                kwargs=dict(
                    **inputs.to(self.model.device),
                    do_sample=True,
                    temperature=temperature,
                    top_p=top_p,
                    top_k=top_k,
                    max_new_tokens=max_new_tokens,
                    streamer=streamer,
                    use_cache=True,
                ),
            )
            thread.start()
            for new_token in streamer:
                yield new_token
            thread.join()
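
For a rough idea of how Cog drives the class above, the predictor can also be exercised directly from Python. This is a hypothetical local smoke test only, not part of this commit; it assumes a CUDA GPU, the dependencies listed in cog.yaml, and that it is run from the repository root. The file is loaded by path because the replicate/ directory would otherwise shadow the replicate pip package.

# Hypothetical local smoke test (not part of this PR); run from the repository root.
import importlib.util

# Load replicate/predict.py by file path to avoid a name clash with the replicate package.
spec = importlib.util.spec_from_file_location("predict_module", "replicate/predict.py")
predict_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(predict_module)

predictor = predict_module.Predictor()
predictor.setup()  # downloads the 7B weights into model_cache/ on first run
for token in predictor.predict(
    text="The integral of x^2 from 0 to 2 is",
    max_new_tokens=64,
    temperature=1.0,
    top_k=50,
    top_p=0.9,
):
    print(token, end="", flush=True)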


@@ -0,0 +1,86 @@
# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md

import os
import time
from threading import Thread

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from transformers.generation.streamers import TextIteratorStreamer

from cog import BasePredictor, Input, ConcatenateIterator

# Enable faster download speed
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
CACHE_DIR = "model_cache"


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        model_name = "deepseek-ai/deepseek-math-7b-instruct"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            cache_dir=CACHE_DIR,
        )
        self.model.generation_config = GenerationConfig.from_pretrained(
            model_name, cache_dir=CACHE_DIR
        )
        self.model.generation_config.pad_token_id = (
            self.model.generation_config.eos_token_id
        )

    def predict(
        self,
        text: str = Input(
            description="Input text.",
            default="what is the integral of x^2 from 0 to 2?\nPlease reason step by step, and put your final answer within \\boxed{}.",
        ),
        max_new_tokens: int = Input(
            description="The maximum number of tokens to generate, ignoring the number of tokens in the prompt.",
            default=100,
        ),
        temperature: float = Input(
            description="The value used to modulate the next token probabilities.",
            default=1,
        ),
        top_k: int = Input(
            description="The number of highest probability vocabulary tokens to keep for top-k filtering.",
            default=50,
        ),
        top_p: float = Input(
            description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
            default=0.9,
        ),
    ) -> ConcatenateIterator[str]:
        """Run a single prediction on the model"""
        messages = [{"role": "user", "content": text}]
        input_tensor = self.tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt"
        )
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        with torch.inference_mode():
            thread = Thread(
                target=self.model.generate,
                kwargs=dict(
                    input_ids=input_tensor.to(self.model.device),
                    do_sample=True,
                    temperature=temperature,
                    top_p=top_p,
                    top_k=top_k,
                    max_new_tokens=max_new_tokens,
                    streamer=streamer,
                    use_cache=True,
                ),
            )
            thread.start()
            for new_token in streamer:
                yield new_token
            thread.join()
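
Compared with the base predictor above, this variant wraps the prompt in the model's chat template before generation instead of tokenizing it as raw text. A small standalone sketch (tokenizer download assumed to succeed) shows what that wrapping produces:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-math-7b-instruct")
messages = [{"role": "user", "content": "what is the integral of x^2 from 0 to 2?"}]
# add_generation_prompt=True appends the assistant prefix so generation starts the reply.
prompt_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
# Decode to inspect the special tokens wrapped around the user turn.
print(tokenizer.decode(prompt_ids[0]))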