diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 0000000..d93c702
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,15 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  gpu: true
+  python_version: "3.11"
+  python_packages:
+    - torch==2.0.1
+    - torchvision==0.15.2
+    - transformers==4.37.2
+    - accelerate==0.27.0
+    - hf_transfer
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 0000000..4654b9a
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,82 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+import os
+import time
+from threading import Thread
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
+from transformers.generation.streamers import TextIteratorStreamer
+from cog import BasePredictor, Input, ConcatenateIterator
+
+# Enable faster download speed
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+CACHE_DIR = "model_cache"
+
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+
+        model_name = "deepseek-ai/deepseek-math-7b-base"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            cache_dir=CACHE_DIR,
+        )
+        self.model.generation_config = GenerationConfig.from_pretrained(
+            model_name, cache_dir=CACHE_DIR
+        )
+        self.model.generation_config.pad_token_id = (
+            self.model.generation_config.eos_token_id
+        )
+
+    def predict(
+        self,
+        text: str = Input(
+            description="Input text.",
+            default="The integral of x^2 from 0 to 2 is",
+        ),
+        max_new_tokens: int = Input(
+            description="The maximum number of tokens to generate, ignoring the number of tokens in the prompt.",
+            default=100,
+        ),
+        temperature: float = Input(
+            description="The value used to modulate the next token probabilities.",
+            default=1,
+        ),
+        top_k: int = Input(
+            description="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
+            default=50,
+        ),
+        top_p: float = Input(
+            description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
+            default=0.9,
+        ),
+    ) -> ConcatenateIterator[str]:
+        """Run a single prediction on the model"""
+
+        inputs = self.tokenizer(text, return_tensors="pt")
+        streamer = TextIteratorStreamer(
+            self.tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        with torch.inference_mode():
+            thread = Thread(
+                target=self.model.generate,
+                kwargs=dict(
+                    **inputs.to(self.model.device),
+                    do_sample=True,
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    max_new_tokens=max_new_tokens,
+                    streamer=streamer,
+                    use_cache=True
+                ),
+            )
+            thread.start()
+            for new_token in streamer:
+                yield new_token
+            thread.join()
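predict.py streams output by running model.generate on a background thread and yielding tokens from the TextIteratorStreamer as they arrive. As a rough local smoke test outside of Cog (not part of this diff; it assumes the pinned packages from cog.yaml are installed and a GPU is available), the same streaming path can be exercised directly:

```python
# Hypothetical smoke test for predict.py (not part of the diff).
# Assumes the pinned packages from cog.yaml are installed and a GPU is present.
from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads deepseek-math-7b-base into model_cache/ on first run
for token in predictor.predict(
    text="The integral of x^2 from 0 to 2 is",
    max_new_tokens=64,
    temperature=1.0,
    top_k=50,
    top_p=0.9,
):
    print(token, end="", flush=True)
```

Cog calls setup() once when the container starts and predict() once per request, so the deployed behavior follows the same loop.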
diff --git a/predict_instruct.py b/predict_instruct.py
new file mode 100644
index 0000000..4d73612
--- /dev/null
+++ b/predict_instruct.py
@@ -0,0 +1,86 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+import os
+import time
+from threading import Thread
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
+from transformers.generation.streamers import TextIteratorStreamer
+from cog import BasePredictor, Input, ConcatenateIterator
+
+# Enable faster download speed
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+CACHE_DIR = "model_cache"
+
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+
+        model_name = "deepseek-ai/deepseek-math-7b-instruct"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            cache_dir=CACHE_DIR,
+        )
+        self.model.generation_config = GenerationConfig.from_pretrained(
+            model_name, cache_dir=CACHE_DIR
+        )
+        self.model.generation_config.pad_token_id = (
+            self.model.generation_config.eos_token_id
+        )
+
+    def predict(
+        self,
+        text: str = Input(
+            description="Input text.",
+            default="what is the integral of x^2 from 0 to 2?\nPlease reason step by step, and put your final answer within \\boxed{}.",
+        ),
+        max_new_tokens: int = Input(
+            description="The maximum number of tokens to generate, ignoring the number of tokens in the prompt.",
+            default=100,
+        ),
+        temperature: float = Input(
+            description="The value used to modulate the next token probabilities.",
+            default=1,
+        ),
+        top_k: int = Input(
+            description="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
+            default=50,
+        ),
+        top_p: float = Input(
+            description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
+            default=0.9,
+        ),
+    ) -> ConcatenateIterator[str]:
+        """Run a single prediction on the model"""
+
+        messages = [{"role": "user", "content": text}]
+        input_tensor = self.tokenizer.apply_chat_template(
+            messages, add_generation_prompt=True, return_tensors="pt"
+        )
+        streamer = TextIteratorStreamer(
+            self.tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+
+        with torch.inference_mode():
+            thread = Thread(
+                target=self.model.generate,
+                kwargs=dict(
+                    input_ids=input_tensor.to(self.model.device),
+                    do_sample=True,
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    max_new_tokens=max_new_tokens,
+                    streamer=streamer,
+                    use_cache=True,
+                ),
+            )
+            thread.start()
+            for new_token in streamer:
+                yield new_token
+            thread.join()
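The only functional change relative to predict.py is that the instruct variant formats the request with the tokenizer's chat template before generating (and passes input_ids explicitly instead of unpacking the tokenizer output). A quick side-check of the prompt that wrapper produces, not part of the diff, might look like this:

```python
# Hypothetical inspection of the instruct model's chat template (not part of the diff).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-math-7b-instruct")
messages = [{"role": "user", "content": "what is the integral of x^2 from 0 to 2?"}]
# tokenize=False returns the formatted prompt string instead of token ids,
# so the User/Assistant wrapper the model was tuned on can be inspected directly.
print(tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False))
```

As configured, cog.yaml points at predict.py; to serve the instruct model instead, the predict: key would be switched to "predict_instruct.py:Predictor", and either predictor can then be exercised locally with cog predict -i text="...".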