From 32b2faf06e74eec42bf70a6e1805f561e23fe2db Mon Sep 17 00:00:00 2001
From: chenxwh
Date: Sun, 11 Feb 2024 14:45:44 +0000
Subject: [PATCH 1/3] replicate

---
 cog.yaml            | 15 ++++++++
 predict.py          | 82 ++++++++++++++++++++++++++++++++++++++++++
 predict_instruct.py | 86 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 183 insertions(+)
 create mode 100644 cog.yaml
 create mode 100644 predict.py
 create mode 100644 predict_instruct.py

diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 0000000..d93c702
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,15 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  gpu: true
+  python_version: "3.11"
+  python_packages:
+    - torch==2.0.1
+    - torchvision==0.15.2
+    - transformers==4.37.2
+    - accelerate==0.27.0
+    - hf_transfer
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 0000000..4654b9a
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,82 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+import os
+import time
+from threading import Thread
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
+from transformers.generation.streamers import TextIteratorStreamer
+from cog import BasePredictor, Input, ConcatenateIterator
+
+# Enable faster download speed
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+CACHE_DIR = "model_cache"
+
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+
+        model_name = "deepseek-ai/deepseek-math-7b-base"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            cache_dir=CACHE_DIR,
+        )
+        self.model.generation_config = GenerationConfig.from_pretrained(
+            model_name, cache_dir=CACHE_DIR
+        )
+        self.model.generation_config.pad_token_id = (
+            self.model.generation_config.eos_token_id
+        )
+
+    def predict(
+        self,
+        text: str = Input(
+            description="Input text.",
+            default="The integral of x^2 from 0 to 2 is",
+        ),
+        max_new_tokens: int = Input(
+            description="The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.",
+            default=100,
+        ),
+        temperature: float = Input(
+            description="The value used to modulate the next token probabilities.",
+            default=1,
+        ),
+        top_k: int = Input(
+            description="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
+            default=50,
+        ),
+        top_p: float = Input(
+            description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
+            default=0.9,
+        ),
+    ) -> ConcatenateIterator[str]:
+        """Run a single prediction on the model"""
+
+        inputs = self.tokenizer(text, return_tensors="pt")
+        streamer = TextIteratorStreamer(
+            self.tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        with torch.inference_mode():
+            thread = Thread(
+                target=self.model.generate,
+                kwargs=dict(
+                    **inputs.to(self.model.device),
+                    do_sample=True,
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    max_new_tokens=max_new_tokens,
+                    streamer=streamer,
+                    use_cache=True
+                ),
+            )
+            thread.start()
+            for new_token in streamer:
+                yield new_token
+            thread.join()
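The predictor above streams its output: generate() runs on a background thread while the TextIteratorStreamer is drained token by token and yielded through Cog's ConcatenateIterator. The snippet below is a minimal local smoke test for that flow, a sketch only: it assumes a CUDA machine with the packages pinned in cog.yaml and enough GPU memory for the 7B weights, and the file name run_local.py plus the sample arguments are illustrative, not part of this patch.

# run_local.py -- hypothetical local smoke test for the base-model predictor.
# Assumes the pinned packages from cog.yaml are installed and a GPU is available;
# the first setup() call downloads roughly 14 GB of weights into model_cache/.
from predict import Predictor

predictor = Predictor()
predictor.setup()

# predict() is a generator: tokens arrive as the background generate() thread
# pushes them through the TextIteratorStreamer.
for token in predictor.predict(
    text="The integral of x^2 from 0 to 2 is",
    max_new_tokens=100,
    temperature=1.0,
    top_k=50,
    top_p=0.9,
):
    print(token, end="", flush=True)
print()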
diff --git a/predict_instruct.py b/predict_instruct.py
new file mode 100644
index 0000000..4d73612
--- /dev/null
+++ b/predict_instruct.py
@@ -0,0 +1,86 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+import os
+import time
+from threading import Thread
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
+from transformers.generation.streamers import TextIteratorStreamer
+from cog import BasePredictor, Input, ConcatenateIterator
+
+# Enable faster download speed
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+CACHE_DIR = "model_cache"
+
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+
+        model_name = "deepseek-ai/deepseek-math-7b-instruct"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            cache_dir=CACHE_DIR,
+        )
+        self.model.generation_config = GenerationConfig.from_pretrained(
+            model_name, cache_dir=CACHE_DIR
+        )
+        self.model.generation_config.pad_token_id = (
+            self.model.generation_config.eos_token_id
+        )
+
+    def predict(
+        self,
+        text: str = Input(
+            description="Input text.",
+            default="what is the integral of x^2 from 0 to 2?\nPlease reason step by step, and put your final answer within \\boxed{}.",
+        ),
+        max_new_tokens: int = Input(
+            description="The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.",
+            default=100,
+        ),
+        temperature: float = Input(
+            description="The value used to modulate the next token probabilities.",
+            default=1,
+        ),
+        top_k: int = Input(
+            description="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
+            default=50,
+        ),
+        top_p: float = Input(
+            description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
+            default=0.9,
+        ),
+    ) -> ConcatenateIterator[str]:
+        """Run a single prediction on the model"""
+
+        messages = [{"role": "user", "content": text}]
+        input_tensor = self.tokenizer.apply_chat_template(
+            messages, add_generation_prompt=True, return_tensors="pt"
+        )
+        streamer = TextIteratorStreamer(
+            self.tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+
+        with torch.inference_mode():
+            thread = Thread(
+                target=self.model.generate,
+                kwargs=dict(
+                    input_ids=input_tensor.to(self.model.device),
+                    do_sample=True,
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    max_new_tokens=max_new_tokens,
+                    streamer=streamer,
+                    use_cache=True,
+                ),
+            )
+            thread.start()
+            for new_token in streamer:
+                yield new_token
+            thread.join()

From 555ba27526a0c8714d0e2a12a44c69edb6abc304 Mon Sep 17 00:00:00 2001
From: chenxwh
Date: Sun, 11 Feb 2024 14:51:06 +0000
Subject: [PATCH 2/3] replicate

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4e364a3..f8df02c 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 Hugging Face
-
+ Replicate
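predict_instruct.py differs from predict.py only in how the prompt is built: the raw text is wrapped in a single user message and passed through the tokenizer's chat template before generation. The sketch below shows one way to inspect that preprocessing step in isolation; it assumes the chat template bundled with the deepseek-ai/deepseek-math-7b-instruct tokenizer, so treat the printed prompt as whatever that template happens to produce rather than a guaranteed format.

# Hypothetical inspection of the prompt-building step used by predict_instruct.py.
# Assumes the chat template shipped with the deepseek-ai/deepseek-math-7b-instruct
# tokenizer; the rendered string is whatever that template produces.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-math-7b-instruct")
messages = [{"role": "user", "content": "what is the integral of x^2 from 0 to 2?"}]

# tokenize=False returns the templated prompt as a plain string for inspection.
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False
)
print(prompt)

# The predictor itself asks for tensors directly, which is what generate() consumes.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
print(input_ids.shape)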
From a0fdfa268258d4104aeb3a97a6adc84ee8e8e988 Mon Sep 17 00:00:00 2001
From: chenxwh
Date: Mon, 12 Feb 2024 21:30:24 +0000
Subject: [PATCH 3/3] replicate

---
 README.md                                            | 2 +-
 cog.yaml                                              | 2 +-
 predict.py => replicate/predict.py                    | 0
 predict_instruct.py => replicate/predict_instruct.py  | 0
 4 files changed, 2 insertions(+), 2 deletions(-)
 rename predict.py => replicate/predict.py (100%)
 rename predict_instruct.py => replicate/predict_instruct.py (100%)

diff --git a/README.md b/README.md
index f8df02c..e96f1f5 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 Hugging Face
- Replicate
+ Replicate
diff --git a/cog.yaml b/cog.yaml
index d93c702..75bddf9 100644
--- a/cog.yaml
+++ b/cog.yaml
@@ -12,4 +12,4 @@ build:
     - hf_transfer
 
 # predict.py defines how predictions are run on your model
-predict: "predict.py:Predictor"
+predict: "replicate/predict.py:Predictor"
diff --git a/predict.py b/replicate/predict.py
similarity index 100%
rename from predict.py
rename to replicate/predict.py
diff --git a/predict_instruct.py b/replicate/predict_instruct.py
similarity index 100%
rename from predict_instruct.py
rename to replicate/predict_instruct.py
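With the final layout, cog.yaml serves replicate/predict.py; switching the predict entry to "replicate/predict_instruct.py:Predictor" would serve the instruct variant instead, since Cog wires in one predictor at a time. Once a model built from this config is pushed to Replicate, it could be called from the Python client roughly as sketched below. The model slug is a placeholder (the patch does not pin it down), REPLICATE_API_TOKEN must be set in the environment, and the way replicate.run() returns iterator output is an assumption about the client library rather than something shown in this diff.

# Hypothetical client-side call; "your-username/deepseek-math-7b-base" is a
# placeholder slug and must be replaced with the actual Replicate model name.
import replicate

output = replicate.run(
    "your-username/deepseek-math-7b-base",
    input={
        # These keys mirror the Input() fields declared in replicate/predict.py.
        "text": "The integral of x^2 from 0 to 2 is",
        "max_new_tokens": 100,
        "temperature": 1.0,
        "top_k": 50,
        "top_p": 0.9,
    },
)

# Because the predictor returns a ConcatenateIterator, the output arrives as a
# sequence of text chunks; joining them reconstructs the full completion.
print("".join(output))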