replicate

This commit is contained in:
chenxwh 2024-02-11 14:45:44 +00:00
parent db877abb91
commit 32b2faf06e
3 changed files with 183 additions and 0 deletions

15
cog.yaml Normal file
View File

@ -0,0 +1,15 @@
# Configuration for Cog ⚙️
# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
# Build-time settings for the container image.
build:
# The model is loaded onto the GPU, so build with GPU support.
gpu: true
python_version: "3.11"
# Python dependencies installed into the image (all pinned except hf_transfer).
python_packages:
- torch==2.0.1
- torchvision==0.15.2
- transformers==4.37.2
- accelerate==0.27.0
# Unpinned; the predictors opt into it via HF_HUB_ENABLE_HF_TRANSFER.
- hf_transfer
# predict.py defines how predictions are run on your model
# Only predict.py (the base model) is wired up here; to serve the instruct
# variant instead, point this at "predict_instruct.py:Predictor".
predict: "predict.py:Predictor"

82
predict.py Normal file
View File

@ -0,0 +1,82 @@
# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md
import os
import time
from threading import Thread

# Enable faster download speed.
# This must be set BEFORE transformers (and therefore huggingface_hub) is
# imported: huggingface_hub reads HF_HUB_ENABLE_HF_TRANSFER once at import
# time, so assigning it afterwards has no effect on downloads.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import torch  # noqa: E402
from transformers import (  # noqa: E402
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
)
from transformers.generation.streamers import TextIteratorStreamer  # noqa: E402
from cog import BasePredictor, ConcatenateIterator, Input  # noqa: E402

# Directory (relative to the working directory) where model weights and
# tokenizer files are cached.
CACHE_DIR = "model_cache"
class Predictor(BasePredictor):
    """Cog predictor that streams text completions from deepseek-math-7b-base."""

    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        model_name = "deepseek-ai/deepseek-math-7b-base"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            cache_dir=CACHE_DIR,
        )
        self.model.generation_config = GenerationConfig.from_pretrained(
            model_name, cache_dir=CACHE_DIR
        )
        # Reuse the EOS token id as the padding token id for generation.
        self.model.generation_config.pad_token_id = (
            self.model.generation_config.eos_token_id
        )

    def predict(
        self,
        text: str = Input(
            description="Input text.",
            default="The integral of x^2 from 0 to 2 is",
        ),
        max_new_tokens: int = Input(
            description="The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.",
            default=100,
        ),
        temperature: float = Input(
            description="The value used to modulate the next token probabilities.",
            default=1,
        ),
        top_k: int = Input(
            description="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
            default=50,
        ),
        top_p: float = Input(
            description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
            default=0.9,
        ),
    ) -> ConcatenateIterator[str]:
        """Run a single prediction on the model.

        Generation runs in a background thread and the decoded text is
        yielded chunk by chunk as the streamer produces it.

        Raises:
            Exception: whatever ``model.generate`` raised in the background
                thread, re-raised here once the stream has been drained.
        """
        inputs = self.tokenizer(text, return_tensors="pt")
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        generation_kwargs = dict(
            **inputs.to(self.model.device),
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
            use_cache=True,
        )
        failures = []

        def _generate() -> None:
            """Run generation in the worker thread, never leaving the stream open."""
            try:
                # inference_mode is thread-local, so it has to be entered in
                # the thread that actually executes generate().
                with torch.inference_mode():
                    self.model.generate(**generation_kwargs)
            except Exception as exc:
                failures.append(exc)
                # Push the stop signal so the consumer loop below terminates
                # instead of blocking forever on the streamer queue.
                streamer.end()

        thread = Thread(target=_generate)
        thread.start()
        for new_token in streamer:
            yield new_token
        thread.join()
        if failures:
            # Surface the worker's error to the caller rather than silently
            # returning a truncated completion.
            raise failures[0]

86
predict_instruct.py Normal file
View File

@ -0,0 +1,86 @@
# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md
import os
import time
from threading import Thread

# Enable faster download speed.
# This must be set BEFORE transformers (and therefore huggingface_hub) is
# imported: huggingface_hub reads HF_HUB_ENABLE_HF_TRANSFER once at import
# time, so assigning it afterwards has no effect on downloads.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import torch  # noqa: E402
from transformers import (  # noqa: E402
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
)
from transformers.generation.streamers import TextIteratorStreamer  # noqa: E402
from cog import BasePredictor, ConcatenateIterator, Input  # noqa: E402

# Directory (relative to the working directory) where model weights and
# tokenizer files are cached.
CACHE_DIR = "model_cache"
class Predictor(BasePredictor):
    """Cog predictor that streams chat replies from deepseek-math-7b-instruct."""

    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        model_name = "deepseek-ai/deepseek-math-7b-instruct"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            cache_dir=CACHE_DIR,
        )
        self.model.generation_config = GenerationConfig.from_pretrained(
            model_name, cache_dir=CACHE_DIR
        )
        # Reuse the EOS token id as the padding token id for generation.
        self.model.generation_config.pad_token_id = (
            self.model.generation_config.eos_token_id
        )

    def predict(
        self,
        text: str = Input(
            description="Input text.",
            # The backslash is escaped on purpose: a bare "\b" in a normal
            # string literal is a backspace character, which would corrupt
            # the LaTeX "\boxed{}" instruction sent to the model.
            default="what is the integral of x^2 from 0 to 2?\nPlease reason step by step, and put your final answer within \\boxed{}.",
        ),
        max_new_tokens: int = Input(
            description="The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.",
            default=100,
        ),
        temperature: float = Input(
            description="The value used to modulate the next token probabilities.",
            default=1,
        ),
        top_k: int = Input(
            description="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
            default=50,
        ),
        top_p: float = Input(
            description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
            default=0.9,
        ),
    ) -> ConcatenateIterator[str]:
        """Run a single prediction on the model.

        The input text is wrapped as a single user message, rendered with the
        tokenizer's chat template, and generated in a background thread; the
        decoded reply is yielded chunk by chunk as the streamer produces it.

        Raises:
            Exception: whatever ``model.generate`` raised in the background
                thread, re-raised here once the stream has been drained.
        """
        messages = [{"role": "user", "content": text}]
        input_tensor = self.tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt"
        )
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        generation_kwargs = dict(
            input_ids=input_tensor.to(self.model.device),
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
            use_cache=True,
        )
        failures = []

        def _generate() -> None:
            """Run generation in the worker thread, never leaving the stream open."""
            try:
                # inference_mode is thread-local, so it has to be entered in
                # the thread that actually executes generate().
                with torch.inference_mode():
                    self.model.generate(**generation_kwargs)
            except Exception as exc:
                failures.append(exc)
                # Push the stop signal so the consumer loop below terminates
                # instead of blocking forever on the streamer queue.
                streamer.end()

        thread = Thread(target=_generate)
        thread.start()
        for new_token in streamer:
            yield new_token
        thread.join()
        if failures:
            # Surface the worker's error to the caller rather than silently
            # returning a truncated reply.
            raise failures[0]