mirror of
https://github.com/deepseek-ai/DeepSeek-Math
synced 2024-11-21 19:17:41 +00:00
commit
7c34ad4fa4
@ -18,7 +18,7 @@
|
|||||||
<a href="https://huggingface.co/deepseek-ai" target="_blank">
|
<a href="https://huggingface.co/deepseek-ai" target="_blank">
|
||||||
<img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-DeepSeek%20AI-ffc107?color=ffc107&logoColor=white" />
|
<img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-DeepSeek%20AI-ffc107?color=ffc107&logoColor=white" />
|
||||||
</a>
|
</a>
|
||||||
|
<a href="https://replicate.com/cjwbw/deepseek-math-7b-base" target="_parent"><img src="https://replicate.com/cjwbw/deepseek-math-7b-base/badge" alt="Replicate"/></a>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div align="center">
|
<div align="center">
|
||||||
|
15
cog.yaml
Normal file
15
cog.yaml
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
# Configuration for Cog ⚙️
|
||||||
|
# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
|
||||||
|
|
||||||
|
build:
|
||||||
|
gpu: true
|
||||||
|
python_version: "3.11"
|
||||||
|
python_packages:
|
||||||
|
- torch==2.0.1
|
||||||
|
- torchvision==0.15.2
|
||||||
|
- transformers==4.37.2
|
||||||
|
- accelerate==0.27.0
|
||||||
|
- hf_transfer
|
||||||
|
|
||||||
|
# predict.py defines how predictions are run on your model
|
||||||
|
predict: "replicate/predict.py:Predictor"
|
82
replicate/predict.py
Normal file
82
replicate/predict.py
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
# Prediction interface for Cog ⚙️
|
||||||
|
# https://github.com/replicate/cog/blob/main/docs/python.md
|
||||||
|
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from threading import Thread
|
||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
|
||||||
|
from transformers.generation.streamers import TextIteratorStreamer
|
||||||
|
from cog import BasePredictor, Input, ConcatenateIterator
|
||||||
|
|
||||||
|
# Directory name passed as `cache_dir=` to every `from_pretrained` call below.
CACHE_DIR = "model_cache"

# Enable faster download speed
# NOTE(review): this runs after `transformers` has already been imported above;
# confirm the Hub client still honours the variable when it is set this late,
# otherwise move the assignment ahead of that import.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
||||||
|
|
||||||
|
|
||||||
|
class Predictor(BasePredictor):
    """Cog predictor that streams sampled completions from deepseek-math-7b-base."""

    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""

        model_name = "deepseek-ai/deepseek-math-7b-base"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            cache_dir=CACHE_DIR,
        )
        self.model.generation_config = GenerationConfig.from_pretrained(
            model_name, cache_dir=CACHE_DIR
        )
        # Use the end-of-sequence id as the padding id for generation.
        self.model.generation_config.pad_token_id = (
            self.model.generation_config.eos_token_id
        )

    def predict(
        self,
        text: str = Input(
            description="Input text.",
            default="The integral of x^2 from 0 to 2 is",
        ),
        max_new_tokens: int = Input(
            description="The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.",
            default=100,
        ),
        temperature: float = Input(
            description="The value used to modulate the next token probabilities.",
            default=1,
        ),
        top_k: int = Input(
            description="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
            default=50,
        ),
        top_p: float = Input(
            description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
            default=0.9,
        ),
    ) -> ConcatenateIterator[str]:
        """Run a single prediction on the model

        The prompt is tokenized as-is and continued with sampling. Generation
        runs in a background thread that feeds a ``TextIteratorStreamer``;
        this generator yields each text chunk as the streamer produces it.

        Args:
            text: Prompt to continue.
            max_new_tokens: Upper bound on generated tokens (prompt excluded).
            temperature: Sampling temperature forwarded to ``generate``.
            top_k: Top-k filtering value forwarded to ``generate``.
            top_p: Nucleus sampling value forwarded to ``generate``.

        Yields:
            Decoded text chunks, with the prompt and special tokens skipped.

        Raises:
            Exception: Whatever ``generate`` raised in the worker thread is
                re-raised here once the stream has been drained.
        """

        inputs = self.tokenizer(text, return_tensors="pt")
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        generate_kwargs = dict(
            **inputs.to(self.model.device),
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
            use_cache=True,
        )
        worker_errors = []

        def _generate() -> None:
            # inference_mode is thread-local, so it has to be entered inside
            # the worker thread to cover the generate() call.
            try:
                with torch.inference_mode():
                    self.model.generate(**generate_kwargs)
            except Exception as exc:
                worker_errors.append(exc)
                # Without an end signal the consumer loop below would block
                # forever on the streamer's queue.
                streamer.end()

        thread = Thread(target=_generate)
        thread.start()
        for new_token in streamer:
            yield new_token
        thread.join()

        # Surface a worker failure to the caller instead of ending silently.
        if worker_errors:
            raise worker_errors[0]
|
86
replicate/predict_instruct.py
Normal file
86
replicate/predict_instruct.py
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
# Prediction interface for Cog ⚙️
|
||||||
|
# https://github.com/replicate/cog/blob/main/docs/python.md
|
||||||
|
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from threading import Thread
|
||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
|
||||||
|
from transformers.generation.streamers import TextIteratorStreamer
|
||||||
|
from cog import BasePredictor, Input, ConcatenateIterator
|
||||||
|
|
||||||
|
# Directory name passed as `cache_dir=` to every `from_pretrained` call below.
CACHE_DIR = "model_cache"

# Enable faster download speed
# NOTE(review): this runs after `transformers` has already been imported above;
# confirm the Hub client still honours the variable when it is set this late,
# otherwise move the assignment ahead of that import.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
||||||
|
|
||||||
|
|
||||||
|
class Predictor(BasePredictor):
    """Cog predictor that streams chat replies from deepseek-math-7b-instruct."""

    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""

        model_name = "deepseek-ai/deepseek-math-7b-instruct"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            cache_dir=CACHE_DIR,
        )
        self.model.generation_config = GenerationConfig.from_pretrained(
            model_name, cache_dir=CACHE_DIR
        )
        # Use the end-of-sequence id as the padding id for generation.
        self.model.generation_config.pad_token_id = (
            self.model.generation_config.eos_token_id
        )

    def predict(
        self,
        text: str = Input(
            description="Input text.",
            # The backslash is doubled on purpose: a bare "\b" in a normal
            # string literal is the backspace escape, which would corrupt the
            # LaTeX "\boxed" command in the default prompt.
            default="what is the integral of x^2 from 0 to 2?\nPlease reason step by step, and put your final answer within \\boxed{}.",
        ),
        max_new_tokens: int = Input(
            description="The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.",
            default=100,
        ),
        temperature: float = Input(
            description="The value used to modulate the next token probabilities.",
            default=1,
        ),
        top_k: int = Input(
            description="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
            default=50,
        ),
        top_p: float = Input(
            description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
            default=0.9,
        ),
    ) -> ConcatenateIterator[str]:
        """Run a single prediction on the model

        The input is wrapped as a single user message, rendered through the
        tokenizer's chat template with a generation prompt appended, and then
        continued with sampling. Generation runs in a background thread that
        feeds a ``TextIteratorStreamer``; this generator yields each text
        chunk as the streamer produces it.

        Args:
            text: Content of the user message.
            max_new_tokens: Upper bound on generated tokens (prompt excluded).
            temperature: Sampling temperature forwarded to ``generate``.
            top_k: Top-k filtering value forwarded to ``generate``.
            top_p: Nucleus sampling value forwarded to ``generate``.

        Yields:
            Decoded text chunks, with the prompt and special tokens skipped.

        Raises:
            Exception: Whatever ``generate`` raised in the worker thread is
                re-raised here once the stream has been drained.
        """

        messages = [{"role": "user", "content": text}]
        input_tensor = self.tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt"
        )
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        generate_kwargs = dict(
            input_ids=input_tensor.to(self.model.device),
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
            use_cache=True,
        )
        worker_errors = []

        def _generate() -> None:
            # inference_mode is thread-local, so it has to be entered inside
            # the worker thread to cover the generate() call.
            try:
                with torch.inference_mode():
                    self.model.generate(**generate_kwargs)
            except Exception as exc:
                worker_errors.append(exc)
                # Without an end signal the consumer loop below would block
                # forever on the streamer's queue.
                streamer.end()

        thread = Thread(target=_generate)
        thread.start()
        for new_token in streamer:
            yield new_token
        thread.join()

        # Surface a worker failure to the caller instead of ending silently.
        if worker_errors:
            raise worker_errors[0]
|
Loading…
Reference in New Issue
Block a user