diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..4522d57
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,17 @@
+# The .dockerignore file excludes files from the container build process.
+#
+# https://docs.docker.com/engine/reference/builder/#dockerignore-file
+
+# Exclude Git files
+.git
+.github
+.gitignore
+
+# Exclude Python cache files
+__pycache__
+.mypy_cache
+.pytest_cache
+.ruff_cache
+
+# Exclude Python virtual environment
+/venv
diff --git a/README.md b/README.md
index 6d24eaf..6fc9f6d 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,9 @@
   Hugging Face
-
+
+  Replicate
+
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 0000000..97a33c3
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,19 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  gpu: true
+  python_version: "3.9"
+  python_packages:
+    - "accelerate==0.27.2"
+    - "attrdict==2.0.1"
+    - "einops==0.7.0"
+    - "sentencepiece==0.2.0"
+    - "torch==2.0.1"
+    - "torchvision==0.15.2"
+    - "transformers>=4.38.2"
+    - "timm>=0.9.16"
+    - "hf_transfer==0.1.6"
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 0000000..4954973
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,82 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+from cog import BasePredictor, Input, Path, ConcatenateIterator
+import os
+import torch
+from threading import Thread
+from deepseek_vl.utils.io import load_pil_images
+from transformers import AutoModelForCausalLM, TextIteratorStreamer
+from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
+
+# Enable faster download speed
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+MODEL_NAME = "deepseek-ai/deepseek-vl-7b-base"
+CACHE_DIR = "checkpoints"
+
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+        self.vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+            MODEL_NAME,
+            cache_dir=CACHE_DIR
+        )
+        self.tokenizer = self.vl_chat_processor.tokenizer
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            torch_dtype=torch.bfloat16,
+            cache_dir=CACHE_DIR
+        )
+        self.vl_gpt = vl_gpt.to('cuda')
+
+    @torch.inference_mode()
+    def predict(
+        self,
+        image: Path = Input(description="Input image"),
+        prompt: str = Input(description="Input prompt", default="Describe this image"),
+        max_new_tokens: int = Input(description="Maximum number of tokens to generate", default=512)
+    ) -> ConcatenateIterator[str]:
+        """Run a single prediction on the model"""
+        conversation = [
+            {
+                "role": "User",
+                "content": "<image_placeholder>" + prompt,
+                "images": [str(image)]
+            },
+            {
+                "role": "Assistant",
+                "content": ""
+            }
+        ]
+
+        # load images and prepare for inputs
+        pil_images = load_pil_images(conversation)
+        prepare_inputs = self.vl_chat_processor(
+            conversations=conversation,
+            images=pil_images,
+            force_batchify=True
+        ).to('cuda')
+
+        streamer = TextIteratorStreamer(
+            self.tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+
+        thread = Thread(
+            target=self.vl_gpt.language_model.generate,
+            kwargs={
+                "inputs_embeds": self.vl_gpt.prepare_inputs_embeds(**prepare_inputs),
+                "attention_mask": prepare_inputs.attention_mask,
+                "pad_token_id": self.tokenizer.eos_token_id,
+                "bos_token_id": self.tokenizer.bos_token_id,
+                "eos_token_id": self.tokenizer.eos_token_id,
+ "max_new_tokens": max_new_tokens, + "do_sample": False, + "use_cache": True, + "streamer": streamer, + }, + ) + thread.start() + for new_token in streamer: + yield new_token + thread.join()