diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..4522d57
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,17 @@
+# The .dockerignore file excludes files from the container build process.
+#
+# https://docs.docker.com/engine/reference/builder/#dockerignore-file
+
+# Exclude Git files
+.git
+.github
+.gitignore
+
+# Exclude Python cache files
+__pycache__
+.mypy_cache
+.pytest_cache
+.ruff_cache
+
+# Exclude Python virtual environment
+/venv
diff --git a/README.md b/README.md
index 6d24eaf..6fc9f6d 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,9 @@
-
+
+
+
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 0000000..97a33c3
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,19 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  gpu: true
+  python_version: "3.9"
+  python_packages:
+    - "accelerate==0.27.2"
+    - "attrdict==2.0.1"
+    - "einops==0.7.0"
+    - "sentencepiece==0.2.0"
+    - "torch==2.0.1"
+    - "torchvision==0.15.2"
+    - "transformers>=4.38.2"
+    - "timm>=0.9.16"
+    - "hf_transfer==0.1.6"
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 0000000..4954973
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,82 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+from cog import BasePredictor, Input, Path, ConcatenateIterator
+import os
+import torch
+from threading import Thread
+from deepseek_vl.utils.io import load_pil_images
+from transformers import AutoModelForCausalLM, TextIteratorStreamer
+from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
+
+# Enable faster model downloads from the Hugging Face Hub via hf_transfer
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+MODEL_NAME = "deepseek-ai/deepseek-vl-7b-base"
+CACHE_DIR = "checkpoints"
+
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+        self.vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+            MODEL_NAME,
+            cache_dir=CACHE_DIR
+        )
+        self.tokenizer = self.vl_chat_processor.tokenizer
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            torch_dtype=torch.bfloat16,
+            cache_dir=CACHE_DIR
+        )
+        self.vl_gpt = vl_gpt.to('cuda')
+
+    @torch.inference_mode()
+    def predict(
+        self,
+        image: Path = Input(description="Input image"),
+        prompt: str = Input(description="Input prompt", default="Describe this image"),
+        max_new_tokens: int = Input(description="Maximum number of tokens to generate", default=512)
+    ) -> ConcatenateIterator[str]:
+        """Run a single prediction on the model"""
+        conversation = [
+            {
+                "role": "User",
+ "content": ""+prompt,
+ "images": [str(image)]
+ },
+ {
+ "role": "Assistant",
+ "content": ""
+ }
+ ]
+
+        # load the image and prepare the model inputs
+        pil_images = load_pil_images(conversation)
+        prepare_inputs = self.vl_chat_processor(
+            conversations=conversation,
+            images=pil_images,
+            force_batchify=True
+        ).to('cuda')
+
+        streamer = TextIteratorStreamer(
+            self.tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+
+        thread = Thread(  # run generation in a background thread so tokens stream via TextIteratorStreamer
+            target=self.vl_gpt.language_model.generate,
+            kwargs={
+                "inputs_embeds": self.vl_gpt.prepare_inputs_embeds(**prepare_inputs),
+                "attention_mask": prepare_inputs.attention_mask,
+                "pad_token_id": self.tokenizer.eos_token_id,
+                "bos_token_id": self.tokenizer.bos_token_id,
+                "eos_token_id": self.tokenizer.eos_token_id,
+                "max_new_tokens": max_new_tokens,
+                "do_sample": False,
+                "use_cache": True,
+                "streamer": streamer,
+            },
+        )
+        thread.start()
+        for new_token in streamer:
+            yield new_token
+        thread.join()
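
The predictor streams its output: `generate` runs on a background thread and `TextIteratorStreamer` yields decoded text pieces, which `predict` re-yields through Cog's `ConcatenateIterator`. Once the image is built (`cog build`), a prediction would typically be run with `cog predict -i image=@/path/to/image.jpg -i prompt="Describe this image"`. A minimal local smoke test outside of Cog might look like the sketch below (hypothetical `demo.jpg` path; assumes a CUDA GPU, the dependencies from cog.yaml, and disk space for the deepseek-vl-7b-base weights):

    # smoke_test.py - hypothetical local check, not part of this change
    from cog import Path

    from predict import Predictor

    predictor = Predictor()
    predictor.setup()  # downloads deepseek-ai/deepseek-vl-7b-base into ./checkpoints on first use

    # predict() returns an iterator, so tokens can be printed as they stream in
    for token in predictor.predict(
        image=Path("demo.jpg"),  # hypothetical input image
        prompt="Describe this image",
        max_new_tokens=128,
    ):
        print(token, end="", flush=True)
    print()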