Add Replicate Badge and Web demo

2025-04-30 02:40:18 +00:00 · 2024-03-12 16:29:05 +00:00 · 2024-03-12 16:29:05 +00:00 · 2cd14b3a5d
commit 2cd14b3a5d
parent 8d4d9a6ccf
4 changed files with 121 additions and 1 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1,17 @@
 # The .dockerignore file excludes files from the container build process.
 #
 # https://docs.docker.com/engine/reference/builder/#dockerignore-file
 # Exclude Git files
 .git
 .github
 .gitignore
 # Exclude Python cache files
 __pycache__
 .mypy_cache
 .pytest_cache
 .ruff_cache
 # Exclude Python virtual environment
 /venv
--- a/README.md
+++ b/README.md
@ -17,7 +17,9 @@
  <a href="https://huggingface.co/deepseek-ai" target="_blank">
    <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-DeepSeek%20AI-ffc107?color=ffc107&logoColor=white" />
  </a>
-
+  <a href="https://replicate.com/lucataco/deepseek-vl-7b-base" target="_blank">
    <img src="https://replicate.com/lucataco/deepseek-vl-7b-base/badge" alt="Replicate"/>
  </a>
 </div>
--- a/cog.yaml
+++ b/cog.yaml
@ -0,0 +1,19 @@
 # Configuration for Cog ⚙️
 # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
 build:
  gpu: true
  python_version: "3.9"
  python_packages:
    - "accelerate==0.27.2"
    - "attrdict==2.0.1"
    - "einops==0.7.0"
    - "sentencepiece==0.2.0"
    - "torch==2.0.1"
    - "torchvision==0.15.2"
    - "transformers>=4.38.2"
    - "timm>=0.9.16"
    - "hf_transfer==0.1.6"
 # predict.py defines how predictions are run on your model
 predict: "predict.py:Predictor"
--- a/predict.py
+++ b/predict.py
@ -0,0 +1,82 @@
 # Prediction interface for Cog ⚙️
 # https://github.com/replicate/cog/blob/main/docs/python.md
 import os
 # Enable faster download speed via hf_transfer. huggingface_hub reads this
 # variable once, at import time, so it has to be set BEFORE transformers and
 # deepseek_vl (which pull in huggingface_hub) are imported; setting it
 # afterwards has no effect.
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 from threading import Thread
 import torch
 from cog import BasePredictor, Input, Path, ConcatenateIterator
 from deepseek_vl.utils.io import load_pil_images
 from transformers import AutoModelForCausalLM, TextIteratorStreamer
 from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
 # Hugging Face model id handed to from_pretrained().
 MODEL_NAME = "deepseek-ai/deepseek-vl-7b-base"
 # Directory handed to from_pretrained() as cache_dir.
 CACHE_DIR = "checkpoints"
 class Predictor(BasePredictor):
    """Cog predictor that answers a text prompt about a single image.

    The generated answer is streamed back piece by piece as it is decoded.
    """
    def setup(self) -> None:
        """Load the chat processor and the model once, then move the model to CUDA."""
        self.vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
            MODEL_NAME,
            cache_dir=CACHE_DIR
        )
        self.tokenizer = self.vl_chat_processor.tokenizer
        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.bfloat16,
            cache_dir=CACHE_DIR
        )
        self.vl_gpt = vl_gpt.to('cuda')
    @torch.inference_mode()
    def predict(
        self,
        image: Path = Input(description="Input image"),
        prompt: str = Input(description="Input prompt", default="Describe this image"),
        max_new_tokens: int = Input(description="Maximum number of tokens to generate", default=512)
    ) -> ConcatenateIterator[str]:
        """Run a single prediction on the model.

        Args:
            image: Path of the image the prompt refers to.
            prompt: User text; it is appended to the image placeholder.
            max_new_tokens: Upper bound on the number of generated tokens.

        Yields:
            Text fragments of the answer, in generation order.

        Raises:
            Exception: Whatever ``generate`` raised in the worker thread is
                re-raised here once the stream has been drained.
        """
        # Two-turn conversation: the user turn carries the image, the empty
        # assistant turn is the slot the model completes.
        conversation = [
            {
                "role": "User",
                "content": "<image_placeholder>"+prompt,
                "images": [str(image)]
            },
            {
                "role": "Assistant",
                "content": ""
            }
        ]
        # load images and prepare for inputs
        pil_images = load_pil_images(conversation)
        prepare_inputs = self.vl_chat_processor(
            conversations=conversation,
            images=pil_images,
            force_batchify=True
        ).to('cuda')
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        generation_kwargs = {
            "inputs_embeds": self.vl_gpt.prepare_inputs_embeds(**prepare_inputs),
            "attention_mask": prepare_inputs.attention_mask,
            # EOS doubles as the padding token.
            "pad_token_id": self.tokenizer.eos_token_id,
            "bos_token_id": self.tokenizer.bos_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
            "max_new_tokens": max_new_tokens,
            "do_sample": False,
            "use_cache": True,
            "streamer": streamer,
        }
        # Filled by the worker thread when generate() fails.
        failures = []
        def run_generation() -> None:
            """Run generate() and make sure a failure cannot stall the stream."""
            try:
                self.vl_gpt.language_model.generate(**generation_kwargs)
            except Exception as exc:
                failures.append(exc)
                # generate() only signals end-of-stream on success; without
                # this the consumer loop below would wait on the queue forever.
                streamer.end()
        # generate() blocks until it is done, so it runs in a worker thread
        # while this generator forwards the decoded text as it arrives.
        thread = Thread(target=run_generation)
        thread.start()
        for new_token in streamer:
            yield new_token
        thread.join()
        if failures:
            # Surface the worker's error instead of silently ending the output.
            raise failures[0]