feat: add multiple images (or in-context learning) conversation examples (#47)

Co-authored-by: Bo Liu <benjaminliu.eecs@gmail.com>
StevenLiuWen authored 2024-04-16 13:58:43 +08:00, committed by GitHub
parent 3c02b24219
commit 9bb02cc50d
8 changed files with 121 additions and 5 deletions

.github/workflows/lint.yml (new file, 68 lines added)

@@ -0,0 +1,68 @@
name: Lint

on:
  push:
    branches:
      - main
  pull_request:
  # Allow to trigger the workflow manually
  workflow_dispatch:

permissions:
  contents: read

concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

env:
  CUDA_VERSION: "11.7"

jobs:
  lint:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: "recursive"
          fetch-depth: 1

      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: "3.9"
          update-environment: true

      - name: Upgrade pip
        run: |
          python -m pip install --upgrade pip setuptools wheel

      - name: Install TorchOpt
        env:
          USE_FP16: "OFF"
          TORCH_CUDA_ARCH_LIST: "Auto"
        run: |
          python -m pip install torch numpy pybind11
          python -m pip install -vvv --no-build-isolation --editable '.[lint]'

      - name: pre-commit
        run: |
          make pre-commit

      - name: ruff
        run: |
          make ruff

      - name: flake8
        run: |
          make flake8

      - name: isort and black
        run: |
          make py-format

      - name: addlicense
        run: |
          make addlicense
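In short, the job is five Makefile targets run in sequence. A minimal sketch of reproducing the same checks locally, assuming the repository's Makefile exposes the targets the steps above invoke (illustrative only, not part of this commit):

# Local mirror of the CI lint job: run each Makefile target the workflow
# invokes, stopping at the first failure.
import subprocess

LINT_TARGETS = ["pre-commit", "ruff", "flake8", "py-format", "addlicense"]

for target in LINT_TARGETS:
    subprocess.run(["make", target], check=True)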

README.md

@@ -132,18 +132,34 @@ tokenizer = vl_chat_processor.tokenizer
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

## single image conversation example
conversation = [
    {
        "role": "User",
        "content": "<image_placeholder>Describe each stage of this image.",
        "images": ["./images/training_pipelines.jpg"],
    },
    {"role": "Assistant", "content": ""},
]

## multiple images (or in-context learning) conversation example
# conversation = [
#     {
#         "role": "User",
#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
#         "<image_placeholder>a dog wearing a santa hat, "
#         "<image_placeholder>a dog wearing a wizard outfit, and "
#         "<image_placeholder>what's the dog wearing?",
#         "images": [
#             "images/dog_a.png",
#             "images/dog_b.png",
#             "images/dog_c.png",
#             "images/dog_d.png",
#         ],
#     },
#     {"role": "Assistant", "content": ""}
# ]

# load images and prepare for inputs
pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
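The hunk is cut off mid-call by the diff context window. For orientation, a minimal sketch of how the snippet continues in this README's generation flow; treat the exact keyword arguments and method names as assumptions rather than part of this diff:

# Sketch only: completes the truncated processor call and runs generation.
prepare_inputs = vl_chat_processor(
    conversations=conversation,
    images=pil_images,
    force_batchify=True,
).to(vl_gpt.device)

# Fuse text tokens and image features into one embedding sequence.
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

# Generate the Assistant's reply with the underlying language model.
outputs = vl_gpt.language_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
    do_sample=False,
    use_cache=True,
)
answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(answer)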

BIN images/dog_a.png (new file, 204 KiB)
BIN images/dog_b.png (new file, 356 KiB)
BIN images/dog_c.png (new file, 418 KiB)
BIN images/dog_d.png (new file, 363 KiB)

inference.py

@@ -33,6 +33,7 @@ vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# single image conversation example
conversation = [
    {
        "role": "User",
@@ -42,6 +43,23 @@ conversation = [
    {"role": "Assistant", "content": ""},
]

# multiple images (or in-context learning) conversation example
# conversation = [
#     {
#         "role": "User",
#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
#         "<image_placeholder>a dog wearing a santa hat, "
#         "<image_placeholder>a dog wearing a wizard outfit, and "
#         "<image_placeholder>what's the dog wearing?",
#         "images": [
#             "images/dog_a.png",
#             "images/dog_b.png",
#             "images/dog_c.png",
#             "images/dog_d.png",
#         ],
#     },
#     {"role": "Assistant", "content": ""}
# ]

# load images and prepare for inputs
pil_images = load_pil_images(conversation)
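Both examples feed the same helper: the conversation structure is the single source of truth for which images to load. A hedged sketch of the contract `load_pil_images` is relied on for here, where each message may carry an "images" list of file paths; the repository's own helper may differ in detail:

# Illustrative sketch of the load_pil_images contract, not the repo's code.
from typing import Dict, List

import PIL.Image

def load_pil_images(conversations: List[Dict[str, str]]) -> List[PIL.Image.Image]:
    """Collect every image referenced by a conversation, in message order."""
    pil_images: List[PIL.Image.Image] = []
    for message in conversations:
        # Messages without an "images" key contribute nothing.
        for image_path in message.get("images", []):
            pil_image = PIL.Image.open(image_path)
            pil_images.append(pil_image.convert("RGB"))
    return pil_images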

pyproject.toml

@@ -34,6 +34,20 @@ gradio = [
    "markdown==3.4.1",
    "SentencePiece==0.1.96"
]

lint = [
    "isort",
    "black[jupyter] >= 22.6.0",
    "pylint[spelling] >= 2.15.0",
    "flake8",
    "flake8-bugbear",
    "flake8-comprehensions",
    "flake8-docstrings",
    "flake8-pyi",
    "flake8-simplify",
    "ruff",
    "pyenchant",
    "pre-commit",
]

[tool.setuptools]
packages = {find = {exclude = ["images"]}}
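The new lint extra is what the CI job installs with `pip install --editable '.[lint]'`. A minimal local equivalent, mirroring the workflow's install step rather than any documented command of this repository:

# Illustrative: editable install with the new lint extras, run from the
# repository root, exactly as the CI workflow above does.
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "pip", "install", "--editable", ".[lint]"],
    check=True,
)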