mirror of
https://github.com/deepseek-ai/DeepSeek-VL
synced 2025-01-22 10:35:29 +00:00
feat: add multiple images (or in-context learning) conversation examples (#47)
Co-authored-by: Bo Liu <benjaminliu.eecs@gmail.com>
This commit is contained in:
parent
3c02b24219
commit
9bb02cc50d
68
.github/workflows/lint.yml
vendored
Normal file
68
.github/workflows/lint.yml
vendored
Normal file
@ -0,0 +1,68 @@
|
||||
name: Lint
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
# Allow triggering the workflow manually
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: "${{ github.workflow }}-${{ github.ref }}"
|
||||
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
env:
|
||||
CUDA_VERSION: "11.7"
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: "recursive"
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Set up Python 3.9
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.9"
|
||||
update-environment: true
|
||||
|
||||
- name: Upgrade pip
|
||||
run: |
|
||||
python -m pip install --upgrade pip setuptools wheel
|
||||
|
||||
- name: Install TorchOpt
|
||||
env:
|
||||
USE_FP16: "OFF"
|
||||
TORCH_CUDA_ARCH_LIST: "Auto"
|
||||
run: |
|
||||
python -m pip install torch numpy pybind11
|
||||
python -m pip install -vvv --no-build-isolation --editable '.[lint]'
|
||||
|
||||
- name: pre-commit
|
||||
run: |
|
||||
make pre-commit
|
||||
|
||||
- name: ruff
|
||||
run: |
|
||||
make ruff
|
||||
|
||||
- name: flake8
|
||||
run: |
|
||||
make flake8
|
||||
|
||||
- name: isort and black
|
||||
run: |
|
||||
make py-format
|
||||
|
||||
- name: addlicense
|
||||
run: |
|
||||
make addlicense
|
26
README.md
26
README.md
@ -132,18 +132,34 @@ tokenizer = vl_chat_processor.tokenizer
|
||||
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
|
||||
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
|
||||
|
||||
## single image conversation example
|
||||
conversation = [
|
||||
{
|
||||
"role": "User",
|
||||
"content": "<image_placeholder>Describe each stage of this image.",
|
||||
"images": ["./images/training_pipelines.jpg"]
|
||||
"images": ["./images/training_pipelines.jpg"],
|
||||
},
|
||||
{
|
||||
"role": "Assistant",
|
||||
"content": ""
|
||||
}
|
||||
{"role": "Assistant", "content": ""},
|
||||
]
|
||||
|
||||
## multiple images (or in-context learning) conversation example
|
||||
# conversation = [
|
||||
# {
|
||||
# "role": "User",
|
||||
# "content": "<image_placeholder>A dog wearing nothing in the foreground, "
|
||||
# "<image_placeholder>a dog wearing a santa hat, "
|
||||
# "<image_placeholder>a dog wearing a wizard outfit, and "
|
||||
# "<image_placeholder>what's the dog wearing?",
|
||||
# "images": [
|
||||
# "images/dog_a.png",
|
||||
# "images/dog_b.png",
|
||||
# "images/dog_c.png",
|
||||
# "images/dog_d.png",
|
||||
# ],
|
||||
# },
|
||||
# {"role": "Assistant", "content": ""}
|
||||
# ]
|
||||
|
||||
# load the images and prepare the inputs
|
||||
pil_images = load_pil_images(conversation)
|
||||
prepare_inputs = vl_chat_processor(
|
||||
|
BIN
images/dog_a.png
Normal file
BIN
images/dog_a.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 204 KiB |
BIN
images/dog_b.png
Normal file
BIN
images/dog_b.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 356 KiB |
BIN
images/dog_c.png
Normal file
BIN
images/dog_c.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 418 KiB |
BIN
images/dog_d.png
Normal file
BIN
images/dog_d.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 363 KiB |
18
inference.py
18
inference.py
@ -33,6 +33,7 @@ vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
|
||||
)
|
||||
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
|
||||
|
||||
# single image conversation example
|
||||
conversation = [
|
||||
{
|
||||
"role": "User",
|
||||
@ -42,6 +43,23 @@ conversation = [
|
||||
{"role": "Assistant", "content": ""},
|
||||
]
|
||||
|
||||
# multiple images (or in-context learning) conversation example
|
||||
# conversation = [
|
||||
# {
|
||||
# "role": "User",
|
||||
# "content": "<image_placeholder>A dog wearing nothing in the foreground, "
|
||||
# "<image_placeholder>a dog wearing a santa hat, "
|
||||
# "<image_placeholder>a dog wearing a wizard outfit, and "
|
||||
# "<image_placeholder>what's the dog wearing?",
|
||||
# "images": [
|
||||
# "images/dog_a.png",
|
||||
# "images/dog_b.png",
|
||||
# "images/dog_c.png",
|
||||
# "images/dog_d.png",
|
||||
# ],
|
||||
# },
|
||||
# {"role": "Assistant", "content": ""}
|
||||
# ]
|
||||
|
||||
# load the images and prepare the inputs
|
||||
pil_images = load_pil_images(conversation)
|
||||
|
@ -34,6 +34,20 @@ gradio = [
|
||||
"markdown==3.4.1",
|
||||
"SentencePiece==0.1.96"
|
||||
]
|
||||
lint = [
|
||||
"isort",
|
||||
"black[jupyter] >= 22.6.0",
|
||||
"pylint[spelling] >= 2.15.0",
|
||||
"flake8",
|
||||
"flake8-bugbear",
|
||||
"flake8-comprehensions",
|
||||
"flake8-docstrings",
|
||||
"flake8-pyi",
|
||||
"flake8-simplify",
|
||||
"ruff",
|
||||
"pyenchant",
|
||||
"pre-commit",
|
||||
]
|
||||
|
||||
[tool.setuptools]
|
||||
packages = {find = {exclude = ["images"]}}
|
||||
|
Loading…
Reference in New Issue
Block a user