mirror of https://github.com/deepseek-ai/DeepSeek-VL
synced 2024-11-21 10:57:39 +00:00

feat: add multiple images (or in-context learning) conversation examples (#47)
Co-authored-by: Bo Liu <benjaminliu.eecs@gmail.com>

parent 3c02b24219
commit 9bb02cc50d
.github/workflows/lint.yml (new file, +68)
@@ -0,0 +1,68 @@
+name: Lint
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  # Allow to trigger the workflow manually
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: "${{ github.workflow }}-${{ github.ref }}"
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+env:
+  CUDA_VERSION: "11.7"
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+          fetch-depth: 1
+
+      - name: Set up Python 3.9
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.9"
+          update-environment: true
+
+      - name: Upgrade pip
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
+
+      - name: Install DeepSeek-VL
+        env:
+          USE_FP16: "OFF"
+          TORCH_CUDA_ARCH_LIST: "Auto"
+        run: |
+          python -m pip install torch numpy pybind11
+          python -m pip install -vvv --no-build-isolation --editable '.[lint]'
+
+      - name: pre-commit
+        run: |
+          make pre-commit
+
+      - name: ruff
+        run: |
+          make ruff
+
+      - name: flake8
+        run: |
+          make flake8
+
+      - name: isort and black
+        run: |
+          make py-format
+
+      - name: addlicense
+        run: |
+          make addlicense
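For contributors, the CI steps above can be reproduced locally. A minimal sketch, assuming only what the workflow itself shows (a Makefile with pre-commit, ruff, flake8, py-format, and addlicense targets, and the package installed as '.[lint]'); the script is hypothetical and not part of this commit:

# run_lint.py - hypothetical local mirror of the CI lint job above
import subprocess
import sys

def main() -> int:
    # same targets, same order as the workflow's steps
    for target in ("pre-commit", "ruff", "flake8", "py-format", "addlicense"):
        print(f"==> make {target}")
        result = subprocess.run(["make", target])
        if result.returncode != 0:
            return result.returncode  # fail fast, like a CI step
    return 0

if __name__ == "__main__":
    sys.exit(main())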
README.md (+21, -5)
@@ -132,18 +132,34 @@ tokenizer = vl_chat_processor.tokenizer
 vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
+## single image conversation example
 conversation = [
     {
         "role": "User",
         "content": "<image_placeholder>Describe each stage of this image.",
-        "images": ["./images/training_pipelines.jpg"]
+        "images": ["./images/training_pipelines.jpg"],
     },
-    {
-        "role": "Assistant",
-        "content": ""
-    }
+    {"role": "Assistant", "content": ""},
 ]
 
+## multiple images (or in-context learning) conversation example
+# conversation = [
+#     {
+#         "role": "User",
+#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+#         "<image_placeholder>a dog wearing a santa hat, "
+#         "<image_placeholder>a dog wearing a wizard outfit, and "
+#         "<image_placeholder>what's the dog wearing?",
+#         "images": [
+#             "images/dog_a.png",
+#             "images/dog_b.png",
+#             "images/dog_c.png",
+#             "images/dog_d.png",
+#         ],
+#     },
+#     {"role": "Assistant", "content": ""}
+# ]
+
 # load images and prepare for inputs
 pil_images = load_pil_images(conversation)
 prepare_inputs = vl_chat_processor(
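The hunk is truncated mid-call; for orientation, here is a sketch of how prepare_inputs is consumed further down the README's example. The argument names (conversations, images, force_batchify) and the prepare_inputs_embeds / language_model.generate calls follow the repository's published example, but treat the exact signatures here as assumptions rather than verbatim API:

# sketch of the downstream usage (argument names are assumptions)
prepare_inputs = vl_chat_processor(
    conversations=conversation,
    images=pil_images,
    force_batchify=True,
).to(vl_gpt.device)

# encode the images and splice them into the text embedding sequence
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

outputs = vl_gpt.language_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
    do_sample=False,
    use_cache=True,
)
answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(answer)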
BIN  images/dog_a.png (new file, 204 KiB; binary file not shown)
BIN  images/dog_b.png (new file, 356 KiB; binary file not shown)
BIN  images/dog_c.png (new file, 418 KiB; binary file not shown)
BIN  images/dog_d.png (new file, 363 KiB; binary file not shown)
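These four files are the images referenced by the new multi-image example (images/dog_a.png through images/dog_d.png): three captioned in-context demonstrations and one query image.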
inference.py (+18)
@@ -33,6 +33,7 @@ vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
 )
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
+# single image conversation example
 conversation = [
     {
         "role": "User",
@@ -42,6 +43,23 @@ conversation = [
     {"role": "Assistant", "content": ""},
 ]
 
+# multiple images (or in-context learning) conversation example
+# conversation = [
+#     {
+#         "role": "User",
+#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+#         "<image_placeholder>a dog wearing a santa hat, "
+#         "<image_placeholder>a dog wearing a wizard outfit, and "
+#         "<image_placeholder>what's the dog wearing?",
+#         "images": [
+#             "images/dog_a.png",
+#             "images/dog_b.png",
+#             "images/dog_c.png",
+#             "images/dog_d.png",
+#         ],
+#     },
+#     {"role": "Assistant", "content": ""}
+# ]
 # load images and prepare for inputs
 pil_images = load_pil_images(conversation)
 prepare_inputs = vl_chat_processor(
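Both files route the conversation's "images" lists through load_pil_images before calling the processor. As an illustration of the expected behavior, a minimal sketch (an assumption about the helper's contract, not the repository's exact implementation):

from typing import Dict, List

import PIL.Image

def load_pil_images_sketch(conversations: List[Dict]) -> List[PIL.Image.Image]:
    """Collect every image a conversation references, in message order."""
    pil_images = []
    for message in conversations:
        for image_path in message.get("images", []):
            # normalize to RGB so downstream tensors have a fixed channel count
            pil_images.append(PIL.Image.open(image_path).convert("RGB"))
    return pil_images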
pyproject.toml (+14)
@@ -34,6 +34,20 @@ gradio = [
     "markdown==3.4.1",
     "SentencePiece==0.1.96"
 ]
+lint = [
+    "isort",
+    "black[jupyter] >= 22.6.0",
+    "pylint[spelling] >= 2.15.0",
+    "flake8",
+    "flake8-bugbear",
+    "flake8-comprehensions",
+    "flake8-docstrings",
+    "flake8-pyi",
+    "flake8-simplify",
+    "ruff",
+    "pyenchant",
+    "pre-commit",
+]
 
 [tool.setuptools]
 packages = {find = {exclude = ["images"]}}
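These extras back the make targets the new workflow runs; the workflow installs them with python -m pip install -vvv --no-build-isolation --editable '.[lint]'.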