feat: add multiple images (or in-context learning) conversation examples (#47)

Co-authored-by: Bo Liu <benjaminliu.eecs@gmail.com>
StevenLiuWen authored 2024-04-16 13:58:43 +08:00, committed by GitHub
parent 3c02b24219
commit 9bb02cc50d
8 changed files with 121 additions and 5 deletions

.github/workflows/lint.yml (new file, 68 lines added)

@@ -0,0 +1,68 @@
name: Lint

on:
  push:
    branches:
      - main
  pull_request:
  # Allow to trigger the workflow manually
  workflow_dispatch:

permissions:
  contents: read

concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

env:
  CUDA_VERSION: "11.7"

jobs:
  lint:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: "recursive"
          fetch-depth: 1

      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: "3.9"
          update-environment: true

      - name: Upgrade pip
        run: |
          python -m pip install --upgrade pip setuptools wheel

      - name: Install TorchOpt
        env:
          USE_FP16: "OFF"
          TORCH_CUDA_ARCH_LIST: "Auto"
        run: |
          python -m pip install torch numpy pybind11
          python -m pip install -vvv --no-build-isolation --editable '.[lint]'

      - name: pre-commit
        run: |
          make pre-commit

      - name: ruff
        run: |
          make ruff

      - name: flake8
        run: |
          make flake8

      - name: isort and black
        run: |
          make py-format

      - name: addlicense
        run: |
          make addlicense
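In short, the job is five Makefile targets run in sequence. A minimal sketch of reproducing the same checks locally, assuming the repository's Makefile exposes the targets the steps above invoke (illustrative only, not part of this commit):

# Local mirror of the CI lint job: run each Makefile target the workflow
# invokes, stopping at the first failure.
import subprocess

LINT_TARGETS = ["pre-commit", "ruff", "flake8", "py-format", "addlicense"]

for target in LINT_TARGETS:
    subprocess.run(["make", target], check=True)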

README.md

@@ -132,18 +132,34 @@ tokenizer = vl_chat_processor.tokenizer
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

## single image conversation example
conversation = [
    {
        "role": "User",
        "content": "<image_placeholder>Describe each stage of this image.",
        "images": ["./images/training_pipelines.jpg"],
    },
    {"role": "Assistant", "content": ""},
]

## multiple images (or in-context learning) conversation example
# conversation = [
#     {
#         "role": "User",
#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
#         "<image_placeholder>a dog wearing a santa hat, "
#         "<image_placeholder>a dog wearing a wizard outfit, and "
#         "<image_placeholder>what's the dog wearing?",
#         "images": [
#             "images/dog_a.png",
#             "images/dog_b.png",
#             "images/dog_c.png",
#             "images/dog_d.png",
#         ],
#     },
#     {"role": "Assistant", "content": ""}
# ]

# load images and prepare for inputs
pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
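The hunk is cut off mid-call by the diff context window. For orientation, a minimal sketch of how the snippet continues in this README's generation flow; treat the exact keyword arguments and method names as assumptions rather than part of this diff:

# Sketch only: completes the truncated processor call and runs generation.
prepare_inputs = vl_chat_processor(
    conversations=conversation,
    images=pil_images,
    force_batchify=True,
).to(vl_gpt.device)

# Fuse text tokens and image features into one embedding sequence.
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

# Generate the Assistant's reply with the underlying language model.
outputs = vl_gpt.language_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
    do_sample=False,
    use_cache=True,
)
answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(answer)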

BIN images/dog_a.png (new file, 204 KiB)
BIN images/dog_b.png (new file, 356 KiB)
BIN images/dog_c.png (new file, 418 KiB)
BIN images/dog_d.png (new file, 363 KiB)

inference.py

@@ -33,6 +33,7 @@ vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# single image conversation example
conversation = [
    {
        "role": "User",
@@ -42,6 +43,23 @@ conversation = [
    {"role": "Assistant", "content": ""},
]

# multiple images (or in-context learning) conversation example
# conversation = [
#     {
#         "role": "User",
#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
#         "<image_placeholder>a dog wearing a santa hat, "
#         "<image_placeholder>a dog wearing a wizard outfit, and "
#         "<image_placeholder>what's the dog wearing?",
#         "images": [
#             "images/dog_a.png",
#             "images/dog_b.png",
#             "images/dog_c.png",
#             "images/dog_d.png",
#         ],
#     },
#     {"role": "Assistant", "content": ""}
# ]

# load images and prepare for inputs
pil_images = load_pil_images(conversation)
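Both examples feed the same helper: the conversation structure is the single source of truth for which images to load. A hedged sketch of the contract `load_pil_images` is relied on for here, where each message may carry an "images" list of file paths; the repository's own helper may differ in detail:

# Illustrative sketch of the load_pil_images contract, not the repo's code.
from typing import Dict, List

import PIL.Image

def load_pil_images(conversations: List[Dict[str, str]]) -> List[PIL.Image.Image]:
    """Collect every image referenced by a conversation, in message order."""
    pil_images: List[PIL.Image.Image] = []
    for message in conversations:
        # Messages without an "images" key contribute nothing.
        for image_path in message.get("images", []):
            pil_image = PIL.Image.open(image_path)
            pil_images.append(pil_image.convert("RGB"))
    return pil_images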

pyproject.toml

@@ -34,6 +34,20 @@ gradio = [
    "markdown==3.4.1",
    "SentencePiece==0.1.96"
]

lint = [
    "isort",
    "black[jupyter] >= 22.6.0",
    "pylint[spelling] >= 2.15.0",
    "flake8",
    "flake8-bugbear",
    "flake8-comprehensions",
    "flake8-docstrings",
    "flake8-pyi",
    "flake8-simplify",
    "ruff",
    "pyenchant",
    "pre-commit",
]

[tool.setuptools]
packages = {find = {exclude = ["images"]}}
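The new lint extra is what the CI job installs with `pip install --editable '.[lint]'`. A minimal local equivalent, mirroring the workflow's install step rather than any documented command of this repository:

# Illustrative: editable install with the new lint extras, run from the
# repository root, exactly as the CI workflow above does.
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "pip", "install", "--editable", ".[lint]"],
    check=True,
)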