mirror of
https://github.com/deepseek-ai/DeepSeek-VL2
synced 2025-01-22 12:25:32 +00:00
191 lines
7.0 KiB
Python
191 lines
7.0 KiB
Python
# Copyright (c) 2023-2024 DeepSeek.
|
|
#
|
|
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
# this software and associated documentation files (the "Software"), to deal in
|
|
# the Software without restriction, including without limitation the rights to
|
|
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
|
# the Software, and to permit persons to whom the Software is furnished to do so,
|
|
# subject to the following conditions:
|
|
#
|
|
# The above copyright notice and this permission notice shall be included in all
|
|
# copies or substantial portions of the Software.
|
|
#
|
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
|
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
|
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
from argparse import ArgumentParser
|
|
from typing import List, Dict
|
|
import torch
|
|
from transformers import AutoModelForCausalLM
|
|
import PIL.Image
|
|
|
|
from deepseek_vl2.models import DeepseekVLV2ForCausalLM, DeepseekVLV2Processor
|
|
from deepseek_vl2.serve.app_modules.utils import parse_ref_bbox
|
|
|
|
|
|
def load_pil_images(conversations: List[Dict[str, str]]) -> "List[PIL.Image.Image]":
    """Load every image referenced by a conversation as an RGB PIL image.

    Args:
        conversations (List[Dict[str, str]]): the conversation as a list of
            messages. A message may carry an "images" entry holding a list of
            image paths; messages without one (or with an empty/None one) are
            skipped. An example is:

                [
                    {
                        "role": "User",
                        "content": "<image>\nExtract all information from this image and convert them into markdown format.",
                        "images": ["./examples/table_datasets.png"]
                    },
                    {"role": "Assistant", "content": ""},
                ]

    Returns:
        pil_images (List[PIL.Image.Image]): the images converted to RGB, in
            the order their paths appear across the messages.
    """
    pil_images = []

    for message in conversations:
        # `or ()` treats a missing key, an empty list and None the same way.
        for image_path in message.get("images") or ():
            # Image.open() is lazy and keeps the file open. convert() reads
            # the pixel data and returns a separate image object, so the
            # source file can be closed as soon as the conversion is done.
            with PIL.Image.open(image_path) as source_img:
                pil_images.append(source_img.convert("RGB"))

    return pil_images
|
|
|
|
|
|
def main(args):
    """Run one in-context visual-grounding example and save the result.

    Loads the processor and the model from ``args.model_path``, moves the
    model to CUDA in bfloat16, builds a two-image grounding prompt, generates
    an answer and prints it together with the formatted prompt. The answer is
    then handed to ``parse_ref_bbox`` along with the last input image; if
    that call returns something other than None, the result is written to
    ``./vg.jpg``.

    Args:
        args: parsed command-line namespace. Reads ``args.model_path`` and
            ``args.chunk_size`` (``-1`` skips incremental prefilling).
    """
    dtype = torch.bfloat16

    # specify the path to the model
    model_path = args.model_path
    vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
    tokenizer = vl_chat_processor.tokenizer

    vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=dtype,
    )
    vl_gpt = vl_gpt.cuda().eval()

    # Two-image in-context grounding example: one "<image>" placeholder per
    # entry in "images".
    #
    # A single-image referring-expression prompt can be used instead, e.g.
    #   "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
    #   "images": ["./images/visual_grounding_1.jpeg"],
    conversation = [
        {
            "role": "<|User|>",
            "content": "<image>\n<image>\n<|grounding|>In the first image, an object within the red rectangle is marked. Locate the object of the same category in the second image.",
            "images": [
                "images/incontext_visual_grounding_1.jpeg",
                "images/icl_vg_2.jpeg"
            ],
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    print(f"len(pil_images) = {len(pil_images)}")

    # The prepared batch is read below through input_ids, images,
    # images_seq_mask, images_spatial_crop, attention_mask and sft_format.
    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=pil_images,
        force_batchify=True,
        system_prompt=""
    ).to(vl_gpt.device, dtype=dtype)

    with torch.no_grad():

        if args.chunk_size == -1:
            inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
            past_key_values = None
        else:
            # incremental_prefilling when using 40G GPU for vl2-small
            inputs_embeds, past_key_values = vl_gpt.incremental_prefilling(
                input_ids=prepare_inputs.input_ids,
                images=prepare_inputs.images,
                images_seq_mask=prepare_inputs.images_seq_mask,
                images_spatial_crop=prepare_inputs.images_spatial_crop,
                attention_mask=prepare_inputs.attention_mask,
                chunk_size=args.chunk_size
            )

        # run the model to get the response
        outputs = vl_gpt.generate(
            inputs_embeds=inputs_embeds,
            input_ids=prepare_inputs.input_ids,
            images=prepare_inputs.images,
            images_seq_mask=prepare_inputs.images_seq_mask,
            images_spatial_crop=prepare_inputs.images_spatial_crop,
            attention_mask=prepare_inputs.attention_mask,
            past_key_values=past_key_values,

            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,

            # sampling settings; for greedy decoding use do_sample=False
            do_sample=True,
            temperature=0.4,
            top_p=0.9,
            repetition_penalty=1.1,

            use_cache=True,
        )

        # Skip the first len(input_ids) positions of the output so only the
        # tokens after the prompt are decoded. Special tokens are kept
        # because the answer is passed to parse_ref_bbox below.
        prompt_len = len(prepare_inputs.input_ids[0])
        answer = tokenizer.decode(outputs[0][prompt_len:].cpu().tolist(), skip_special_tokens=False)
        print(f"{prepare_inputs['sft_format'][0]}", answer)

        # The last input image is the one passed to parse_ref_bbox.
        vg_image = parse_ref_bbox(answer, image=pil_images[-1])
        if vg_image is not None:
            vg_image.save("./vg.jpg", format="JPEG", quality=85)
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the example.
    parser = ArgumentParser()
    # argparse only applies a default to an optional argument, so the flag
    # is not marked required; passing --model_path explicitly works as before.
    parser.add_argument("--model_path", type=str,
                        default="deepseek-ai/deepseek-vl2",
                        help="model name or local path to the model")
    parser.add_argument("--chunk_size", type=int, default=-1,
                        help="chunk size for the model for prefilling. "
                             "When using 40G gpu for vl2-small, set a chunk_size for incremental_prefilling. "
                             "Otherwise, default value is -1, which means we do not use incremental_prefilling.")
    args = parser.parse_args()
    main(args)
|