mirror of https://github.com/deepseek-ai/DeepSeek-VL2 (synced 2025-01-22 12:25:32 +00:00)
# Copyright (c) 2023-2024 DeepSeek.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from argparse import ArgumentParser
from typing import List, Dict

import torch
from transformers import AutoModelForCausalLM

import PIL.Image

from deepseek_vl.models import DeepseekVLV2ForCausalLM, DeepseekVLV2Processor
from deepseek_vl.serve.app_modules.utils import parse_ref_bbox

def load_pil_images(conversations: List[Dict[str, str]]) -> List[PIL.Image.Image]:
    """
    Load the PIL images referenced by the messages of a conversation.

    Args:
        conversations (List[Dict[str, str]]): the conversations with a list of messages. An example is:
            [
                {
                    "role": "User",
                    "content": "<image>\nExtract all information from this image and convert them into markdown format.",
                    "images": ["./examples/table_datasets.png"]
                },
                {"role": "Assistant", "content": ""},
            ]

    Returns:
        pil_images (List[PIL.Image.Image]): the list of PIL images.
    """

    pil_images = []

    for message in conversations:
        if "images" not in message:
            continue

        for image_path in message["images"]:
            pil_img = PIL.Image.open(image_path)
            pil_img = pil_img.convert("RGB")
            pil_images.append(pil_img)

    return pil_images
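
# Illustrative usage of load_pil_images (a sketch, not part of the original script;
# the image path is a placeholder rather than a file shipped with the repo):
#
#     conversation = [
#         {"role": "<|User|>", "content": "<image>\nDescribe this image.",
#          "images": ["./images/some_image.png"]},
#         {"role": "<|Assistant|>", "content": ""},
#     ]
#     pil_images = load_pil_images(conversation)  # list of RGB PIL.Image.Image objects
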

def main(args):

    dtype = torch.bfloat16

    # specify the path to the model
    model_path = args.model_path
    vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
    tokenizer = vl_chat_processor.tokenizer

    vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=dtype
    )
    vl_gpt = vl_gpt.cuda().eval()
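
    # Optional sketch (not in the original script): to run without a GPU, something
    # along these lines should work, at the cost of speed. CPU inference in bfloat16
    # may be slow or unsupported on some machines, so float32 is used instead; the
    # dtype passed to prepare_inputs.to(...) below would need the same adjustment.
    #
    #     if not torch.cuda.is_available():
    #         vl_gpt = AutoModelForCausalLM.from_pretrained(
    #             model_path, trust_remote_code=True, torch_dtype=torch.float32
    #         ).eval()
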
    # single image conversation example
    conversation = [
        {
            "role": "<|User|>",
            "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
            "images": ["./images/visual_grounding.jpeg"],
        },
        {"role": "<|Assistant|>", "content": ""},
    ]
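
    # Note on the prompt above: wrapping a phrase in <|ref|>...<|/ref|> asks the model
    # for visual grounding, i.e. the reply is expected to carry box coordinates for the
    # referenced object, which parse_ref_bbox() below turns into a drawn image. This
    # description is inferred from how the answer is post-processed in this script.
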
    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=pil_images,
        force_batchify=True,
        system_prompt=""
    ).to(vl_gpt.device, dtype=dtype)
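
    # The processor returns a batched structure; this script relies on (at least) its
    # attention_mask and sft_format fields further below. No other field names are
    # assumed here.
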
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vl_gpt.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=1024,

            # greedy decoding
            do_sample=False,
            # repetition_penalty=1.1,

            # sampling-based decoding (uncomment to use instead of greedy decoding)
            # do_sample=True,
            # temperature=1.0,
            # top_p=0.9,
            # repetition_penalty=1.1,

            use_cache=True,
        )
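
        # Optional sketch (not part of the original script): to stream tokens to stdout
        # as they are generated, transformers' TextStreamer can be passed to generate().
        # This assumes the tokenizer here behaves like a standard Hugging Face tokenizer.
        #
        #     from transformers import TextStreamer
        #     streamer = TextStreamer(tokenizer, skip_prompt=True)
        #     outputs = vl_gpt.generate(..., streamer=streamer)
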
        answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=False)
        print(f"{prepare_inputs['sft_format'][0]}", answer)

        vg_image = parse_ref_bbox(answer, image=pil_images[0])
        if vg_image is not None:
            vg_image.save("./vg.jpg", format="JPEG", quality=85)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True,
                        default="deepseek-ai/deepseek-vl2-27b-moe",
                        help="model name or local path to the model")
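    # Note: because required=True, the default value above is never actually applied;
    # --model_path must always be passed on the command line.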
    args = parser.parse_args()
    main(args)
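
# Example invocation (assuming this script is saved as inference.py; the checkpoint
# name is the default declared above -- substitute a local path or another
# DeepSeek-VL2 checkpoint as appropriate):
#
#     python inference.py --model_path deepseek-ai/deepseek-vl2-27b-moe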