DeepSeek-VL/deepseek_vl/utils/io.py

# Copyright (c) 2023-2024 DeepSeek.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import json
from typing import Dict, List

import PIL.Image
import torch
from transformers import AutoModelForCausalLM

from deepseek_vl.models import MultiModalityCausalLM, VLChatProcessor


def load_pretrained_model(model_path: str):
    """Load the chat processor, its tokenizer and the multimodal model.

    Args:
        model_path (str): location handed to both ``from_pretrained`` calls.

    Returns:
        tuple: ``(tokenizer, vl_chat_processor, vl_gpt)`` where ``tokenizer``
            is the processor's ``tokenizer`` attribute and ``vl_gpt`` is the
            model cast to bfloat16, moved to CUDA and put in eval mode.

    """
    processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
    text_tokenizer = processor.tokenizer

    # The model is loaded with trust_remote_code=True, as in the original call.
    model: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
    )

    # Same chain as before, one step per line: dtype cast, device move, eval.
    model = model.to(torch.bfloat16)
    model = model.cuda()
    model = model.eval()

    return text_tokenizer, processor, model


def load_pil_images(conversations: List[Dict[str, str]]) -> List[PIL.Image.Image]:
    """Open every image referenced by a conversation and return them as RGB.

    Messages without an ``"images"`` key are skipped. Images are returned in
    the order they are listed, message by message.

    Args:
        conversations (List[Dict[str, str]]): the conversations with a list of messages. An example is :
            [
                {
                    "role": "User",
                    "content": "<image_placeholder>\\nExtract all information from this image and convert them into markdown format.",
                    "images": ["./examples/table_datasets.png"]
                },
                {"role": "Assistant", "content": ""},
            ]

    Returns:
        pil_images (List[PIL.Image.Image]): the list of PIL images, each
            converted to mode ``"RGB"``.

    """

    pil_images = []

    for message in conversations:
        if "images" not in message:
            continue

        for image_path in message["images"]:
            # Image.open() is lazy and keeps the underlying file open. The
            # context manager closes it once convert() has produced an
            # independent in-memory RGB copy, so no file handle is left
            # waiting for garbage collection.
            with PIL.Image.open(image_path) as pil_img:
                pil_images.append(pil_img.convert("RGB"))

    return pil_images


def load_json(filepath):
    """Read a JSON file and return the decoded Python object.

    Args:
        filepath: path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.

    """
    # Decode explicitly as UTF-8 rather than relying on the platform's default
    # encoding, which would break non-ASCII content on non-UTF-8 locales.
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)
chore: rebase commits 2024-03-08 06:34:44 +00:00			`# Copyright (c) 2023-2024 DeepSeek.`
			`#`
			`# Permission is hereby granted, free of charge, to any person obtaining a copy of`
			`# this software and associated documentation files (the "Software"), to deal in`
			`# the Software without restriction, including without limitation the rights to`
			`# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of`
			`# the Software, and to permit persons to whom the Software is furnished to do so,`
			`# subject to the following conditions:`
			`#`
			`# The above copyright notice and this permission notice shall be included in all`
			`# copies or substantial portions of the Software.`
			`#`
			`# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS`
			`# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR`
			`# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER`
			`# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN`
			`# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.`

			`import json`
			`from typing import Dict, List`

			`import PIL.Image`
			`import torch`
			`from transformers import AutoModelForCausalLM`

			`from deepseek_vl.models import MultiModalityCausalLM, VLChatProcessor`


			`def load_pretrained_model(model_path: str):`
			`vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)`
			`tokenizer = vl_chat_processor.tokenizer`

			`vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(`
			`model_path, trust_remote_code=True`
			`)`
			`vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()`

			`return tokenizer, vl_chat_processor, vl_gpt`


			`def load_pil_images(conversations: List[Dict[str, str]]) -> List[PIL.Image.Image]:`
			`"""`

			`Args:`
			`conversations (List[Dict[str, str]]): the conversations with a list of messages. An example is :`
			`[`
			`{`
			`"role": "User",`
			`"content": "<image_placeholder>\nExtract all information from this image and convert them into markdown format.",`
			`"images": ["./examples/table_datasets.png"]`
			`},`
			`{"role": "Assistant", "content": ""},`
			`]`

			`Returns:`
			`pil_images (List[PIL.Image.Image]): the list of PIL images.`

			`"""`

			`pil_images = []`

			`for message in conversations:`
			`if "images" not in message:`
			`continue`

			`for image_path in message["images"]:`
			`pil_img = PIL.Image.open(image_path)`
			`pil_img = pil_img.convert("RGB")`
			`pil_images.append(pil_img)`

			`return pil_images`


			`def load_json(filepath):`
			`with open(filepath, "r") as f:`
			`data = json.load(f)`
			`return data`