DeepSeek-VL/cli_chat.py

# Copyright (c) 2023-2024 DeepSeek.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# -*- coding: utf-8 -*-
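
"""A simple command-line chat demo for DeepSeek-VL.

Loads a pretrained DeepSeek-VL model and runs an interactive terminal chat
loop in which the user can reference images via the image token; answers
are streamed to stdout token by token.
"""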
import argparse
import os
import sys
from threading import Thread

import torch
from PIL import Image
from transformers import TextIteratorStreamer

from deepseek_vl.utils.io import load_pretrained_model


def load_image(image_file):
    image = Image.open(image_file).convert("RGB")
    return image


def get_help_message(image_token):
    help_msg = (
        f"\t\t DeepSeek-VL-Chat is a chatbot that can answer questions based on the given image. Enjoy it! \n"
        f"Usage: \n"
        f"    1. type `exit` to quit. \n"
        f"    2. type `{image_token}` to indicate there is an image. You can enter multiple images, "
        f"e.g '{image_token} is a dot, {image_token} is a cat, and what is it in {image_token}?'. "
        f"When you type `{image_token}`, the chatbot will ask you to input the image file path. \n"
        f"    3. type `help` to get the help messages. \n"
        f"    4. type `new` to start a new conversation. \n"
        f"    Here is an example, you can type: '<image_placeholder>Describe the image.'\n"
    )

    return help_msg


@torch.inference_mode()
def response(
    args, conv, pil_images, tokenizer, vl_chat_processor, vl_gpt, generation_config
):
    prompt = conv.get_prompt()
    prepare_inputs = vl_chat_processor(
        prompt=prompt, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # run the image encoder to get the image embeddings
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # stream generated text token by token instead of waiting for the full answer
    streamer = TextIteratorStreamer(
        tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_config["inputs_embeds"] = inputs_embeds
    generation_config["attention_mask"] = prepare_inputs.attention_mask
    generation_config["streamer"] = streamer

    # run generation in a background thread so the streamer can be consumed here
    thread = Thread(target=vl_gpt.language_model.generate, kwargs=generation_config)
    thread.start()

    yield from streamer


def get_user_input(hint: str):
    user_input = ""
    while user_input == "":
        try:
            user_input = input(f"{hint}")
        except KeyboardInterrupt:
            print()
            continue
        except EOFError:
            user_input = "exit"

    return user_input


def chat(args, tokenizer, vl_chat_processor, vl_gpt, generation_config):
    image_token = vl_chat_processor.image_token
    help_msg = get_help_message(image_token)

    while True:
        print(help_msg)

        pil_images = []
        conv = vl_chat_processor.new_chat_template()
        roles = conv.roles

        while True:
            # get user input
            user_input = get_user_input(
                f"{roles[0]} [{image_token} indicates an image]: "
            )

            if user_input == "exit":
                print("Chat program exited.")
                sys.exit(0)

            elif user_input == "help":
                print(help_msg)

            elif user_input == "new":
                os.system("clear")
                pil_images = []
                conv = vl_chat_processor.new_chat_template()
                torch.cuda.empty_cache()
                print("New conversation started.")

            else:
                conv.append_message(conv.roles[0], user_input)
                conv.append_message(conv.roles[1], None)

                # ask for a file path for each image token in the user input
                num_images = user_input.count(image_token)
                cur_img_idx = 0

                while cur_img_idx < num_images:
                    try:
                        image_file = input(
                            f"({cur_img_idx + 1}/{num_images}) Input the image file path: "
                        )
                        # trim whitespace around the path; this enables drag-and-drop
                        # from file managers such as Dolphin
                        image_file = image_file.strip()
                    except KeyboardInterrupt:
                        print()
                        continue
                    except EOFError:
                        image_file = None

                    if image_file and os.path.exists(image_file):
                        pil_image = load_image(image_file)
                        pil_images.append(pil_image)
                        cur_img_idx += 1
                    elif image_file == "exit":
                        print("Chat program exited.")
                        sys.exit(0)
                    else:
                        print(
                            f"File error, `{image_file}` does not exist. Please input the correct file path."
                        )

                # get the answer from the model's prediction and stream it to stdout
                answer = ""
                answer_iter = response(
                    args,
                    conv,
                    pil_images,
                    tokenizer,
                    vl_chat_processor,
                    vl_gpt,
                    generation_config,
                )
                sys.stdout.write(f"{conv.roles[1]}: ")
                for char in answer_iter:
                    answer += char
                    sys.stdout.write(char)
                    sys.stdout.flush()
                sys.stdout.write("\n")
                sys.stdout.flush()
                conv.update_last_message(answer)
                # conv.messages[-1][-1] = answer


def main(args):
    # setup
    tokenizer, vl_chat_processor, vl_gpt = load_pretrained_model(args.model_path)

    generation_config = dict(
        pad_token_id=vl_chat_processor.tokenizer.eos_token_id,
        bos_token_id=vl_chat_processor.tokenizer.bos_token_id,
        eos_token_id=vl_chat_processor.tokenizer.eos_token_id,
        max_new_tokens=args.max_gen_len,
        use_cache=True,
    )
    if args.temperature > 0:
        generation_config.update(
            {
                "do_sample": True,
                "top_p": args.top_p,
                "temperature": args.temperature,
                "repetition_penalty": args.repetition_penalty,
            }
        )
    else:
        # a temperature of 0 disables sampling and falls back to greedy decoding
        generation_config.update({"do_sample": False})

    chat(args, tokenizer, vl_chat_processor, vl_gpt, generation_config)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_path",
        type=str,
        default="deepseek-ai/deepseek-vl-7b-chat",
        help="the Hugging Face model name or the local path of the downloaded Hugging Face model.",
    )
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=0.95)
    parser.add_argument("--repetition_penalty", type=float, default=1.1)
    parser.add_argument("--max_gen_len", type=int, default=512)
    args = parser.parse_args()
    main(args)
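
# Example invocations (a sketch; the flags match the argparse definitions above,
# and `--temperature 0` falls back to greedy decoding, per main()). The local
# checkpoint path is a placeholder:
#
#   python cli_chat.py --model_path deepseek-ai/deepseek-vl-7b-chat --temperature 0.2
#   python cli_chat.py --model_path /path/to/local/checkpoint --temperature 0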