DreamCraft3D/metric_utils.py

# * evaluate use laion/CLIP-ViT-H-14-laion2B-s32B-b79K
# best open source clip so far: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k
# code adapted from NeuralLift-360

import torch
import torch.nn as nn
import os
import torchvision.transforms as T
import torchvision.transforms.functional as TF
import matplotlib.pyplot as plt
# import clip
from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTokenizer, CLIPProcessor
from torchvision import transforms
import numpy as np
import torch.nn.functional as F
from tqdm import tqdm
import cv2
from PIL import Image
# import torchvision.transforms as transforms
import glob
from skimage.metrics import peak_signal_noise_ratio as compare_psnr
import lpips
from os.path import join as osp
import argparse
import pandas as pd
import contextual_loss as cl

# Module-level contextual-loss criterion (VGG features, layer relu5_4).
# CXMeter.score_gt calls it with CPU tensors.
criterion = cl.ContextualLoss(use_vgg=True, vgg_layer='relu5_4')

class CLIP(nn.Module):
    """CLIP image-to-image similarity scorer.

    Loads a Hugging Face CLIP checkpoint and scores a reference image against
    novel views using the dot product of their L2-normalised image embeddings.
    """

    def __init__(self,
                 device,
                 clip_name='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                 size=224):
        """Load the CLIP model and build the preprocessing transforms.

        Args:
            device: CUDA device index; the model is moved to ``cuda:{device}``.
            clip_name: Hugging Face model id of the CLIP checkpoint
                (e.g. 'laion/CLIP-ViT-B-32-laion2B-s34B-b79K' for a smaller one).
            size: side length images are resized to when read from disk.
        """
        super().__init__()
        self.size = size
        self.device = f"cuda:{device}"

        self.feature_extractor = CLIPFeatureExtractor.from_pretrained(
            clip_name)
        self.clip_model = CLIPModel.from_pretrained(clip_name).to(self.device)
        # NOTE(review): the tokenizer, `normalize` and `resize` are not used by
        # any method of this class; they are kept as attributes for callers.
        self.tokenizer = CLIPTokenizer.from_pretrained(
            'openai/clip-vit-base-patch32')

        self.normalize = transforms.Normalize(
            mean=self.feature_extractor.image_mean,
            std=self.feature_extractor.image_std)

        self.resize = transforms.Resize(224)
        self.to_tensor = transforms.ToTensor()

        # Preprocessing applied to every image tensor before the CLIP encoder.
        self.aug = T.Compose([
            T.Resize((224, 224)),
            T.Normalize((0.48145466, 0.4578275, 0.40821073),
                        (0.26862954, 0.26130258, 0.27577711)),
        ])

    # * recommend to use this function for evaluation
    @torch.no_grad()
    def score_gt(self, ref_img_path, novel_views):
        """Return the mean CLIP similarity between one reference and many views.

        Args:
            ref_img_path: list holding the path of the single reference image.
            novel_views: iterable of image paths to compare with the reference.

        Returns:
            Mean similarity over ``novel_views`` (``nan`` if it is empty).
        """
        # The reference embedding is loop-invariant: compute it once instead of
        # re-reading and re-encoding the reference for every novel view.
        ref_embed = self.get_img_embeds(self._load_tensor(ref_img_path))
        clip_scores = [
            self.similarity(
                ref_embed, self.get_img_embeds(self._load_tensor([novel])))
            for novel in novel_views
        ]
        return np.mean(clip_scores)

    def similarity(self, image1_features: torch.Tensor,
                   image2_features: torch.Tensor) -> float:
        """Return the dot product of the first rows of two feature matrices.

        Args:
            image1_features: tensor of shape (B1, D).
            image2_features: tensor of shape (B2, D).

        Returns:
            ``(image1_features @ image2_features.T)[0][0]`` as a Python float.
        """
        with torch.no_grad(), torch.cuda.amp.autocast():
            pairwise = torch.matmul(image1_features, image2_features.T)
            return pairwise[0][0].item()

    def get_img_embeds(self, img):
        """Encode one CHW image tensor into an L2-normalised CLIP embedding.

        Args:
            img: image tensor of shape (C, H, W); a fourth channel is dropped.

        Returns:
            Tensor of shape (1, D) with unit norm along the last dimension.
        """
        if img.shape[0] == 4:
            img = img[:3, :, :]

        img = self.aug(img).to(self.device)
        img = img.unsqueeze(0)  # b,c,h,w

        image_z = self.clip_model.get_image_features(img)
        # Unit-normalise so that a dot product is a cosine similarity.
        return image_z / image_z.norm(dim=-1, keepdim=True)

    def score_from_feature(self, img1, img2):
        """Return the CLIP similarity of two CHW image tensors."""
        img1_feature = self.get_img_embeds(img1)
        img2_feature = self.get_img_embeds(img2)
        return self.similarity(img1_feature, img2_feature)

    def read_img_list(self, img_list):
        """Read images as RGB arrays resized to ``(self.size, self.size)``.

        Fully transparent pixels of four-channel images are painted white.

        Args:
            img_list: iterable of image file paths.

        Returns:
            Array of shape (N, size, size, 3) stacking the images in order.

        Raises:
            FileNotFoundError: if OpenCV cannot read one of the files.
        """
        size = self.size
        images = []

        for img_path in img_list:
            img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
            # cv2.imread signals failure by returning None instead of raising.
            if img is None:
                raise FileNotFoundError(
                    f"image is missing or unreadable: {img_path}")

            if img.ndim == 2:  # single-channel image has no channel axis
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
            elif img.shape[2] == 4:  # BGRA
                alpha = img[:, :, 3]
                img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGB)
                img[alpha == 0] = [255, 255, 255]  # transparent -> white
            else:  # plain BGR
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            img = cv2.resize(img, (size, size), interpolation=cv2.INTER_AREA)
            images.append(img)

        return np.stack(images, axis=0)

    def _load_tensor(self, img_paths):
        """Read the single image listed in ``img_paths`` as a CHW tensor."""
        image = np.squeeze(self.read_img_list(img_paths))
        return self.to_tensor(image)

    def score_from_path(self, img1_path, img2_path):
        """Return the CLIP similarity of two images given as one-element path lists."""
        img1 = self._load_tensor(img1_path)
        img2 = self._load_tensor(img2_path)
        return self.score_from_feature(img1, img2)


def numpy_to_torch(images, device=None):
    """Convert an NHWC image batch in [0, 1] to an NCHW float tensor in [-1, 1].

    Args:
        images: numpy array of shape (N, H, W, C) with values in [0, 1].
        device: target device; defaults to CUDA when available, otherwise CPU,
            so the helper no longer crashes on CPU-only machines.

    Returns:
        ``torch.float32`` tensor of shape (N, C, H, W) on ``device``.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    rescaled = images * 2.0 - 1.0
    tensor = torch.from_numpy(rescaled.transpose((0, 3, 1, 2))).float()
    return tensor.to(device)


class LPIPSMeter:
    """LPIPS perceptual-distance meter over (reference, novel view) pairs."""

    def __init__(self,
                 net='alex',
                 device=None,
                 size=224):
        """Build the LPIPS network.

        Args:
            net: LPIPS backbone name passed to ``lpips.LPIPS`` ('alex', 'vgg', ...).
            device: device for the network; defaults to CUDA when available,
                otherwise CPU.
            size: side length images are resized to when read from disk.
        """
        self.size = size
        self.net = net
        self.results = []
        self.device = device if device is not None else torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.fn = lpips.LPIPS(net=net).eval().to(self.device)

    def measure(self):
        """Return the mean of the distances collected by the last ``score_gt``."""
        return np.mean(self.results)

    def report(self):
        """Return a one-line human-readable summary of ``measure()``."""
        return f'LPIPS ({self.net}) = {self.measure():.6f}'

    def read_img_list(self, img_list):
        """Read images as float32 RGB arrays in [0, 1], resized to ``self.size``.

        Fully transparent pixels of four-channel images are painted white.

        Args:
            img_list: iterable of image file paths.

        Returns:
            float32 array of shape (N, size, size, 3).

        Raises:
            FileNotFoundError: if OpenCV cannot read one of the files.
        """
        size = self.size
        images = []

        for img_path in img_list:
            img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
            # cv2.imread signals failure by returning None instead of raising.
            if img is None:
                raise FileNotFoundError(
                    f"image is missing or unreadable: {img_path}")

            if img.ndim == 2:  # single-channel image has no channel axis
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
            elif img.shape[2] == 4:  # BGRA
                alpha = img[:, :, 3]
                img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img[alpha == 0] = [255, 255, 255]  # transparent -> white
            else:  # plain BGR
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            img = cv2.resize(img, (size, size), interpolation=cv2.INTER_AREA)
            images.append(img)

        return np.stack(images, axis=0).astype(np.float32) / 255.0

    # * recommend to use this function for evaluation
    @torch.no_grad()
    def score_gt(self, ref_paths, novel_paths):
        """Return the mean LPIPS distance over paired reference/novel images.

        Paths are paired positionally with ``zip``; extra entries in the longer
        list are ignored.

        Args:
            ref_paths: list of reference image paths.
            novel_paths: list of predicted image paths.

        Returns:
            Mean LPIPS distance (``nan`` when there are no pairs).
        """
        self.results = []
        for path0, path1 in zip(ref_paths, novel_paths):
            # read_img_list already resizes to (size, size), so no further
            # interpolation is needed. Inputs must live on the same device as
            # the LPIPS network, which is not necessarily the default CUDA one.
            img0 = numpy_to_torch(self.read_img_list([path0])).to(self.device)
            img1 = numpy_to_torch(self.read_img_list([path1])).to(self.device)
            self.results.append(self.fn(img0, img1).cpu().numpy())

        return self.measure()

class CXMeter:
    """Contextual-loss (CX) meter between one reference image and many views."""

    def __init__(self,
                 net='vgg',
                 device=None,
                 size=512):
        """Store the configuration.

        Args:
            net: backbone label used in ``report()`` and for the LPIPS network.
            device: device for ``self.fn``; defaults to CUDA when available,
                otherwise CPU.
            size: side length images are resized to when read from disk.
        """
        self.size = size
        self.net = net
        self.results = []
        self.device = device if device is not None else torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        # NOTE(review): this LPIPS network is never used by this class (scoring
        # goes through the module-level `criterion`); it is kept only so the
        # `fn` attribute stays available. Consider dropping it.
        self.fn = lpips.LPIPS(net=net).eval().to(self.device)

    def measure(self):
        """Return the mean of the losses collected by the last ``score_gt``."""
        return np.mean(self.results)

    def report(self):
        """Return a one-line human-readable summary of ``measure()``."""
        return f'CX ({self.net}) = {self.measure():.6f}'

    def read_img_list(self, img_list):
        """Read images as float32 RGB arrays in [0, 1], resized to ``self.size``.

        Fully transparent pixels of four-channel images are painted white.

        Args:
            img_list: iterable of image file paths.

        Returns:
            float32 array of shape (N, size, size, 3).

        Raises:
            FileNotFoundError: if OpenCV cannot read one of the files.
        """
        size = self.size
        images = []

        for img_path in img_list:
            img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
            # cv2.imread signals failure by returning None instead of raising.
            if img is None:
                raise FileNotFoundError(
                    f"image is missing or unreadable: {img_path}")

            if img.ndim == 2:  # single-channel image has no channel axis
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
            elif img.shape[2] == 4:  # BGRA
                alpha = img[:, :, 3]
                img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img[alpha == 0] = [255, 255, 255]  # transparent -> white
            else:  # plain BGR
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            img = cv2.resize(img, (size, size), interpolation=cv2.INTER_AREA)
            images.append(img)

        return np.stack(images, axis=0).astype(np.float32) / 255.0

    def _load_cpu_tensor(self, path):
        """Read one image as a contiguous (1, 3, size, size) CPU tensor in [0, 1]."""
        batch = self.read_img_list([path])
        return torch.from_numpy(batch.transpose((0, 3, 1, 2))).float().contiguous()

    # * recommend to use this function for evaluation
    @torch.no_grad()
    def score_gt(self, ref_paths, novel_paths):
        """Return the mean contextual loss of every novel view vs. the first reference.

        Args:
            ref_paths: list of reference image paths; only the first is used.
            novel_paths: iterable of predicted image paths.

        Returns:
            Mean contextual loss (``nan`` when ``novel_paths`` is empty).
        """
        self.results = []
        path0 = ref_paths[0]
        print('calculating CX loss')
        # The criterion runs on CPU with inputs in [0, 1]; build the tensors
        # there directly (no CUDA round trip, no [-1, 1] rescale and back) and
        # load the loop-invariant reference only once.
        img0 = self._load_cpu_tensor(path0)
        for path1 in tqdm(novel_paths):
            img1 = self._load_cpu_tensor(path1)
            loss = criterion(img0, img1)
            self.results.append(loss.cpu().numpy())

        return self.measure()

class PSNRMeter:
    """PSNR meter over paired reference / predicted images."""

    def __init__(self, size=800):
        """
        Args:
            size: side length images are resized to when read from disk.
        """
        self.results = []
        self.size = size

    def read_img_list(self, img_list):
        """Read images as float32 RGB arrays in [0, 1], resized to ``self.size``.

        Fully transparent pixels of four-channel images are painted white.

        Args:
            img_list: iterable of image file paths.

        Returns:
            float32 array of shape (N, size, size, 3).

        Raises:
            FileNotFoundError: if OpenCV cannot read one of the files.
        """
        size = self.size
        images = []
        for img_path in img_list:
            img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
            # cv2.imread signals failure by returning None instead of raising.
            if img is None:
                raise FileNotFoundError(
                    f"image is missing or unreadable: {img_path}")

            if img.ndim == 2:  # single-channel image has no channel axis
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
            elif img.shape[2] == 4:  # BGRA
                alpha = img[:, :, 3]
                img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img[alpha == 0] = [255, 255, 255]  # transparent -> white
            else:  # plain BGR
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            img = cv2.resize(img, (size, size), interpolation=cv2.INTER_AREA)
            images.append(img)

        return np.stack(images, axis=0).astype(np.float32) / 255.0

    def update(self, preds, truths):
        """Replace ``self.results`` with the PSNR of each (pred, truth) pair.

        Args:
            preds: batch of images with values in [0, 1].
            truths: batch of images with values in [0, 1], paired positionally.
        """
        self.results = [
            compare_psnr(pred, truth, data_range=1.0)  # images are in [0, 1]
            for pred, truth in zip(preds, truths)
        ]

    def measure(self):
        """Return the mean of the PSNR values stored by the last ``update``."""
        return np.mean(self.results)

    def report(self):
        """Return a one-line human-readable summary of ``measure()``."""
        return f'PSNR = {self.measure():.6f}'

    # * recommend to use this function for evaluation
    def score_gt(self, ref_paths, novel_paths):
        """Return the mean PSNR between paired reference and novel images.

        Args:
            ref_paths: list of reference image paths.
            novel_paths: list of predicted image paths, paired positionally.

        Returns:
            Mean PSNR over the pairs.
        """
        self.results = []
        # [B, H, W, 3], range [0, 1]
        preds = self.read_img_list(ref_paths)
        print('novel_paths', novel_paths)
        truths = self.read_img_list(novel_paths)
        self.update(preds, truths)
        return self.measure()

# all_inputs = 'data'
# nerf_dataset = os.listdir(osp(all_inputs, 'nerf4'))
# realfusion_dataset = os.listdir(osp(all_inputs, 'realfusion15'))
# meta_examples = {
#    'nerf4': nerf_dataset,
#    'realfusion15': realfusion_dataset,
# }
# all_datasets = meta_examples.keys()

# organization 1
def deprecated_score_from_method_for_dataset(my_scorer,
                                  method,
                                  dataset,
                                  input,
                                  output,
                                  score_type='clip',
                                  ):  # psnr, lpips
    """Deprecated: score every example of ``dataset`` with ``my_scorer``.

    NOTE(review): the body reads ``meta_examples`` and ``pred_path``, neither
    of which is defined in the visible part of this file (the only
    ``meta_examples`` definition is commented out), so calling this function
    presumably raises ``NameError`` -- confirm before reviving it.

    Args:
        my_scorer: object exposing ``score_gt(ref_paths, novel_paths)``.
        method: method name; only compared against ``'3d_fuse'`` here.
        dataset: key looked up in ``meta_examples``.
        input: not used by this function.
        output: not used by this function.
        score_type: ``'clip'`` passes the example's ``colors`` folder path;
            any other value passes the first ``step_0000*`` file in it.

    Returns:
        dict mapping each example name to the value returned by
        ``my_scorer.score_gt``.
    """
    # print("\n\n\n")
    # print(f"______{method}___{dataset}___{score_type}_________")
    scores = {}
    final_res = 0  # running sum; only used by the commented-out print below
    examples = meta_examples[dataset]
    for i in range(len(examples)):

        # compare entire folder for clip
        if score_type == 'clip':
            novel_view = osp(pred_path, examples[i], 'colors')
        # compare first image for other metrics
        else:
            if method == '3d_fuse': method = '3d_fuse_0'
            novel_view = list(
                glob.glob(
                    osp(pred_path, examples[i], 'colors',
                        'step_0000*')))[0]

        # NOTE(review): the reference list passed here is empty.
        score_i = my_scorer.score_gt(
            [], [novel_view])
        scores[examples[i]] = score_i
        final_res += score_i
    # print(scores, " Avg : ", final_res / len(examples))
    # print("``````````````````````")
    return scores

# results organization 2
def score_from_method_for_dataset(my_scorer,
                                  input_path,
                                  pred_path,
                                  score_type='clip',
                                  rgb_name='lambertian',
                                  result_folder='results/images',
                                  first_str='*0000*'
                                  ):  # psnr, lpips
    """Score every example under ``input_path`` against its predictions.

    For each entry ``ex`` of ``input_path`` the reference is
    ``input_path/ex/rgba.png`` and predictions are globbed from
    ``pred_path/*ex*/result_folder``.

    Args:
        my_scorer: object exposing ``score_gt(ref_paths, novel_paths)``.
        input_path: directory whose entries are the example names.
        pred_path: root directory of the prediction runs.
        score_type: ``'clip'`` compares against every ``*{rgb_name}*`` file;
            any other value only against files matching
            ``{first_str}{rgb_name}*`` (the first view).
        rgb_name: substring identifying the RGB renderings.
        result_folder: sub-folder of each run that holds the images.
        first_str: glob prefix identifying the first view.

    Returns:
        dict mapping each example name to its score, plus an ``'average'``
        entry (``nan`` when ``input_path`` has no entries).
    """
    scores = {}
    total = 0
    # Sorted so that results do not depend on the file-system listing order.
    examples = sorted(os.listdir(input_path))
    for example in examples:
        ref_path = osp(input_path, example, 'rgba.png')
        run_images = osp(pred_path, f'*{example}*', result_folder)
        if score_type == 'clip':
            # compare entire folder for clip
            novel_view = sorted(glob.glob(osp(run_images, f'*{rgb_name}*')))
            print(f'[INFO] {score_type} loss for example {example} between 1 GT and {len(novel_view)} predictions')
        else:
            # compare first image for other metrics
            novel_view = sorted(
                glob.glob(osp(run_images, f'{first_str}{rgb_name}*')))
            print(f'[INFO] {score_type} loss for example {example} between {ref_path} and {novel_view}')
        score_i = my_scorer.score_gt([ref_path], novel_view)
        scores[example] = score_i
        total += score_i
    # Guard the division so an empty dataset does not raise ZeroDivisionError.
    scores['average'] = total / len(examples) if examples else float('nan')
    return scores


# results organization 2
def score_from_my_method_for_dataset(my_scorer,
                                  input_path, dataset,
                                  score_type='clip',
                                  num_views=120
                                  ):  # psnr, lpips
    """Score the renders stored in ``input_path/dataset`` with ``my_scorer``.

    The folder is expected to hold the reference ``*_rgba.png`` and the renders
    ``0.png`` ... ``{num_views - 1}.png``. The left square of each render
    (width equal to its height) is cropped and cached as ``{i}_color.png``;
    existing crops are reused.

    Args:
        my_scorer: object exposing ``score_gt(ref_paths, novel_paths)``.
        input_path: root directory containing one folder per dataset.
        dataset: name of the folder to evaluate.
        score_type: ``'clip'`` and ``'cx'`` use all views; any other value
            uses only view 0.
        num_views: number of rendered views in the folder.

    Returns:
        dict with the single key ``'{dataset}_average'``.

    Raises:
        FileNotFoundError: if no reference image is found or a render that
            still needs cropping cannot be read.
    """
    scene_dir = osp(input_path, dataset)
    # Sorted so the reference choice is deterministic if several files match.
    ref_path = sorted(glob.glob(osp(scene_dir, "*_rgba.png")))
    if not ref_path:
        raise FileNotFoundError(
            f"no reference '*_rgba.png' image found in {scene_dir}")

    for i in tqdm(range(num_views)):
        color_path = osp(scene_dir, '%d_color.png' % i)
        if os.path.exists(color_path):
            continue
        render_path = osp(scene_dir, '%d.png' % i)
        img = cv2.imread(render_path)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise FileNotFoundError(
                f"render is missing or unreadable: {render_path}")
        height = img.shape[0]
        cv2.imwrite(color_path, img[:, :height])

    # CLIP and CX average over every view; the other metrics use view 0 only.
    used_views = num_views if score_type in ('clip', 'cx') else 1
    novel_view = [osp(scene_dir, '%d_color.png' % i) for i in range(used_views)]
    print(novel_view)

    return {'%s_average' % dataset: my_scorer.score_gt(ref_path, novel_view)}

def main():
    """Evaluate every scene folder under ``--input_path`` and save a CSV.

    Each sub-directory of ``--input_path`` is scored with CLIP, PSNR, LPIPS and
    contextual loss; the table (one row per scene, one column per metric) is
    printed and written to ``{save_dir}/result.csv``.
    """
    parser = argparse.ArgumentParser(
        description="Compute CLIP / PSNR / LPIPS / CX metrics for every "
                    "scene folder under --input_path")
    # Required: without it the script previously failed later with a TypeError.
    parser.add_argument("--input_path",
                        required=True,
                        help="Specify the input path")
    # NOTE: the next five options are accepted for compatibility but are not
    # read by the evaluation loop below.
    parser.add_argument("--pred_pattern",
                        default="out/magic123*",
                        help="Specify the pattern of prediction paths")
    parser.add_argument("--results_folder",
                        default="results/images",
                        help="where are the results under each pred_path")
    parser.add_argument("--rgb_name",
                        default="lambertian",
                        help="the postfix of the image")
    parser.add_argument("--first_str",
                        default="*0000*",
                        help="the str to indicate the first view")
    parser.add_argument("--datasets",
                        default=None,
                        nargs='*',
                        help="Specify the dataset names")
    parser.add_argument("--device",
                        type=int,
                        default=0,
                        help="Specify the GPU device to be used")
    parser.add_argument("--save_dir", type=str, default='all_metrics/results')
    args = parser.parse_args()

    # (result column, scorer, score_type passed to the scoring function)
    scorers = [
        ('clip', CLIP(args.device), 'clip'),
        ('psnr', PSNRMeter(), 'psnr'),
        ('lpips', LPIPSMeter(), 'lpips'),
        ('CX', CXMeter(), 'cx'),
    ]

    os.makedirs(args.save_dir, exist_ok=True)

    # Accumulate across datasets; rebuilding the dict inside the loop kept only
    # the last dataset's scores in the saved table.
    results_dict = {column: {} for column, _, _ in scorers}
    for dataset in sorted(os.listdir(args.input_path)):
        if not os.path.isdir(osp(args.input_path, dataset)):
            continue  # stray files are not scene folders
        print(dataset)
        for column, scorer, score_type in scorers:
            results_dict[column].update(
                score_from_my_method_for_dataset(
                    scorer, args.input_path, dataset, score_type))

    df = pd.DataFrame(results_dict)
    print(df)
    df.to_csv(f"{args.save_dir}/result.csv")


if __name__ == "__main__":
    main()


    # for dataset in args.datasets:
    #     input_path = osp(args.input_path, dataset)

    #     # assume the pred_path is organized as: pred_path/methods/dataset
    #     pred_pattern = osp(args.pred_pattern, dataset)
    #     pred_paths = glob.glob(pred_pattern)
    #     print(f"[INFO] Following the pattern {pred_pattern}, find {len(pred_paths)} pred_paths: \n", pred_paths)
    #     if len(pred_paths) == 0:
    #         raise IOError
    #     for pred_path in pred_paths:
    #         if not os.path.exists(pred_path):
    #             print(f'[WARN] prediction does not exit for {pred_path}')
    #         else:
    #             print(f'[INFO] evaluate {pred_path}')