From 7857f24260c60faa36c666bb90a0d306cb8d07f5 Mon Sep 17 00:00:00 2001 From: xvdp Date: Thu, 26 Oct 2023 04:25:56 -0700 Subject: [PATCH 1/7] moved magick to PIL as even single-threaded PIL is 4x faster at resizing than magick --- convert.py | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/convert.py b/convert.py index 7894884..6974c63 100644 --- a/convert.py +++ b/convert.py @@ -8,12 +8,16 @@ # # For inquiries contact george.drettakis@inria.fr # +# xvdp removed magick, it is 3x slower than single threaded PIL for resizing + import os import logging from argparse import ArgumentParser import shutil +from PIL import Image + # This Python script is based on the shell converter script provided in the MipNerF 360 repository. parser = ArgumentParser("Colmap converter") parser.add_argument("--no_gpu", action='store_true') @@ -25,7 +29,7 @@ parser.add_argument("--resize", action="store_true") parser.add_argument("--magick_executable", default="", type=str) args = parser.parse_args() colmap_command = '"{}"'.format(args.colmap_executable) if len(args.colmap_executable) > 0 else "colmap" -magick_command = '"{}"'.format(args.magick_executable) if len(args.magick_executable) > 0 else "magick" + use_gpu = 1 if not args.no_gpu else 0 if not args.skip_matching: @@ -87,38 +91,21 @@ for file in files: destination_file = os.path.join(args.source_path, "sparse", "0", file) shutil.move(source_file, destination_file) -if(args.resize): +if args.resize: print("Copying and resizing...") # Resize images. 
- os.makedirs(args.source_path + "/images_2", exist_ok=True) - os.makedirs(args.source_path + "/images_4", exist_ok=True) - os.makedirs(args.source_path + "/images_8", exist_ok=True) + for div in [2,4,8]: + os.makedirs(args.source_path + f"/images_{div}", exist_ok=True) # Get the list of files in the source directory files = os.listdir(args.source_path + "/images") # Copy each file from the source directory to the destination directory - for file in files: + for j, file in enumerate(files): source_file = os.path.join(args.source_path, "images", file) - - destination_file = os.path.join(args.source_path, "images_2", file) - shutil.copy2(source_file, destination_file) - exit_code = os.system(magick_command + " mogrify -resize 50% " + destination_file) - if exit_code != 0: - logging.error(f"50% resize failed with code {exit_code}. Exiting.") - exit(exit_code) - - destination_file = os.path.join(args.source_path, "images_4", file) - shutil.copy2(source_file, destination_file) - exit_code = os.system(magick_command + " mogrify -resize 25% " + destination_file) - if exit_code != 0: - logging.error(f"25% resize failed with code {exit_code}. Exiting.") - exit(exit_code) - - destination_file = os.path.join(args.source_path, "images_8", file) - shutil.copy2(source_file, destination_file) - exit_code = os.system(magick_command + " mogrify -resize 12.5% " + destination_file) - if exit_code != 0: - logging.error(f"12.5% resize failed with code {exit_code}. 
Exiting.") - exit(exit_code) + im = Image.open(source_file) + logging.info(f"processing image [{j}/{len(files)}] {source_file}") + for div in [2,4,8]: + destination_file = os.path.join(args.source_path, f"images_{div}", file) + im.resize([round(i/div) for i in im.size], Image.BICUBIC).save(destination_file) print("Done.") From a0dc5af86fc3f244161b20363a330ba3c252c1c2 Mon Sep 17 00:00:00 2001 From: xvdp Date: Thu, 26 Oct 2023 05:36:14 -0700 Subject: [PATCH 2/7] moved magick to PIL, fixed quality setting to 100 to match magick --- convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index 6974c63..9351c64 100644 --- a/convert.py +++ b/convert.py @@ -8,7 +8,7 @@ # # For inquiries contact george.drettakis@inria.fr # -# xvdp removed magick, it is 3x slower than single threaded PIL for resizing +# xvdp removed magick, even single threaded PIL resizes 4X faster import os @@ -106,6 +106,6 @@ if args.resize: logging.info(f"processing image [{j}/{len(files)}] {source_file}") for div in [2,4,8]: destination_file = os.path.join(args.source_path, f"images_{div}", file) - im.resize([round(i/div) for i in im.size], Image.BICUBIC).save(destination_file) + im.resize([round(i/div) for i in im.size], Image.BICUBIC).save(destination_file, quality=100) print("Done.") From 2311f4e764f8f49f1f2213c604ed20a934002d42 Mon Sep 17 00:00:00 2001 From: xvdp Date: Tue, 31 Oct 2023 04:48:52 -0700 Subject: [PATCH 3/7] Added option to disable asynchronous operations which can cause cuda to fail, network_gui.listener can block resources, clean up on break. 
--- train.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/train.py b/train.py index 36faf0d..cfaf369 100644 --- a/train.py +++ b/train.py @@ -48,7 +48,7 @@ def training(dataset, opt, pipe, testing_iterations, saving_iterations, checkpoi ema_loss_for_log = 0.0 progress_bar = tqdm(range(first_iter, opt.iterations), desc="Training progress") first_iter += 1 - for iteration in range(first_iter, opt.iterations + 1): + for iteration in range(first_iter, opt.iterations + 1): if network_gui.conn == None: network_gui.try_connect() while network_gui.conn != None: @@ -62,7 +62,10 @@ def training(dataset, opt, pipe, testing_iterations, saving_iterations, checkpoi if do_training and ((iteration < int(opt.iterations)) or not keep_alive): break except Exception as e: + network_gui.conn.close() network_gui.conn = None + network_gui.listener.close() + network_gui.listener = None iter_start.record() @@ -159,7 +162,7 @@ def training_report(tb_writer, iteration, Ll1, loss, l1_loss, elapsed, testing_i # Report test and samples of training set if iteration in testing_iterations: torch.cuda.empty_cache() - validation_configs = ({'name': 'test', 'cameras' : scene.getTestCameras()}, + validation_configs = ({'name': 'test', 'cameras' : scene.getTestCameras()}, {'name': 'train', 'cameras' : [scene.getTrainCameras()[idx % len(scene.getTrainCameras())] for idx in range(5, 30, 5)]}) for config in validation_configs: @@ -202,18 +205,23 @@ if __name__ == "__main__": parser.add_argument("--quiet", action="store_true") parser.add_argument("--checkpoint_iterations", nargs="+", type=int, default=[]) parser.add_argument("--start_checkpoint", type=str, default = None) + parser.add_argument('--cuda_blocking', action='store_true', default=True) args = parser.parse_args(sys.argv[1:]) args.save_iterations.append(args.iterations) - + print("Optimizing " + args.model_path) # Initialize system state (RNG) safe_state(args.quiet) + # CUDA sometimes fails - option to 
disable asynchronous operations + if args.cuda_blocking: + os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # Start GUI server, configure and run training network_gui.init(args.ip, args.port) torch.autograd.set_detect_anomaly(args.detect_anomaly) - training(lp.extract(args), op.extract(args), pp.extract(args), args.test_iterations, args.save_iterations, args.checkpoint_iterations, args.start_checkpoint, args.debug_from) + training(lp.extract(args), op.extract(args), pp.extract(args), args.test_iterations, args.save_iterations, + args.checkpoint_iterations, args.start_checkpoint, args.debug_from) # All done print("\nTraining complete.") From 25b3cb8cc95fca3b194928798c4885f664b23253 Mon Sep 17 00:00:00 2001 From: xvdp Date: Tue, 31 Oct 2023 05:11:09 -0700 Subject: [PATCH 4/7] save checkpoints by default --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 223b2d9..7b73515 100644 --- a/train.py +++ b/train.py @@ -206,7 +206,7 @@ if __name__ == "__main__": parser.add_argument("--test_iterations", nargs="+", type=int, default=[7_000, 30_000]) parser.add_argument("--save_iterations", nargs="+", type=int, default=[7_000, 30_000]) parser.add_argument("--quiet", action="store_true") - parser.add_argument("--checkpoint_iterations", nargs="+", type=int, default=[]) + parser.add_argument("--checkpoint_iterations", nargs="+", type=int, default=[7_000, 30_000]) parser.add_argument("--start_checkpoint", type=str, default = None) parser.add_argument('--cuda_blocking', action='store_true', default=True) args = parser.parse_args(sys.argv[1:]) From 27ef163bdfbd8e0ca750c73cf9faa57729f0c188 Mon Sep 17 00:00:00 2001 From: xvdp Date: Sat, 4 Nov 2023 03:52:01 -0700 Subject: [PATCH 5/7] added __repr__ to GaussianModel --- scene/gaussian_model.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scene/gaussian_model.py b/scene/gaussian_model.py index 632a1e8..23bee42 100644 --- a/scene/gaussian_model.py +++ b/scene/gaussian_model.py 
@@ -58,6 +58,17 @@ class GaussianModel: self.spatial_lr_scale = 0 self.setup_functions() + + def __repr__(self): + format_string = self.__class__.__name__ + '()' + for k, v in self.__dict__.items(): + if torch.is_tensor(v): + format_string +=f" {k}:\t{tuple(v.shape)}\n" + else: + format_string += f"{k}:\t{v}\n" + return format_string + + def capture(self): return ( self.active_sh_degree, From f934e701b25f31cda4b341d832aede40effd5055 Mon Sep 17 00:00:00 2001 From: xvdp Date: Sat, 4 Nov 2023 08:29:03 -0700 Subject: [PATCH 6/7] added __repr__ to Camera() --- scene/cameras.py | 10 ++++++++++ scene/gaussian_model.py | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/scene/cameras.py b/scene/cameras.py index abf6e52..bc59fcc 100644 --- a/scene/cameras.py +++ b/scene/cameras.py @@ -56,6 +56,16 @@ class Camera(nn.Module): self.full_proj_transform = (self.world_view_transform.unsqueeze(0).bmm(self.projection_matrix.unsqueeze(0))).squeeze(0) self.camera_center = self.world_view_transform.inverse()[3, :3] + def __repr__(self): + format_string = self.__class__.__name__ + '()\n' + for k, v in self.__dict__.items(): + if torch.is_tensor(v) and v.numel() > 16: + format_string +=f" {k}:\t{tuple(v.shape)}\n" + else: + format_string += f"{k}:\t{v}\n" + return format_string + + class MiniCam: def __init__(self, width, height, fovy, fovx, znear, zfar, world_view_transform, full_proj_transform): self.image_width = width diff --git a/scene/gaussian_model.py b/scene/gaussian_model.py index 23bee42..07a30d0 100644 --- a/scene/gaussian_model.py +++ b/scene/gaussian_model.py @@ -60,7 +60,7 @@ class GaussianModel: def __repr__(self): - format_string = self.__class__.__name__ + '()' + format_string = self.__class__.__name__ + '()\n' for k, v in self.__dict__.items(): if torch.is_tensor(v): format_string +=f" {k}:\t{tuple(v.shape)}\n" @@ -84,7 +84,7 @@ class GaussianModel: self.optimizer.state_dict(), self.spatial_lr_scale, ) - + def restore(self, model_args, 
training_args): (self.active_sh_degree, self._xyz, From 4ea5609081b9c2df83f7774c615972b4bf21a5d1 Mon Sep 17 00:00:00 2001 From: xvdp Date: Sun, 5 Nov 2023 04:15:39 -0800 Subject: [PATCH 7/7] fix Conversion pil to torch introduces potential cuda error #430 --- train.py | 5 +---- utils/general_utils.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/train.py b/train.py index 7b73515..d57acae 100644 --- a/train.py +++ b/train.py @@ -208,7 +208,7 @@ if __name__ == "__main__": parser.add_argument("--quiet", action="store_true") parser.add_argument("--checkpoint_iterations", nargs="+", type=int, default=[7_000, 30_000]) parser.add_argument("--start_checkpoint", type=str, default = None) - parser.add_argument('--cuda_blocking', action='store_true', default=True) + args = parser.parse_args(sys.argv[1:]) args.save_iterations.append(args.iterations) @@ -217,9 +217,6 @@ if __name__ == "__main__": # Initialize system state (RNG) safe_state(args.quiet) - # CUDA sometimes fails - option to disable asynchronous operations - if args.cuda_blocking: - os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # Start GUI server, configure and run training network_gui.init(args.ip, args.port) torch.autograd.set_detect_anomaly(args.detect_anomaly) diff --git a/utils/general_utils.py b/utils/general_utils.py index 541c082..43f56ee 100644 --- a/utils/general_utils.py +++ b/utils/general_utils.py @@ -18,13 +18,15 @@ import random def inverse_sigmoid(x): return torch.log(x/(1-x)) -def PILtoTorch(pil_image, resolution): +def PILtoTorch(pil_image, resolution, pin_memory=True): resized_image_PIL = pil_image.resize(resolution) - resized_image = torch.from_numpy(np.array(resized_image_PIL)) / 255.0 - if len(resized_image.shape) == 3: - return resized_image.permute(2, 0, 1) - else: - return resized_image.unsqueeze(dim=-1).permute(2, 0, 1) + resized_image = torch.from_numpy(np.array(resized_image_PIL, dtype=np.float32)) / 255.0 + if resized_image.ndim == 2: + resized_image = 
resized_image[None] + resized_image = resized_image.permute(2, 0, 1).contiguous() + if pin_memory: + resized_image.pin_memory = True + return resized_image def get_expon_lr_func( lr_init, lr_final, lr_delay_steps=0, lr_delay_mult=1.0, max_steps=1000000