From 11f21fcfb60aeb4dc677aca1d760665844989615 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Mon, 25 Oct 2021 13:15:05 +0300
Subject: [PATCH] Fix PyTorch distributed TimeoutSocket issue in Windows

---
 .../pytorch/pytorch_distributed_example.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/examples/frameworks/pytorch/pytorch_distributed_example.py b/examples/frameworks/pytorch/pytorch_distributed_example.py
index 9130f3c4..93e82bdc 100644
--- a/examples/frameworks/pytorch/pytorch_distributed_example.py
+++ b/examples/frameworks/pytorch/pytorch_distributed_example.py
@@ -1,23 +1,22 @@
 # ClearML - example of ClearML torch distributed support
 # notice all nodes will be reporting to the master Task (experiment)
-
 import os
 import subprocess
 import sys
 from argparse import ArgumentParser
+from datetime import timedelta
 from math import ceil
 from random import Random
 
 import torch as th
-import torch.nn as nn
 import torch.distributed as dist
+import torch.nn as nn
 import torch.nn.functional as F
 from torch import optim
 from torchvision import datasets, transforms
 
 from clearml import Task
 
-
 local_dataset_path = './MNIST_data'
 
 
@@ -159,7 +158,12 @@ if __name__ == "__main__":
         exit(0)
 
     if os.environ.get('MASTER_ADDR'):
-        dist.init_process_group(backend='gloo', rank=args.rank, world_size=args.nodes)
+        # Use a large timeout value since in Windows timeout issues may cause
+        # pg = ProcessGroupGloo(prefix_store, rank, world_size, timeout=timeout)
+        # RuntimeError: Socket TimeoutSocket
+        dist.init_process_group(
+            backend='gloo', rank=args.rank, world_size=args.nodes, timeout=timedelta(days=1)
+        )
         run(args.workers_in_node)
     else:
         # first let's download the dataset, if we have multiple machines,
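
For reference, below is a minimal standalone sketch (not part of the patch) of the workaround it applies: passing an explicit, very large timeout to init_process_group so the gloo backend on Windows does not hit its default socket timeout. The MASTER_ADDR/MASTER_PORT values, rank, and world size are illustrative assumptions for a single-process group, not values taken from the example script.

    # Minimal sketch of the timeout workaround, assuming a single-process gloo group.
    import os
    from datetime import timedelta

    import torch.distributed as dist

    # Illustrative rendezvous settings; a real multi-node run would set these
    # to the master node's address and a reachable port.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')

    dist.init_process_group(
        backend='gloo',              # the backend affected by the Windows socket timeout
        rank=0,                      # single-process example
        world_size=1,
        timeout=timedelta(days=1),   # effectively disables the default timeout
    )

    # ... distributed work would go here ...

    dist.destroy_process_group()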