diff --git a/clearml/backend_interface/task/task.py b/clearml/backend_interface/task/task.py index 07a6a385..871d86c9 100644 --- a/clearml/backend_interface/task/task.py +++ b/clearml/backend_interface/task/task.py @@ -1158,24 +1158,11 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin): str_value = str(value) if isinstance(value, (tuple, list, dict)): - if 'None' in re.split(r'[ ,\[\]{}()]', str_value): - # If we have None in the string we have to use json to replace it with null, - # otherwise we end up with None as string when running remotely - try: - str_json = json.dumps(value) - # verify we actually have a null in the string, otherwise prefer the str cast - # This is because we prefer to have \' as in str and not \" used in json - if 'null' in re.split(r'[ ,\[\]{}()]', str_json): - return str_json - except TypeError: - # if we somehow failed to json serialize, revert to previous std casting - pass - elif any('\\' in str(v) for v in value): - try: - str_json = json.dumps(value) - return str_json - except TypeError: - pass + try: + str_json = json.dumps(value) + return str_json + except TypeError: + pass if isinstance(value, Enum): # remove the class name @@ -1920,10 +1907,10 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin): :return: http/s URL link. """ - return '{}/projects/{}/experiments/{}/output/log'.format( - self._get_app_server(), - self.project if self.project is not None else '*', - self.id, + return self.get_task_output_log_web_page( + task_id=self.id, + project_id=self.project, + app_server_host=self._get_app_server() ) def get_reported_scalars( @@ -2727,6 +2714,30 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin): task = get_single_result(entity='task', query=task_name, results=res.response.tasks) return cls(task_id=task.id) + @classmethod + def get_task_output_log_web_page(cls, task_id, project_id=None, app_server_host=None): + # type: (str, Optional[str], Optional[str]) -> str + """ + Return the Task results & outputs web page address. + For example: https://demoapp.demo.clear.ml/projects/216431/experiments/60763e04/output/log + + :param str task_id: Task ID. + :param str project_id: Project ID for this task. + :param str app_server_host: ClearML Application server host name. + If not provided, the current session will be used to resolve the host name. + :return: http/s URL link. + """ + if not app_server_host: + if not hasattr(cls, "__cached_app_server_host"): + cls.__cached_app_server_host = Session.get_app_server_host() + app_server_host = cls.__cached_app_server_host + + return "{}/projects/{}/experiments/{}/output/log".format( + app_server_host.rstrip("/"), + project_id if project_id is not None else '*', + task_id, + ) + @classmethod def _get_project_name(cls, project_id): res = cls._send(cls._get_default_session(), projects.GetByIdRequest(project=project_id), raise_on_errors=False) diff --git a/clearml/binding/jsonargs_bind.py b/clearml/binding/jsonargs_bind.py index eff8483b..6094de23 100644 --- a/clearml/binding/jsonargs_bind.py +++ b/clearml/binding/jsonargs_bind.py @@ -1,4 +1,5 @@ import json +import logging try: from jsonargparse import ArgumentParser @@ -102,7 +103,8 @@ class PatchJsonArgParse(object): for k, v in params.items(): params_namespace[k] = v return params_namespace - except Exception: + except Exception as e: + logging.getLogger(__file__).warning("Failed parsing jsonargparse arguments: {}".format(e)) return original_fn(obj, **kwargs) parsed_args = original_fn(obj, **kwargs) # noinspection PyBroadException @@ -114,10 +116,14 @@ class PatchJsonArgParse(object): PatchJsonArgParse._args_type[ns_name] = PatchJsonArgParse._command_type subcommand = ns_val try: - import pytorch_lightning + import lightning except ImportError: - pytorch_lightning = None - if subcommand and subcommand in PatchJsonArgParse._args and pytorch_lightning: + try: + import pytorch_lightning + lightning = pytorch_lightning + except ImportError: + lightning = None + if subcommand and subcommand in PatchJsonArgParse._args and lightning: subcommand_args = flatten_dictionary( PatchJsonArgParse._args[subcommand], prefix=subcommand + PatchJsonArgParse._commands_sep, @@ -127,8 +133,8 @@ class PatchJsonArgParse(object): PatchJsonArgParse._args.update(subcommand_args) PatchJsonArgParse._args = {k: v for k, v in PatchJsonArgParse._args.items()} PatchJsonArgParse._update_task_args() - except Exception: - pass + except Exception as e: + logging.getLogger(__file__).warning("Failed parsing jsonargparse arguments: {}".format(e)) return parsed_args @staticmethod diff --git a/clearml/config/default/sdk.conf b/clearml/config/default/sdk.conf index d711b7f5..bb92674c 100644 --- a/clearml/config/default/sdk.conf +++ b/clearml/config/default/sdk.conf @@ -205,6 +205,9 @@ # compatibility feature, report memory usage for the entire machine # default (false), report only on the running process and its sub-processes report_global_mem_used: false + + # if provided, start resource reporting after this amount of seconds + #report_start_sec: 30 } } diff --git a/clearml/datasets/dataset.py b/clearml/datasets/dataset.py index 5318bd5d..93f29e16 100644 --- a/clearml/datasets/dataset.py +++ b/clearml/datasets/dataset.py @@ -463,8 +463,10 @@ class Dataset(object): dataset_paths = itertools.repeat(dataset_path) else: if len(dataset_path) != len(source_url): - raise ValueError("dataset_path must be a string or a list of strings with the same length as source_url" - f" (received {len(dataset_path)} paths for {len(source_url)} source urls))") + raise ValueError( + f"dataset_path must be a string or a list of strings with the same length as source_url" + f" (received {len(dataset_path)} paths for {len(source_url)} source urls))" + ) dataset_paths = dataset_path with ThreadPoolExecutor(max_workers=max_workers) as tp: for source_url_, dataset_path_ in zip(source_url_list, dataset_paths): diff --git a/clearml/storage/helper.py b/clearml/storage/helper.py index 994823c1..ff785925 100644 --- a/clearml/storage/helper.py +++ b/clearml/storage/helper.py @@ -10,6 +10,7 @@ import platform import shutil import sys import threading +import uuid from abc import ABCMeta, abstractmethod from collections import namedtuple from concurrent.futures import ThreadPoolExecutor @@ -1036,19 +1037,19 @@ class StorageHelper(object): def check_write_permissions(self, dest_path=None): # create a temporary file, then delete it base_url = dest_path or self._base_url - dest_path = base_url + '/.clearml.test' + dest_path = base_url + "/.clearml.{}.test".format(str(uuid.uuid4())) # do not check http/s connection permissions - if dest_path.startswith('http'): + if dest_path.startswith("http"): return True try: - self.upload_from_stream(stream=six.BytesIO(b'clearml'), dest_path=dest_path) + self.upload_from_stream(stream=six.BytesIO(b"clearml"), dest_path=dest_path) except Exception: - raise ValueError('Insufficient permissions (write failed) for {}'.format(base_url)) + raise ValueError("Insufficient permissions (write failed) for {}".format(base_url)) try: self.delete(path=dest_path) except Exception: - raise ValueError('Insufficient permissions (delete failed) for {}'.format(base_url)) + raise ValueError("Insufficient permissions (delete failed) for {}".format(base_url)) return True @classmethod diff --git a/clearml/task.py b/clearml/task.py index aead25ef..8a356136 100644 --- a/clearml/task.py +++ b/clearml/task.py @@ -1,4 +1,5 @@ import atexit +import copy import json import os import shutil @@ -100,6 +101,7 @@ from .utilities.lowlevel.threads import get_current_thread_id from .utilities.process.mp import BackgroundMonitor, leave_process from .utilities.matching import matches_any_wildcard from .utilities.parallel import FutureTaskCaller +from .utilities.networking import get_private_ip # noinspection PyProtectedMember from .backend_interface.task.args import _Arguments @@ -174,6 +176,9 @@ class Task(_Task): __detect_repo_async = deferred_config('development.vcs_repo_detect_async', False) __default_output_uri = DEV_DEFAULT_OUTPUT_URI.get() or deferred_config('development.default_output_uri', None) + _launch_multi_node_section = "launch_multi_node" + _launch_multi_node_instance_tag = "multi_node_instance" + class _ConnectedParametersType(object): argparse = "argument_parser" dictionary = "dictionary" @@ -702,8 +707,10 @@ class Task(_Task): resource_monitor_cls = auto_resource_monitoring \ if isinstance(auto_resource_monitoring, six.class_types) else ResourceMonitor task._resource_monitor = resource_monitor_cls( - task, report_mem_used_per_process=not config.get( - 'development.worker.report_global_mem_used', False)) + task, + report_mem_used_per_process=not config.get('development.worker.report_global_mem_used', False), + first_report_sec=config.get('development.worker.report_start_sec', None), + ) task._resource_monitor.start() # make sure all random generators are initialized with new seed @@ -1025,19 +1032,18 @@ class Task(_Task): :return: The Tasks specified by the parameter combinations (see the parameters). """ + task_filter = task_filter or {} if tags: - task_filter = task_filter or {} task_filter['tags'] = (task_filter.get('tags') or []) + list(tags) return_fields = {} if additional_return_fields: - task_filter = task_filter or {} return_fields = set(list(additional_return_fields) + ['id']) task_filter['only_fields'] = (task_filter.get('only_fields') or []) + list(return_fields) if task_filter.get('type'): task_filter['type'] = [str(task_type) for task_type in task_filter['type']] - results = cls._query_tasks(project_name=project_name, task_name=task_name, **(task_filter or {})) + results = cls._query_tasks(project_name=project_name, task_name=task_name, **task_filter) return [t.id for t in results] if not additional_return_fields else \ [{k: cls._get_data_property(prop_path=k, data=r, raise_on_error=False, log_on_error=False) for k in return_fields} @@ -1637,6 +1643,144 @@ class Task(_Task): """ return self._get_logger(auto_connect_streams=self._log_to_backend) + def launch_multi_node(self, total_num_nodes, port=29500, queue=None, wait=False, addr=None): + # type: (int, Optional[int], Optional[str], bool, Optional[str]) -> dict + """ + Enqueue multiple clones of the current task to a queue, allowing the task + to be ran by multiple workers in parallel. Each task running this way is called a node. + Each node has a rank The node that initialized the execution of the other nodes + is called the `master node` and it has a rank equal to 0. + + A dictionary named `multi_node_instance` will be connected to the tasks. + One can use this dictionary to modify the behaviour of this function when running remotely. + The contents of this dictionary correspond to the parameters of this function, and they are: + - `total_num_nodes` - the total number of nodes, including the master node + - `queue` - the queue to enqueue the nodes to + + The following environment variables, will be set: + - `MASTER_ADDR` - the address of the machine that the master node is running on + - `MASTER_PORT` - the open port of the machine that the master node is running on + - `WORLD_SIZE` - the total number of nodes, including the master + - `RANK` - the rank of the current node (master has rank 0) + + One may use this function in conjuction with PyTorch's distributed communication package. + Note that `Task.launch_multi_node` should be called before `torch.distributed.init_process_group`. + For example: + + .. code-block:: py + + from clearml import Task + import torch + import torch.distributed as dist + + def run(rank, size): + print('World size is ', size) + tensor = torch.zeros(1) + if rank == 0: + for i in range(1, size): + tensor += 1 + dist.send(tensor=tensor, dst=i) + print('Sending from rank ', rank, ' to rank ', i, ' data: ', tensor[0]) + else: + dist.recv(tensor=tensor, src=0) + print('Rank ', rank, ' received data: ', tensor[0]) + + if __name__ == '__main__': + task = Task.init('some_name', 'some_name') + task.execute_remotely(queue_name='queue') + config = task.launch_multi_node(4) + dist.init_process_group('gloo') + run(config.get('node_rank'), config.get('total_num_nodes')) + + :param total_num_nodes: The total number of nodes to be enqueued, including the master node, + which should already be enqueued when running remotely + :param port: Port opened by the master node. If the environment variable `CLEARML_MULTI_NODE_MASTER_DEF_PORT` + is set, the value of this parameter will be set to the one defined in `CLEARML_MULTI_NODE_MASTER_DEF_PORT`. + If `CLEARML_MULTI_NODE_MASTER_DEF_PORT` doesn't exist, but `MASTER_PORT` does, then the value of this + parameter will be set to the one defined in `MASTER_PORT`. If neither environment variables exist, + the value passed to the parameter will be used + :param queue: The queue to enqueue the nodes to. Can be different than the queue the master + node is enqueued to. If None, the nodes will be enqueued to the same queue as the master node + :param wait: If True, the master node will wait for the other nodes to start + :param addr: The address of the master node's worker. If not set, it defaults to the private IP + of the machine the master is running on + + :return: A dictionary containing relevant information regarding the multi node run. This dictionary + has the following entries: + - `master_addr` - the address of the machine that the master node is running on + - `master_port` - the open port of the machine that the master node is running on + - `total_num_nodes` - the total number of nodes, including the master + - `queue` - the queue the nodes are enqueued to, excluding the master + - `node_rank` - the rank of the current node (master has rank 0) + - `wait` - if True, the master node will wait for the other nodes to start + """ + def set_launch_multi_node_runtime_props(task, conf): + # noinspection PyProtectedMember + task._set_runtime_properties({"{}/{}".format(self._launch_multi_node_section, k): v for k, v in conf.items()}) + + if total_num_nodes < 1: + raise UsageError("total_num_nodes needs to be at least 1") + if running_remotely() and not (self.data.execution and self.data.execution.queue) and not queue: + raise UsageError("Master task is not enqueued to any queue and the queue parameter is None") + + master_conf = { + "master_addr": get_private_ip(), + "master_port": int(os.environ.get("CLEARML_MULTI_NODE_MASTER_DEF_PORT", os.environ.get("MASTER_PORT", port))), + "node_rank": 0, + "wait": wait + } + editable_conf = {"total_num_nodes": total_num_nodes, "queue": queue} + editable_conf = self.connect(editable_conf, name=self._launch_multi_node_section) + if not running_remotely(): + return master_conf + master_conf.update(editable_conf) + runtime_properties = self._get_runtime_properties() + remote_node_rank = runtime_properties.get("{}/node_rank".format(self._launch_multi_node_section)) + + if remote_node_rank: + # self is a child node, build the conf from the runtime proprerties + current_conf = { + entry: runtime_properties.get("{}/{}".format(self._launch_multi_node_section, entry)) + for entry in master_conf.keys() + } + else: + nodes_to_wait = [] + # self is the master node, enqueue the other nodes + set_launch_multi_node_runtime_props(self, master_conf) + current_conf = master_conf + for node_rank in range(1, master_conf.get("total_num_nodes", total_num_nodes)): + node = self.clone(source_task=self) + node_conf = copy.deepcopy(master_conf) + node_conf["node_rank"] = node_rank + set_launch_multi_node_runtime_props(node, node_conf) + node.set_system_tags(node.get_system_tags() + [self._launch_multi_node_instance_tag]) + if master_conf.get("queue"): + Task.enqueue(node, queue_name=master_conf["queue"]) + else: + Task.enqueue(node, queue_id=self.data.execution.queue) + if master_conf.get("wait"): + nodes_to_wait.append(node) + for node_to_wait, rank in zip(nodes_to_wait, range(1, master_conf.get("total_num_nodes", total_num_nodes))): + self.log.info("Waiting for node with task ID {} and rank {}".format(node_to_wait.id, rank)) + node_to_wait.wait_for_status( + status=( + Task.TaskStatusEnum.completed, + Task.TaskStatusEnum.stopped, + Task.TaskStatusEnum.closed, + Task.TaskStatusEnum.failed, + Task.TaskStatusEnum.in_progress + ), + check_interval_sec=10 + ) + self.log.info("Node with task ID {} and rank {} detected".format(node_to_wait.id, rank)) + + os.environ["MASTER_ADDR"] = current_conf.get("master_addr", "") + os.environ["MASTER_PORT"] = str(current_conf.get("master_port", "")) + os.environ["WORLD_SIZE"] = str(current_conf.get("total_num_nodes", "")) + os.environ["RANK"] = str(current_conf.get("node_rank", "")) + + return current_conf + def mark_started(self, force=False): # type: (bool) -> () """ @@ -4506,3 +4650,14 @@ class Task(_Task): auto_connect_frameworks={'detect_repository': False}) \ if state['main'] else Task.get_task(task_id=state['id']) self.__dict__ = task.__dict__ + + def __getattr__(self, name): + try: + self.__getattribute__(name) + except AttributeError as e: + if self.__class__ is Task: + getLogger().warning( + "'clearml.Task' object has no attribute '{}'. Did you mean to import 'Task' from 'allegroai'?".format(name) + ) + raise e + diff --git a/clearml/version.py b/clearml/version.py index 78a6e51e..b436016b 100644 --- a/clearml/version.py +++ b/clearml/version.py @@ -1 +1 @@ -__version__ = '1.10.3' +__version__ = '1.10.4' diff --git a/examples/datasets/data_ingestion.py b/examples/datasets/data_ingestion.py index 8aa38694..638d01b2 100644 --- a/examples/datasets/data_ingestion.py +++ b/examples/datasets/data_ingestion.py @@ -32,11 +32,11 @@ params = { params = task.connect(params) # enabling configuration override by clearml/ print(params) # printing actual configuration (after override in remote mode) - # The below gets the dataset and stores in the cache. If you want to download the dataset regardless if it's in the # cache, use the Dataset.get(dataset_name, dataset_project).get_mutable_local_copy(path to download) +# Dataset need to have finalized or closed state to get the local copy of it dataset_path = Dataset.get( - dataset_name=dataset_name, dataset_project=dataset_project + dataset_name=dataset_name, dataset_project=dataset_project, only_completed=False ).get_local_copy() # Dataset and Dataloader initializations diff --git a/examples/frameworks/jsonargparse/pytorch_lightning_cli.py b/examples/frameworks/jsonargparse/pytorch_lightning_cli.py index d73428e1..751a37a5 100644 --- a/examples/frameworks/jsonargparse/pytorch_lightning_cli.py +++ b/examples/frameworks/jsonargparse/pytorch_lightning_cli.py @@ -1,103 +1,14 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Notice that this file has been modified to examplify the use of -# ClearML when used with PyTorch Lightning - -import torch -import torchvision.transforms as T -from torch.nn import functional as F -import torch.nn as nn -from torchmetrics import Accuracy - -from torchvision.datasets.mnist import MNIST -from pytorch_lightning import LightningModule -from pytorch_lightning.utilities.cli import LightningCLI +try: + from lightning.pytorch.cli import LightningCLI + from lightning.pytorch.demos.boring_classes import DemoModel, BoringDataModule +except ImportError: + import sys + print("Module 'lightning' not installed (only available for Python 3.8+") + sys.exit(0) from clearml import Task -class Net(nn.Module): - def __init__(self): - super().__init__() - self.conv1 = nn.Conv2d(1, 32, 3, 1) - self.conv2 = nn.Conv2d(32, 64, 3, 1) - self.dropout1 = nn.Dropout(0.25) - self.dropout2 = nn.Dropout(0.5) - self.fc1 = nn.Linear(9216, 128) - self.fc2 = nn.Linear(128, 10) - - def forward(self, x): - x = self.conv1(x) - x = F.relu(x) - x = self.conv2(x) - x = F.relu(x) - x = F.max_pool2d(x, 2) - x = self.dropout1(x) - x = torch.flatten(x, 1) - x = self.fc1(x) - x = F.relu(x) - x = self.dropout2(x) - x = self.fc2(x) - output = F.log_softmax(x, dim=1) - return output - - -class ImageClassifier(LightningModule): - def __init__(self, model=None, lr=1.0, gamma=0.7, batch_size=32): - super().__init__() - self.save_hyperparameters(ignore="model") - self.model = model or Net() - self.test_acc = Accuracy() - - def forward(self, x): - return self.model(x) - - def training_step(self, batch, batch_idx): - x, y = batch - logits = self.forward(x) - loss = F.nll_loss(logits, y.long()) - return loss - - def test_step(self, batch, batch_idx): - x, y = batch - logits = self.forward(x) - loss = F.nll_loss(logits, y.long()) - self.test_acc(logits, y) - self.log("test_acc", self.test_acc) - self.log("test_loss", loss) - - def configure_optimizers(self): - optimizer = torch.optim.Adadelta(self.model.parameters(), lr=self.hparams.lr) - return [optimizer], [torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=self.hparams.gamma)] - - @property - def transform(self): - return T.Compose([T.ToTensor(), T.Normalize((0.1307,), (0.3081,))]) - - def prepare_data(self) -> None: - MNIST("./data", download=True) - - def train_dataloader(self): - train_dataset = MNIST("./data", train=True, download=False, transform=self.transform) - return torch.utils.data.DataLoader(train_dataset, batch_size=self.hparams.batch_size) - - def test_dataloader(self): - test_dataset = MNIST("./data", train=False, download=False, transform=self.transform) - return torch.utils.data.DataLoader(test_dataset, batch_size=self.hparams.batch_size) - - if __name__ == "__main__": - Task.add_requirements('requirements.txt') - task = Task.init(project_name="example", task_name="pytorch_lightning_jsonargparse") - LightningCLI(ImageClassifier, seed_everything_default=42, save_config_overwrite=True, run=True) + Task.add_requirements("requirements.txt") + Task.init(project_name="example", task_name="pytorch_lightning_jsonargparse") + LightningCLI(DemoModel, BoringDataModule) diff --git a/examples/frameworks/jsonargparse/pytorch_lightning_cli.yml b/examples/frameworks/jsonargparse/pytorch_lightning_cli.yml index fe8c31a1..ab2de927 100644 --- a/examples/frameworks/jsonargparse/pytorch_lightning_cli.yml +++ b/examples/frameworks/jsonargparse/pytorch_lightning_cli.yml @@ -1,12 +1,13 @@ trainer: callbacks: - - class_path: pytorch_lightning.callbacks.LearningRateMonitor + - class_path: lightning.pytorch.callbacks.LearningRateMonitor init_args: logging_interval: epoch - - class_path: pytorch_lightning.callbacks.ModelCheckpoint + - class_path: lightning.pytorch.callbacks.ModelCheckpoint init_args: filename: best save_last: False save_top_k: 1 monitor: loss mode: min + max_epochs: 10 diff --git a/examples/frameworks/jsonargparse/pytorch_lightning_cli_old.py b/examples/frameworks/jsonargparse/pytorch_lightning_cli_old.py new file mode 100644 index 00000000..38cd91c6 --- /dev/null +++ b/examples/frameworks/jsonargparse/pytorch_lightning_cli_old.py @@ -0,0 +1,114 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Notice that this file has been modified to examplify the use of +# ClearML when used with PyTorch Lightning +import sys + +import torch +import torchvision.transforms as T +from torch.nn import functional as F +import torch.nn as nn +from torchmetrics import Accuracy + +from torchvision.datasets.mnist import MNIST +from pytorch_lightning import LightningModule +from clearml import Task +try: + from pytorch_lightning.cli import LightningCLI +except ImportError: + try: + from pytorch_lightning.utilities.cli import LightningCLI + except ImportError: + print("Looks like you are using pytorch_lightning>=2.0. This example only works with older versions") + sys.exit(0) + + +class Net(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +class ImageClassifier(LightningModule): + def __init__(self, model=None, lr=1.0, gamma=0.7, batch_size=32): + super().__init__() + self.save_hyperparameters(ignore="model") + self.model = model or Net() + try: + self.test_acc = Accuracy() + except TypeError: + self.test_acc = Accuracy("binary") + + def forward(self, x): + return self.model(x) + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self.forward(x) + loss = F.nll_loss(logits, y.long()) + return loss + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self.forward(x) + loss = F.nll_loss(logits, y.long()) + self.test_acc(logits, y) + self.log("test_acc", self.test_acc) + self.log("test_loss", loss) + + def configure_optimizers(self): + optimizer = torch.optim.Adadelta(self.model.parameters(), lr=self.hparams.lr) + return [optimizer], [torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=self.hparams.gamma)] + + @property + def transform(self): + return T.Compose([T.ToTensor(), T.Normalize((0.1307,), (0.3081,))]) + + def prepare_data(self) -> None: + MNIST("./data", download=True) + + def train_dataloader(self): + train_dataset = MNIST("./data", train=True, download=False, transform=self.transform) + return torch.utils.data.DataLoader(train_dataset, batch_size=self.hparams.batch_size) + + def test_dataloader(self): + test_dataset = MNIST("./data", train=False, download=False, transform=self.transform) + return torch.utils.data.DataLoader(test_dataset, batch_size=self.hparams.batch_size) + + +if __name__ == "__main__": + Task.add_requirements("requirements.txt") + Task.init(project_name="example", task_name="pytorch_lightning_jsonargparse") + LightningCLI(ImageClassifier, seed_everything_default=42, run=True) diff --git a/examples/frameworks/jsonargparse/pytorch_lightning_cli_old.yml b/examples/frameworks/jsonargparse/pytorch_lightning_cli_old.yml new file mode 100644 index 00000000..d5010ac4 --- /dev/null +++ b/examples/frameworks/jsonargparse/pytorch_lightning_cli_old.yml @@ -0,0 +1,12 @@ +trainer: + callbacks: + - class_path: pytorch_lightning.callbacks.LearningRateMonitor + init_args: + logging_interval: epoch + - class_path: pytorch_lightning.callbacks.ModelCheckpoint + init_args: + filename: best + save_last: False + save_top_k: 1 + monitor: loss + mode: min \ No newline at end of file diff --git a/examples/frameworks/jsonargparse/requirements.txt b/examples/frameworks/jsonargparse/requirements.txt index ee8b5de0..6b54e31b 100644 --- a/examples/frameworks/jsonargparse/requirements.txt +++ b/examples/frameworks/jsonargparse/requirements.txt @@ -1,7 +1,8 @@ -clearml jsonargparse pytorch_lightning torch torchmetrics torchvision docstring_parser +pytorch-lightning[extra] +lightning; python_version >= '3.8' diff --git a/examples/frameworks/openmmlab/openmmlab_cifar10.py b/examples/frameworks/openmmlab/openmmlab_cifar10.py index 4255dd07..bef3105d 100644 --- a/examples/frameworks/openmmlab/openmmlab_cifar10.py +++ b/examples/frameworks/openmmlab/openmmlab_cifar10.py @@ -3,12 +3,11 @@ import torch.nn as nn import torch.nn.functional as F import torch.optim as optim import torchvision.transforms as transforms -from torch.utils.data import DataLoader -from torchvision.datasets import CIFAR10 - from mmcv.parallel import MMDataParallel from mmcv.runner import EpochBasedRunner from mmcv.utils import get_logger +from torch.utils.data import DataLoader +from torchvision.datasets import CIFAR10 class Model(nn.Module): diff --git a/examples/frameworks/openmmlab/requirements.txt b/examples/frameworks/openmmlab/requirements.txt index 010df3e4..fae81627 100644 --- a/examples/frameworks/openmmlab/requirements.txt +++ b/examples/frameworks/openmmlab/requirements.txt @@ -1,4 +1,4 @@ clearml -mmcv>=1.5.1 +mmcv>=1.5.1,<2.0.0 torch torchvision diff --git a/examples/frameworks/pytorch-lightning/pytorch_lightning_example.py b/examples/frameworks/pytorch-lightning/pytorch_lightning_example.py index 91f2d085..337d829e 100644 --- a/examples/frameworks/pytorch-lightning/pytorch_lightning_example.py +++ b/examples/frameworks/pytorch-lightning/pytorch_lightning_example.py @@ -1,13 +1,14 @@ -import os +import sys from argparse import ArgumentParser -import torch + import pytorch_lightning as pl +import torch from torch.nn import functional as F from torch.utils.data import DataLoader, random_split -from clearml import Task - -from torchvision.datasets.mnist import MNIST from torchvision import transforms +from torchvision.datasets.mnist import MNIST + +from clearml import Task class LitClassifier(pl.LightningModule): @@ -35,12 +36,13 @@ class LitClassifier(pl.LightningModule): y_hat = self(x) loss = F.cross_entropy(y_hat, y) self.log('valid_loss', loss) + return loss def test_step(self, batch, batch_idx): x, y = batch y_hat = self(x) loss = F.cross_entropy(y_hat, y) - self.log('test_loss', loss) + return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) @@ -54,19 +56,17 @@ class LitClassifier(pl.LightningModule): if __name__ == '__main__': - # Connecting ClearML with the current process, - # from here on everything is logged automatically - task = Task.init(project_name="examples", task_name="PyTorch lightning MNIST example") - pl.seed_everything(0) parser = ArgumentParser() parser.add_argument('--batch_size', default=32, type=int) - parser = pl.Trainer.add_argparse_args(parser) - parser.set_defaults(max_epochs=3) + parser.add_argument('--max_epochs', default=3, type=int) + sys.argv.extend(['--max_epochs', '2']) parser = LitClassifier.add_model_specific_args(parser) args = parser.parse_args() + Task.init(project_name="examples-internal", task_name="lightning checkpoint issue and argparser") + # ------------ # data # ------------ @@ -74,9 +74,9 @@ if __name__ == '__main__': mnist_test = MNIST('', train=False, download=True, transform=transforms.ToTensor()) mnist_train, mnist_val = random_split(dataset, [55000, 5000]) - train_loader = DataLoader(mnist_train, batch_size=args.batch_size, num_workers=os.cpu_count()) - val_loader = DataLoader(mnist_val, batch_size=args.batch_size, num_workers=os.cpu_count()) - test_loader = DataLoader(mnist_test, batch_size=args.batch_size, num_workers=os.cpu_count()) + train_loader = DataLoader(mnist_train, batch_size=args.batch_size) + val_loader = DataLoader(mnist_val, batch_size=args.batch_size) + test_loader = DataLoader(mnist_test, batch_size=args.batch_size) # ------------ # model @@ -86,7 +86,7 @@ if __name__ == '__main__': # ------------ # training # ------------ - trainer = pl.Trainer.from_argparse_args(args) + trainer = pl.Trainer(max_epochs=args.max_epochs) trainer.fit(model, train_loader, val_loader) # ------------