# clearml/trains/utilities/resource_monitor.py


import logging
import os
import warnings
from threading import Thread, Event
from time import time
from typing import Text

import psutil
from pathlib2 import Path

from ..binding.frameworks.tensorflow_bind import IsTensorboardInit

try:
    from .gpu import gpustat
except ImportError:
    gpustat = None
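

# NOTE: ResourceMonitor is normally instantiated internally by the Task object;
# the following is a minimal standalone sketch (assuming `task` came from the
# framework, e.g. trains' Task.init()):
#
#     monitor = ResourceMonitor(task)
#     monitor.start()  # spawn the background sampling/reporting daemon thread
#     ...              # training runs; averaged stats are reported as scalars
#     monitor.stop()   # signal the daemon thread to exit on its next sample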
class ResourceMonitor(object):
    _title_machine = ':monitor:machine'
    _title_gpu = ':monitor:gpu'

    def __init__(self, task, sample_frequency_per_sec=2., report_frequency_sec=30.,
                 first_report_sec=None, wait_for_first_iteration_to_start_sec=180.0,
                 max_wait_for_first_iteration_to_start_sec=1800.):
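        """
        Background monitor that periodically samples machine/GPU statistics and
        reports their averages as scalars on the task.

        :param task: task object used for reporting scalars and log text
        :param sample_frequency_per_sec: sampling rate, in samples per second
        :param report_frequency_sec: how often, in seconds, averaged samples are reported
        :param first_report_sec: delay in seconds before the first report
            (defaults to report_frequency_sec)
        :param wait_for_first_iteration_to_start_sec: how long to wait for iteration
            reports before falling back to seconds-from-start as the scalar x-axis
        :param max_wait_for_first_iteration_to_start_sec: how long to keep checking
            for late iteration reporting after the fallback kicked in
        """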
        self._task = task
        self._sample_frequency = sample_frequency_per_sec
        self._report_frequency = report_frequency_sec
        self._first_report_sec = first_report_sec or report_frequency_sec
        self._wait_for_first_iteration = wait_for_first_iteration_to_start_sec
        self._max_check_first_iteration = max_wait_for_first_iteration_to_start_sec
        self._num_readouts = 0
        self._readouts = {}
        self._previous_readouts = {}
        self._previous_readouts_ts = time()
        self._thread = None
        self._exit_event = Event()
        self._gpustat_fail = 0
        self._gpustat = gpustat
        self._active_gpus = None
        if not self._gpustat:
            self._task.get_logger().report_text('TRAINS Monitor: GPU monitoring is not available')
        else:  # if running_remotely():
            # respect CUDA/NVIDIA visible-device masking when selecting GPUs to monitor
            # noinspection PyBroadException
            try:
                active_gpus = os.environ.get('NVIDIA_VISIBLE_DEVICES', '') or \
                              os.environ.get('CUDA_VISIBLE_DEVICES', '')
                if active_gpus:
                    self._active_gpus = [int(g.strip()) for g in active_gpus.split(',')]
            except Exception:
                pass

    def start(self):
        # spawn the sampling loop on a daemon thread, so it never blocks process exit
        self._exit_event.clear()
        self._thread = Thread(target=self._run)
        self._thread.daemon = True
        self._thread.start()

    def stop(self):
        # signal the thread to exit; we deliberately do not join it
        self._exit_event.set()
        # self._thread.join()

    def _run(self):
        # noinspection PyBroadException
        try:
            self._daemon()
        except Exception:
            pass

    def _daemon(self):
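        """
        Main sampling/reporting loop (runs on the daemon thread).

        ``fallback_to_sec_as_iterations`` is a tri-state flag:
          * None  - undecided; keep sampling, but do not report scalars yet
          * False - the task reports iterations (or Tensorboard is in use);
                    report against the task's last iteration
          * True  - no iteration reporting detected in time; report against
                    seconds-from-start instead
        """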
        seconds_since_started = 0
        reported = 0
        last_iteration = 0
        fallback_to_sec_as_iterations = None

        # get max GPU ID, and make sure our active list is within range
        if self._active_gpus:
            # noinspection PyBroadException
            try:
                gpu_stat = self._gpustat.new_query()
                if max(self._active_gpus) > len(gpu_stat.gpus) - 1:
                    self._active_gpus = None
            except Exception:
                pass

        # last_iteration_interval = None
        # last_iteration_ts = 0
        # repeated_iterations = 0
        while True:
            last_report = time()
            current_report_frequency = self._report_frequency if reported != 0 else self._first_report_sec
            while (time() - last_report) < current_report_frequency:
                # sleep for one sample interval (1 / sample_frequency seconds); quit if the exit event is set
                if self._exit_event.wait(1.0 / self._sample_frequency):
                    return
                # noinspection PyBroadException
                try:
                    self._update_readouts()
                except Exception:
                    pass

            seconds_since_started += int(round(time() - last_report))
            # if no metric is being reported, the task's last iteration will not advance,
            # so decide which x-axis to use
            if fallback_to_sec_as_iterations is None:
                if IsTensorboardInit.tensorboard_used():
                    fallback_to_sec_as_iterations = False
                elif seconds_since_started >= self._wait_for_first_iteration:
                    self._task.get_logger().report_text('TRAINS Monitor: Could not detect iteration reporting, '
                                                        'falling back to iterations as seconds-from-start')
                    fallback_to_sec_as_iterations = True
            elif fallback_to_sec_as_iterations is True and seconds_since_started <= self._max_check_first_iteration:
                if self._check_logger_reported():
                    fallback_to_sec_as_iterations = False
                    self._task.get_logger().report_text('TRAINS Monitor: Reporting detected, '
                                                        'reverting back to iteration based reporting')

            clear_readouts = True
            # if we do not have last_iteration, we just use seconds as iteration
            if fallback_to_sec_as_iterations:
                iteration = seconds_since_started
            else:
                iteration = self._task.get_last_iteration()
                if iteration < last_iteration:
                    # we started a new session?! wait it out, keep accumulating readouts
                    clear_readouts = False
                    iteration = last_iteration
                elif iteration == last_iteration:
                    # repeated_iterations += 1
                    # if last_iteration_interval:
                    #     # to be on the safe side, we don't want to pass the actual next iteration
                    #     iteration += int(0.95 * last_iteration_interval[0] * (
                    #         seconds_since_started - last_iteration_ts) / last_iteration_interval[1])
                    # else:
                    #     iteration += 1
                    clear_readouts = False
                    iteration = last_iteration
                else:
                    # last_iteration_interval = (iteration - last_iteration, seconds_since_started - last_iteration_ts)
                    # repeated_iterations = 0
                    # last_iteration_ts = seconds_since_started
                    last_iteration = iteration
                    fallback_to_sec_as_iterations = False
                    clear_readouts = True

            # start reporting only once we have figured out whether this is seconds-based or iteration-based
            average_readouts = self._get_average_readouts()
            if fallback_to_sec_as_iterations is not None:
                for k, v in average_readouts.items():
                    # noinspection PyBroadException
                    try:
                        title = self._title_gpu if k.startswith('gpu_') else self._title_machine
                        # round to 3 digits after the decimal point
                        value = round(v * 1000) / 1000.
                        self._task.get_logger().report_scalar(title=title, series=k, iteration=iteration, value=value)
                    except Exception:
                        pass
                # clear readouts unless this update should keep averaging into the next one
                if clear_readouts:
                    self._clear_readouts()

            # count reported iterations
            reported += 1

    def _update_readouts(self):
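        """
        Take one stats sample and accumulate it into ``self._readouts``.

        Keys ending in ``_mbs`` are cumulative OS counters (total MB since boot),
        so they are converted to rates by differencing against the previous
        sample: e.g. if ``network_tx_mbs`` read 1000 MB four seconds ago and
        reads 1008 MB now, (1008 - 1000) / 4 = 2 MB/s is accumulated.
        """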
        readouts = self._machine_stats()
        elapsed = time() - self._previous_readouts_ts
        self._previous_readouts_ts = time()
        for k, v in readouts.items():
            # cumulative measurements
            if k.endswith('_mbs'):
                v = (v - self._previous_readouts.get(k, v)) / elapsed
            self._readouts[k] = self._readouts.get(k, 0.0) + v
        self._num_readouts += 1
        self._previous_readouts = readouts

    def _get_num_readouts(self):
        return self._num_readouts

    def _get_average_readouts(self):
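        """
        Return the mean of each accumulated readout since the last clear:
        every value in ``self._readouts`` is a running sum over
        ``self._num_readouts`` samples.
        """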
        average_readouts = dict((k, v / float(self._num_readouts)) for k, v in self._readouts.items())
        return average_readouts

    def _clear_readouts(self):
        self._readouts = {}
        self._num_readouts = 0

    def _machine_stats(self):
        """
        :return: machine stats dictionary. Values are percentages (CPU/GPU usage,
            disk), GB (memory, GPU memory), degrees Celsius (temperatures), or
            cumulative MB (network/disk I/O counters, converted to MB/s rates
            by _update_readouts)
        """
        cpu_usage = [float(v) for v in psutil.cpu_percent(percpu=True)]
        stats = {
            "cpu_usage": sum(cpu_usage) / float(len(cpu_usage)),
        }

        bytes_per_megabyte = 1024 ** 2

        def bytes_to_megabytes(x):
            return x / bytes_per_megabyte

        virtual_memory = psutil.virtual_memory()
        stats["memory_used_gb"] = bytes_to_megabytes(virtual_memory.used) / 1024
        stats["memory_free_gb"] = bytes_to_megabytes(virtual_memory.available) / 1024
        disk_use_percentage = psutil.disk_usage(Text(Path.home())).percent
        stats["disk_free_percent"] = 100.0 - disk_use_percentage
        with warnings.catch_warnings():
            if logging.root.level > logging.DEBUG:
                # if the logging level is above DEBUG, ignore psutil.sensors_temperatures warnings
                warnings.simplefilter("ignore", category=RuntimeWarning)
            sensor_stat = (psutil.sensors_temperatures() if hasattr(psutil, "sensors_temperatures") else {})
        if "coretemp" in sensor_stat and len(sensor_stat["coretemp"]):
            stats["cpu_temperature"] = max([float(t.current) for t in sensor_stat["coretemp"]])

        # update cached measurements
        net_stats = psutil.net_io_counters()
        stats["network_tx_mbs"] = bytes_to_megabytes(net_stats.bytes_sent)
        stats["network_rx_mbs"] = bytes_to_megabytes(net_stats.bytes_recv)
        io_stats = psutil.disk_io_counters()
        stats["io_read_mbs"] = bytes_to_megabytes(io_stats.read_bytes)
        stats["io_write_mbs"] = bytes_to_megabytes(io_stats.write_bytes)
        # check if we can access the gpu statistics
        if self._gpustat:
            # noinspection PyBroadException
            try:
                gpu_stat = self._gpustat.new_query()
                for i, g in enumerate(gpu_stat.gpus):
                    # only monitor the active GPUs; if none were selected, monitor everything
                    if self._active_gpus and i not in self._active_gpus:
                        continue
                    stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
                    stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
                    stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
                    # gpustat memory values are already in MBs
                    stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
                    stats["gpu_%d_mem_used_gb" % i] = float(g["memory.used"]) / 1024
            except Exception:
                # something happened and we can't use gpu stats
                self._gpustat_fail += 1
                if self._gpustat_fail >= 3:
                    self._task.get_logger().report_text('TRAINS Monitor: GPU monitoring failed getting GPU reading, '
                                                        'switching off GPU monitoring')
                    self._gpustat = None

        return stats

    def _check_logger_reported(self):
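        """
        Return True if any scalar title other than the monitor's own
        ':monitor:machine' / ':monitor:gpu' titles was reported, i.e. real
        iteration-based reporting has started.
        """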
        titles = list(self._task.get_logger()._get_used_title_series().keys())
        try:
            titles.remove(self._title_machine)
        except ValueError:
            pass
        try:
            titles.remove(self._title_gpu)
        except ValueError:
            pass
        return len(titles) > 0