# mirror of https://github.com/clearml/clearml (synced 2025-02-01 09:36:49 +00:00)

import logging
import os
import warnings
from time import time
from threading import Thread, Event

import psutil
from pathlib2 import Path
from typing import Text

from ..binding.frameworks.tensorflow_bind import IsTensorboardInit

try:
    from .gpu import gpustat
except ImportError:
    gpustat = None


class ResourceMonitor(object):
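    """
    Background resource monitor: periodically samples machine and GPU statistics
    (CPU, memory, disk, network, GPU utilization/memory/temperature) and reports
    the averaged values as scalars to the task logger under the
    ':monitor:machine' and ':monitor:gpu' titles.

    Typical usage (a minimal sketch; assumes a valid Task instance named ``task``):

        monitor = ResourceMonitor(task, report_frequency_sec=30.)
        monitor.start()   # begins sampling in a background daemon thread
        ...               # run the actual workload
        monitor.stop()    # signals the background thread to exit
    """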
    _title_machine = ':monitor:machine'
    _title_gpu = ':monitor:gpu'

    def __init__(self, task, sample_frequency_per_sec=2., report_frequency_sec=30.,
                 first_report_sec=None, wait_for_first_iteration_to_start_sec=180.0,
                 max_wait_for_first_iteration_to_start_sec=1800., report_mem_used_per_process=True):
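        """
        :param task: Task whose logger receives the resource reports
        :param sample_frequency_per_sec: how many times per second to sample machine/GPU stats
        :param report_frequency_sec: how often (seconds) the averaged samples are reported as scalars
        :param first_report_sec: seconds until the first report (defaults to report_frequency_sec)
        :param wait_for_first_iteration_to_start_sec: seconds to wait for iteration reporting to start
            before falling back to seconds-from-start as the scalar iteration value
        :param max_wait_for_first_iteration_to_start_sec: maximum seconds to keep checking whether
            iteration reporting has started once the fallback kicked in
        :param report_mem_used_per_process: if True, report memory used by this process tree
            instead of the machine-wide used memory
        """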
        self._task = task
        self._sample_frequency = sample_frequency_per_sec
        self._report_frequency = report_frequency_sec
        self._first_report_sec = first_report_sec or report_frequency_sec
        self._wait_for_first_iteration = wait_for_first_iteration_to_start_sec
        self._max_check_first_iteration = max_wait_for_first_iteration_to_start_sec
        self._num_readouts = 0
        self._readouts = {}
        self._previous_readouts = {}
        self._previous_readouts_ts = time()
        self._thread = None
        self._exit_event = Event()
        self._gpustat_fail = 0
        self._gpustat = gpustat
        self._active_gpus = None
        self._process_info = psutil.Process() if report_mem_used_per_process else None
        self._last_process_pool = {}
        self._last_process_id_list = []
        if not self._gpustat:
            self._task.get_logger().report_text('TRAINS Monitor: GPU monitoring is not available')
        else:  # if running_remotely():
            try:
                active_gpus = os.environ.get('NVIDIA_VISIBLE_DEVICES', '') or \
                    os.environ.get('CUDA_VISIBLE_DEVICES', '')
                if active_gpus:
                    self._active_gpus = [int(g.strip()) for g in active_gpus.split(',')]
            except Exception:
                pass

    def start(self):
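        """Clear the exit event and start the sampling loop in a background daemon thread."""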
        self._exit_event.clear()
        self._thread = Thread(target=self._run)
        self._thread.daemon = True
        self._thread.start()

    def stop(self):
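        """Signal the background thread to exit (the thread is a daemon, so it is not joined)."""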
        self._exit_event.set()
        # self._thread.join()

    def _run(self):
        # noinspection PyBroadException
        try:
            self._daemon()
        except Exception:
            pass

    def _daemon(self):
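        """
        Main sampling/reporting loop (runs in the background thread).

        Samples machine/GPU stats at the configured sample frequency, and every report period
        sends the averaged values as scalars. The scalar "iteration" is taken from the task's
        last reported iteration; if no iteration reporting is detected within
        wait_for_first_iteration_to_start_sec, it falls back to seconds-from-start instead.
        """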
        seconds_since_started = 0
        reported = 0
        last_iteration = 0
        fallback_to_sec_as_iterations = None

        # get max GPU ID, and make sure our active list is within range
        if self._active_gpus:
            try:
                gpu_stat = self._gpustat.new_query()
                if max(self._active_gpus) > len(gpu_stat.gpus) - 1:
                    self._active_gpus = None
            except Exception:
                pass

        # last_iteration_interval = None
        # last_iteration_ts = 0
        # repeated_iterations = 0
        while True:
            last_report = time()
            current_report_frequency = self._report_frequency if reported != 0 else self._first_report_sec
            while (time() - last_report) < current_report_frequency:
                # sleep for 1/sample_frequency seconds; if the exit event is set, quit
                if self._exit_event.wait(1.0 / self._sample_frequency):
                    return
                # noinspection PyBroadException
                try:
                    self._update_readouts()
                except Exception:
                    pass

            seconds_since_started += int(round(time() - last_report))
            # check whether any metric is being reported at all
            # (if not, the last iteration will never change)
            if fallback_to_sec_as_iterations is None:
                if IsTensorboardInit.tensorboard_used():
                    fallback_to_sec_as_iterations = False
                elif seconds_since_started >= self._wait_for_first_iteration:
                    self._task.get_logger().report_text('TRAINS Monitor: Could not detect iteration reporting, '
                                                        'falling back to iterations as seconds-from-start')
                    fallback_to_sec_as_iterations = True
            elif fallback_to_sec_as_iterations is True and seconds_since_started <= self._max_check_first_iteration:
                if self._check_logger_reported():
                    fallback_to_sec_as_iterations = False
                    self._task.get_logger().report_text('TRAINS Monitor: Reporting detected, '
                                                        'reverting back to iteration based reporting')

            clear_readouts = True
            # if we do not have last_iteration, we just use seconds as iteration
            if fallback_to_sec_as_iterations:
                iteration = seconds_since_started
            else:
                iteration = self._task.get_last_iteration()
                if iteration < last_iteration:
                    # we started a new session?!
                    # wait it out
                    clear_readouts = False
                    iteration = last_iteration
                elif iteration == last_iteration:
                    # repeated_iterations += 1
                    # if last_iteration_interval:
                    #     # to be on the safe side, we don't want to pass the actual next iteration
                    #     iteration += int(0.95*last_iteration_interval[0] * (seconds_since_started - last_iteration_ts)
                    #                      / last_iteration_interval[1])
                    # else:
                    #     iteration += 1
                    clear_readouts = False
                    iteration = last_iteration
                else:
                    # last_iteration_interval = (iteration - last_iteration, seconds_since_started - last_iteration_ts)
                    # repeated_iterations = 0
                    # last_iteration_ts = seconds_since_started
                    last_iteration = iteration
                    fallback_to_sec_as_iterations = False
                    clear_readouts = True

            # start reporting only once we know whether this is seconds-based or iteration-based
            average_readouts = self._get_average_readouts()
            if fallback_to_sec_as_iterations is not None:
                for k, v in average_readouts.items():
                    # noinspection PyBroadException
                    try:
                        title = self._title_gpu if k.startswith('gpu_') else self._title_machine
                        # keep 3 digits after the decimal point
                        value = round(v * 1000) / 1000.
                        self._task.get_logger().report_scalar(title=title, series=k, iteration=iteration, value=value)
                    except Exception:
                        pass
                # clear the readouts unless this update is still being averaged (iteration did not advance)
                if clear_readouts:
                    self._clear_readouts()

            # count reported iterations
            reported += 1

    def _update_readouts(self):
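        """
        Take one sample of the machine stats and add it to the running sums used for averaging.
        Counters that are cumulative (the '*_mbs' network/disk keys) are first converted to a
        per-second rate using the time elapsed since the previous sample.
        """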
        readouts = self._machine_stats()
        elapsed = time() - self._previous_readouts_ts
        self._previous_readouts_ts = time()
        for k, v in readouts.items():
            # cumulative measurements: convert to a per-second rate since the previous sample
            if k.endswith('_mbs'):
                v = (v - self._previous_readouts.get(k, v)) / elapsed

            self._readouts[k] = self._readouts.get(k, 0.0) + v
        self._num_readouts += 1

        self._previous_readouts = readouts

    def _get_num_readouts(self):
        return self._num_readouts

    def _get_average_readouts(self):
        average_readouts = dict((k, v / float(self._num_readouts)) for k, v in self._readouts.items())
        return average_readouts

    def _clear_readouts(self):
        self._readouts = {}
        self._num_readouts = 0

    def _machine_stats(self):
        """
        :return: machine stats dictionary: CPU usage and free disk space in percent, CPU temperature
            in degrees Celsius, memory in GB, and network/disk I/O as cumulative megabyte counters
            (converted to MB/sec rates in _update_readouts)
        """
        cpu_usage = [float(v) for v in psutil.cpu_percent(percpu=True)]
        stats = {
            "cpu_usage": sum(cpu_usage) / float(len(cpu_usage)),
        }

        bytes_per_megabyte = 1024 ** 2

        def bytes_to_megabytes(x):
            return x / bytes_per_megabyte

        virtual_memory = psutil.virtual_memory()
        # stats["memory_used_gb"] = bytes_to_megabytes(virtual_memory.used) / 1024
        stats["memory_used_gb"] = bytes_to_megabytes(
            self._get_process_used_memory() if self._process_info else virtual_memory.used) / 1024
        stats["memory_free_gb"] = bytes_to_megabytes(virtual_memory.available) / 1024
        disk_use_percentage = psutil.disk_usage(Text(Path.home())).percent
        stats["disk_free_percent"] = 100.0 - disk_use_percentage
        with warnings.catch_warnings():
            if logging.root.level > logging.DEBUG:
                # if the logging level is above DEBUG, ignore psutil.sensors_temperatures warnings
                warnings.simplefilter("ignore", category=RuntimeWarning)
            sensor_stat = (psutil.sensors_temperatures() if hasattr(psutil, "sensors_temperatures") else {})
            if "coretemp" in sensor_stat and len(sensor_stat["coretemp"]):
                stats["cpu_temperature"] = max([float(t.current) for t in sensor_stat["coretemp"]])

        # cumulative network/disk counters (converted to rates by _update_readouts)
        net_stats = psutil.net_io_counters()
        stats["network_tx_mbs"] = bytes_to_megabytes(net_stats.bytes_sent)
        stats["network_rx_mbs"] = bytes_to_megabytes(net_stats.bytes_recv)
        io_stats = psutil.disk_io_counters()
        stats["io_read_mbs"] = bytes_to_megabytes(io_stats.read_bytes)
        stats["io_write_mbs"] = bytes_to_megabytes(io_stats.write_bytes)

        # check if we can access the gpu statistics
        if self._gpustat:
            try:
                stats.update(self._get_gpu_stats())
            except Exception:
                # something happened and we can't use gpu stats
                self._gpustat_fail += 1
                if self._gpustat_fail >= 3:
                    self._task.get_logger().report_text('TRAINS Monitor: GPU monitoring failed getting GPU reading, '
                                                        'switching off GPU monitoring')
                    self._gpustat = None

        return stats

    def _check_logger_reported(self):
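        """Return True if anything other than the monitor's own machine/GPU titles was reported to the logger."""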
        titles = self.get_logger_reported_titles(self._task)
        return len(titles) > 0

    @classmethod
    def get_logger_reported_titles(cls, task):
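        """Return the titles already reported on the task logger, excluding the resource-monitor titles."""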
        titles = list(task.get_logger()._get_used_title_series().keys())
        try:
            titles.remove(cls._title_machine)
        except ValueError:
            pass
        try:
            titles.remove(cls._title_gpu)
        except ValueError:
            pass
        return titles

    def _get_process_used_memory(self):
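        """
        Return the resident memory (bytes) used by this process and all of its children, recursively.
        For each child only the RSS above its parent's reading is added, as a best-effort way to avoid
        double counting memory shared with the parent. The query is relatively slow, so the result is
        cached and refreshed at most once per report period.
        """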
        def mem_usage_children(a_mem_size, pr, parent_mem=None):
            self._last_process_id_list.append(pr.pid)
            # add our own memory usage
            our_mem = pr.memory_info()
            mem_diff = our_mem.rss - parent_mem.rss if parent_mem else our_mem.rss
            a_mem_size += mem_diff if mem_diff > 0 else 0
            # now we are the parent
            for child in pr.children():
                # get the current memory
                m = pr.memory_info()
                mem_diff = m.rss - our_mem.rss
                a_mem_size += mem_diff if mem_diff > 0 else 0
                a_mem_size = mem_usage_children(a_mem_size, child, parent_mem=m)
            return a_mem_size

        # only run the memory usage query once per reporting period,
        # because this memory query is relatively slow and changes very little
        if self._last_process_pool.get('cpu') and \
                (time() - self._last_process_pool['cpu'][0]) < self._report_frequency:
            return self._last_process_pool['cpu'][1]

        # if we have no parent process, return 0 (it's an error)
        if not self._process_info:
            return 0

        self._last_process_id_list = []
        mem_size = mem_usage_children(0, self._process_info)
        self._last_process_pool['cpu'] = time(), mem_size

        return mem_size

    def _get_gpu_stats(self):
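        """
        Return per-GPU statistics (temperature, utilization, memory usage and free memory) for the
        active GPUs. When per-process reporting is enabled, the GPU memory attributed to this process
        tree is queried at most once per report period and cached; other samples reuse the cached
        value or fall back to the global GPU memory reading.
        """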
        if not self._gpustat:
            return {}

        # the per-process memory query is slow, so we only call it once per reporting period;
        # for the rest of the samples we reuse the previous memory measurement

        # update memory used by our process and sub-processes
        if self._process_info and (not self._last_process_pool.get('gpu') or
                                   (time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
            gpu_stat = self._gpustat.new_query(per_process_stats=True)
            gpu_mem = {}
            for i, g in enumerate(gpu_stat.gpus):
                gpu_mem[i] = 0
                for p in g.processes:
                    if p['pid'] in self._last_process_id_list:
                        gpu_mem[i] += p.get('gpu_memory_usage', 0)
            self._last_process_pool['gpu'] = time(), gpu_mem
        else:
            # if we do not need to update the memory usage, run a global query;
            # if we have no parent process (backwards compatibility), return global stats
            gpu_stat = self._gpustat.new_query()
            gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None

        # generate the statistics dict for the actual report
        stats = {}
        for i, g in enumerate(gpu_stat.gpus):
            # only monitor the active GPUs; if none were selected, monitor everything
            if self._active_gpus and i not in self._active_gpus:
                continue
            stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
            stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
            stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
            # memory values are already in MBs
            stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
            # use the previously sampled per-process gpu memory, or the global value if it does not exist
            stats["gpu_%d_mem_used_gb" % i] = float(gpu_mem[i] if gpu_mem else g["memory.used"]) / 1024

        return stats