diff --git a/trains/utilities/resource_monitor.py b/trains/utilities/resource_monitor.py
index 9441f038..9e9dbd68 100644
--- a/trains/utilities/resource_monitor.py
+++ b/trains/utilities/resource_monitor.py
@@ -38,7 +38,8 @@ class ResourceMonitor(object):
         self._gpustat = gpustat
         self._active_gpus = None
         self._process_info = psutil.Process() if report_mem_used_per_process else None
-        self._last_process_pool = None
+        self._last_process_pool = {}
+        self._last_process_id_list = []
         if not self._gpustat:
             self._task.get_logger().report_text('TRAINS Monitor: GPU monitoring is not available')
         else:  # if running_remotely():
@@ -224,17 +225,7 @@ class ResourceMonitor(object):
         # check if we can access the gpu statistics
         if self._gpustat:
             try:
-                gpu_stat = self._gpustat.new_query()
-                for i, g in enumerate(gpu_stat.gpus):
-                    # only monitor the active gpu's, if none were selected, monitor everything
-                    if self._active_gpus and i not in self._active_gpus:
-                        continue
-                    stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
-                    stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
-                    stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
-                    # already in MBs
-                    stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
-                    stats["gpu_%d_mem_used_gb" % i] = float(g["memory.used"]) / 1024
+                stats.update(self._get_gpu_stats())
             except Exception:
                 # something happened and we can't use gpu stats,
                 self._gpustat_fail += 1
@@ -264,6 +255,7 @@ class ResourceMonitor(object):
     def _get_process_used_memory(self):
         def mem_usage_children(a_mem_size, pr, parent_mem=None):
+            self._last_process_id_list.append(pr.pid)
             # add out memory usage
             our_mem = pr.memory_info()
             mem_diff = our_mem.rss - parent_mem.rss if parent_mem else our_mem.rss
@@ -279,13 +271,56 @@ class ResourceMonitor(object):
         # only run the memory usage query once per reporting period
         # because this memory query is relatively slow, and changes very little.
-        if self._last_process_pool and (time() - self._last_process_pool[0]) < 0.01*self._report_frequency:
-            return self._last_process_pool[1]
+        if self._last_process_pool.get('cpu') and \
+                (time() - self._last_process_pool['cpu'][0]) < self._report_frequency:
+            return self._last_process_pool['cpu'][1]
         # if we have no parent process, return 0 (it's an error)
         if not self._process_info:
             return 0
+        self._last_process_id_list = []
         mem_size = mem_usage_children(0, self._process_info)
-        self._last_process_pool = time(), mem_size
+        self._last_process_pool['cpu'] = time(), mem_size
         return mem_size
+
+    def _get_gpu_stats(self):
+        if not self._gpustat:
+            return {}
+
+        # the per-process memory query is slow, so we only call it once per reporting period;
+        # on the rest of the samples we return the previous memory measurement
+
+        # update mem used by our process and sub processes
+        if self._process_info and (not self._last_process_pool.get('gpu') or
+                                   (time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
+            gpu_stat = self._gpustat.new_query(per_process_stats=True)
+            gpu_mem = {}
+            for i, g in enumerate(gpu_stat.gpus):
+                gpu_mem[i] = 0
+                for p in g.processes:
+                    if p['pid'] in self._last_process_id_list:
+                        gpu_mem[i] += p.get('gpu_memory_usage', 0)
+            self._last_process_pool['gpu'] = time(), gpu_mem
+        else:
+            # if we do not need to update the memory usage, run a global query;
+            # if we have no parent process (backward compatibility), return global stats
+            gpu_stat = self._gpustat.new_query()
+            gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None
+
+        # generate the statistics dict for the actual report
+        stats = {}
+        for i, g in enumerate(gpu_stat.gpus):
+            # only monitor the active gpus; if none were selected, monitor everything
+            if self._active_gpus and i not in self._active_gpus:
+                continue
+            stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
+            stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
+            stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
+            # already in MBs
+            stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
+            # use the previously sampled per-process gpu memory, or the global used memory if it does not exist
+            stats["gpu_%d_mem_used_gb" % i] = float(gpu_mem[i] if gpu_mem else g["memory.used"]) / 1024
+
+        return stats
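
Review note: the core of this change is that `self._last_process_pool` becomes a dict keyed by `'cpu'` and `'gpu'`, so both expensive per-process queries share the same once-per-reporting-period throttle. As a standalone illustration, that throttling pattern reduces to the sketch below (hypothetical names, not code from this PR):

```python
from time import time

class ThrottledSampler(object):
    """Cache an expensive sample for one reporting period (hypothetical helper)."""

    def __init__(self, sample_fn, period_sec):
        self._sample_fn = sample_fn    # expensive query, e.g. per-process memory usage
        self._period_sec = period_sec  # plays the role of self._report_frequency
        self._last = None              # (timestamp, value) of the last real sample

    def get(self):
        # reuse the cached value while it is younger than one reporting period
        if self._last and (time() - self._last[0]) < self._period_sec:
            return self._last[1]
        value = self._sample_fn()
        self._last = time(), value
        return value
```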
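Similarly, the per-process GPU memory aggregation in `_get_gpu_stats` boils down to matching gpustat's per-process entries against the pids collected by `mem_usage_children`. A minimal sketch, assuming the same vendored `gpustat` module with the `per_process_stats` extension used above (`used_gpu_mem_by_pids` is a hypothetical wrapper):

```python
def used_gpu_mem_by_pids(gpustat_module, pids):
    # query once with per-process stats (slow), then sum the memory (MBs)
    # owned by the given process tree, keyed by GPU index
    gpu_stat = gpustat_module.new_query(per_process_stats=True)
    return {
        i: sum(p.get('gpu_memory_usage', 0)
               for p in g.processes if p['pid'] in pids)
        for i, g in enumerate(gpu_stat.gpus)
    }
```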