"""
Memory operations.
reduce_mem_usage_automatic,
total_size,
begin_operation,
end_operation,
sizeof_fmt,
top_mem_vars
"""
from __future__ import annotations
import os
import re
import time
from collections import deque
from itertools import chain
from sys import getsizeof
import numpy as np
import psutil
from pandas import DataFrame
from pymove.utils.log import logger
def reduce_mem_usage_automatic(df: DataFrame) -> None:
    """
    Reduces the memory usage of the given dataframe.

    Every NumPy integer or floating point column is cast, in place, to the
    smallest NumPy dtype whose range contains the column minimum and maximum.
    Columns of any other dtype (including pandas extension dtypes such as
    nullable integers or intervals) are left untouched. The memory usage
    before and after the operation is reported through the module logger.

    Parameters
    ----------
    df : dataframe
        The input data to which the operation will be performed.
        It is modified in place.

    Notes
    -----
    Only the value *range* is checked. Down-casting floating point columns
    keeps the magnitude but may lose precision (e.g. float16 cannot represent
    every integer above 2048).

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from pymove.utils.mem import reduce_mem_usage_automatic
    >>> df = pd.DataFrame({'col_1': np.arange(10000, dtype=np.float64)})
    >>> df.dtypes
    col_1    float64
    dtype: object
    >>> reduce_mem_usage_automatic(df)
    'Memory usage of dataframe is 0.08 MB'
    'Memory usage after optimization is: 0.02 MB'
    'Decreased by 74.9 %'
    >>> df.dtypes
    col_1    float16
    dtype: object
    """
    # Candidates are ordered by item size, signed before unsigned, so the
    # first one that fits is also the smallest one.
    int_candidates = (
        np.int8, np.uint8, np.int16, np.uint16,
        np.int32, np.uint32, np.int64, np.uint64,
    )
    float_candidates = (np.float16, np.float32, np.float64)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    logger.info(f'Memory usage of dataframe is {start_mem:.2f} MB')

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (nullable ints, intervals, categoricals, ...) are
        # not plain NumPy dtypes and must not be cast to a NumPy number type.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in ('i', 'u'):
            candidates, limits = int_candidates, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = float_candidates, np.finfo
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if isinstance(c_min, np.integer):
            # Python ints compare exactly against every iinfo bound,
            # whatever the signedness of the column dtype.
            c_min, c_max = int(c_min), int(c_max)

        # An empty or all-NaN column yields NaN here; every comparison is
        # then False and the column keeps its current dtype.
        for candidate in candidates:
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                if col_type != candidate:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    logger.info(f'Memory usage after optimization is: {end_mem:.2f} MB')

    # Guard the percentage against a dataframe that reports zero bytes.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    logger.info(f'Decreased by {decreased:.1f} %')
def total_size(
    o: object, handlers: dict | None = None, verbose: bool = True
) -> int:
    """
    Calculates the approximate memory footprint of a given object.

    Automatically finds the contents of the following builtin
    containers and their subclasses: tuple, list, deque, dict, set and
    frozenset. Objects reachable more than once are counted only once.

    Parameters
    ----------
    o : object
        The object whose memory footprint is calculated.
    handlers : dict, optional
        To search other containers, add handlers to iterate over their contents,
            handlers = {SomeContainerClass: iter,
                        OtherContainerClass: OtherContainerClass.get_elements}
        User handlers are tried before the builtin ones, so a handler given
        for a subclass of a builtin container is honoured, by default None
    verbose : boolean, optional
        If set to True, the following information will be logged for
        each content of the object, by default True
            - the size of the object in bytes.
            - its type

    Returns
    -------
    int
        The memory used by the given object, in bytes

    Examples
    --------
    >>> import numpy as np
    >>> from pymove.utils.mem import total_size
    >>> arr = np.arange(10000, dtype=np.float64)
    >>> sz = total_size(arr)
    'Size in bytes: 80104, Type: <class 'numpy.ndarray'>'
    >>> sz
    80104
    """
    def dict_handler(d):
        # a dict contributes both its keys and its values
        return chain.from_iterable(d.items())

    builtin_handlers = {
        tuple: iter,
        list: iter,
        deque: iter,
        dict: dict_handler,
        set: iter,
        frozenset: iter,
    }

    # User handlers take precedence: they are inserted first so that the
    # isinstance scan below reaches them before the builtin base classes.
    all_handlers = dict(handlers) if handlers else {}
    for typ, handler in builtin_handlers.items():
        all_handlers.setdefault(typ, handler)

    # track which object ids have already been seen
    seen = set()
    # estimate sizeof object without __sizeof__
    default_size = getsizeof(0)

    def sizeof(obj):
        # do not double count the same object
        if id(obj) in seen:
            return 0
        seen.add(id(obj))
        size = getsizeof(obj, default_size)

        for typ, handler in all_handlers.items():
            if isinstance(obj, typ):
                size += sum(map(sizeof, handler(obj)))
                break

        if verbose:
            logger.info(f'Size in bytes: {size}, Type: {type(obj)}')
        return size

    return sizeof(o)
def begin_operation(name: str) -> dict:
    """
    Captures the starting stats of an operation.

    Parameters
    ----------
    name: str
        name of the operation

    Returns
    -------
    dict
        dictionary with the operation stats:
            - 'process': psutil handle of the current process
            - 'init': first field of the process memory info at call time
            - 'start': wall-clock timestamp (``time.time()``) at call time
            - 'name': the given operation name

    Examples
    --------
    >>> from pymove.utils.mem import begin_operation
    >>> operation = begin_operation('operation')
    >>> operation
    {
        'process': psutil.Process(
            pid=103401, name='python', status='running', started='21:48:11'
        ),
        'init': 293732352, 'start': 1622082973.8825781, 'name': 'operation'
    }
    """
    # Filled in the same order the values are sampled: memory first, then time.
    stats = {'process': psutil.Process(os.getpid())}
    stats['init'] = stats['process'].memory_info()[0]
    stats['start'] = time.time()
    stats['name'] = name
    return stats
def end_operation(operation: dict) -> dict:
    """
    Computes the elapsed time and memory variation of an operation.

    Parameters
    ----------
    operation: dict
        dictionary with the beginning stats of the operation, holding the
        'process', 'name', 'start' and 'init' keys

    Returns
    -------
    dict
        dictionary with the operation execution stats:
            - 'name': name of the operation
            - 'time in seconds': seconds elapsed since ``operation['start']``
            - 'memory': memory variation, formatted by ``sizeof_fmt``

    Examples
    --------
    >>> import numpy as np
    >>> import time
    >>> from pymove.utils.mem import begin_operation, end_operation
    >>> operation = begin_operation('create_arr')
    >>> arr = np.arange(100000, dtype=np.float64)
    >>> time.sleep(1.2)
    >>> end_operation(operation)
    {'name': 'create_arr', 'time in seconds': 1.2022554874420166, 'memory': '752.0 KiB'}
    """
    # Memory is sampled before the clock, mirroring the order of the fields
    # read from the operation dictionary.
    memory_now = operation['process'].memory_info()[0]

    summary = {'name': operation['name']}
    summary['time in seconds'] = time.time() - operation['start']
    summary['memory'] = sizeof_fmt(memory_now - operation['init'])
    return summary
def sizeof_fmt(mem_usage: float, suffix: str = 'B') -> str:
    """
    Formats an amount of memory using binary (1024-based) unit prefixes.

    Parameters
    ----------
    mem_usage : float
        memory usage in bytes
    suffix: string, optional
        suffix of the unit, by default 'B'

    Returns
    -------
    str
        A string of the memory usage in a more readable format, with one
        decimal place and the largest prefix that keeps the magnitude
        below 1024 (capped at 'Yi')

    Examples
    --------
    >>> from pymove.utils.mem import sizeof_fmt
    >>> sizeof_fmt(1024)
    '1.0 KiB'
    >>> sizeof_fmt(2e6)
    '1.9 MiB'
    """
    binary_prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    scaled = mem_usage
    step = 0
    while step < len(binary_prefixes):
        if abs(scaled) < 1024.0:
            return f'{scaled:3.1f} {binary_prefixes[step]}{suffix}'
        scaled = scaled / 1024.0
        step += 1
    # Anything still >= 1024 after 'Zi' is reported in 'Yi', however large.
    return '{:.1f} {}{}'.format(scaled, 'Yi', suffix)
def top_mem_vars(
    variables: dict, n: int = 10, hide_private=True
) -> DataFrame:
    """
    Lists the variables that take the most memory.

    Sizes are measured with ``sys.getsizeof``, i.e. they are shallow and do
    not follow references held by containers.

    Parameters
    ----------
    variables: locals() or globals()
        Mapping of variable names to values, to show local or global variables
    n: int, optional
        number of variables to show, by default 10
    hide_private: bool, optional
        Whether to hide private variables (names starting with '_'),
        by default True

    Returns
    -------
    DataFrame
        dataframe with variables names and sizes, largest first

    Examples
    --------
    >>> import numpy as np
    >>> from pymove.utils.mem import top_mem_vars
    >>> arr = np.arange(100000, dtype=np.float64)
    >>> long_string = 'Hello World!' * 100
    >>> top_mem_vars(locals())
                var        mem
    0           arr  781.4 KiB
    1   long_string    1.2 KiB
    2         local    416.0 B
    3  top_mem_vars    136.0 B
    4            np     72.0 B
    """
    # Every variable is measured first; private names are dropped afterwards.
    measured = [(name, getsizeof(value)) for name, value in variables.items()]
    if hide_private:
        measured = [
            entry for entry in measured if not entry[0].startswith('_')
        ]

    # A stable descending sort keeps equally sized variables in mapping order.
    measured.sort(key=lambda entry: entry[1], reverse=True)

    top_vars = DataFrame(measured[:n], columns=['var', 'mem'])
    top_vars['mem'] = top_vars['mem'].apply(sizeof_fmt)
    return top_vars