"""PandasMoveDataFrame class."""
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Callable
import numpy as np
from pandas import DataFrame, DateOffset, Series, Timedelta
from pymove.core.dataframe import MoveDataFrame
from pymove.core.grid import Grid
from pymove.utils.constants import (
DATE,
DATETIME,
DAY,
DAY_PERIODS,
DIST_PREV_TO_NEXT,
DIST_TO_NEXT,
DIST_TO_PREV,
HOUR,
HOUR_COS,
HOUR_SIN,
LATITUDE,
LOCAL_LABEL,
LONGITUDE,
MOVE,
PERIOD,
SITUATION,
SPEED_PREV_TO_NEXT,
SPEED_TO_NEXT,
SPEED_TO_PREV,
STOP,
TID,
TIME_PREV_TO_NEXT,
TIME_TO_NEXT,
TIME_TO_PREV,
TRAJ_ID,
TYPE_DASK,
TYPE_PANDAS,
UID,
WEEK_DAYS,
WEEK_END,
)
from pymove.utils.conversions import lat_meters
from pymove.utils.distances import haversine
from pymove.utils.log import logger, progress_bar
from pymove.utils.mem import begin_operation, end_operation
from pymove.utils.trajectories import shift
if TYPE_CHECKING:
from pymove.core.dask import DaskMoveDataFrame
class PandasMoveDataFrame(DataFrame):
    """PyMove dataframe extending Pandas DataFrame."""
def __init__(
self,
data: DataFrame | list | dict,
latitude: str = LATITUDE,
longitude: str = LONGITUDE,
datetime: str = DATETIME,
traj_id: str = TRAJ_ID,
):
"""
Checks whether past data has 'lat', 'lon', 'datetime' columns.
Renames it with the PyMove lib standard. After starts the
attributes of the class.
- self._mgr : Represents trajectory data.
- self._type : Represents the type of layer below the data structure.
- self.last_operation : Represents the last operation performed.
Parameters
----------
data : DataFrame or list or dict
Input trajectory data
latitude : str, optional
Represents column name latitude, by default LATITUDE
longitude : str, optional
Represents column name longitude, by default LONGITUDE
datetime : str, optional
Represents column name datetime, by default DATETIME
traj_id : str, optional
Represents column name trajectory id, by default TRAJ_ID
Raises
------
KeyError
If missing one of lat, lon, datetime columns
ValueError, ParserError
If the data types can't be converted
"""
if isinstance(data, dict):
data = DataFrame.from_dict(data)
elif isinstance(data, DataFrame):
data = DataFrame(data)
elif (
isinstance(data, list) or isinstance(data, np.ndarray)
):
zip_list = [LATITUDE, LONGITUDE, DATETIME, TRAJ_ID]
for i in range(len(data[0])):
try:
zip_list[i] = zip_list[i]
except KeyError:
zip_list.append(str(i))
data = DataFrame(data, columns=zip_list)
columns = MoveDataFrame.format_labels(
traj_id, latitude, longitude, datetime
)
tdf = data.rename(columns=columns)
if MoveDataFrame.has_columns(tdf):
MoveDataFrame.validate_move_data_frame(tdf)
super().__init__(tdf)
self._type = TYPE_PANDAS
self.last_operation: dict = None # type: ignore[assignment]
else:
raise KeyError(
'Couldn\'t instantiate MoveDataFrame because data has missing columns.'
)
@property
def lat(self) -> Series:
"""
Checks for the LATITUDE column and returns its value.
Returns
-------
Series
LATITUDE column
Raises
------
AttributeError
If the LATITUDE column is not present in the DataFrame
"""
if LATITUDE not in self:
raise AttributeError(
"The MoveDataFrame does not contain the column '%s.'"
% LATITUDE
)
return self[LATITUDE]
@property
def lng(self) -> Series:
"""
Checks for the LONGITUDE column and returns its value.
Returns
-------
Series
LONGITUDE column
Raises
------
AttributeError
If the LONGITUDE column is not present in the DataFrame
"""
if LONGITUDE not in self:
raise AttributeError(
"The MoveDataFrame does not contain the column '%s.'"
% LONGITUDE
)
return self[LONGITUDE]
@property
def datetime(self) -> Series:
"""
Checks for the DATETIME column and returns its value.
Returns
-------
Series
DATETIME column
Raises
------
AttributeError
If the DATETIME column is not present in the DataFrame
"""
if DATETIME not in self:
raise AttributeError(
"The MoveDataFrame does not contain the column '%s.'"
% DATETIME
)
return self[DATETIME]
    def rename(
        self,
        mapper: dict | Callable | None = None,
        index: dict | Callable | None = None,
        columns: dict | Callable | None = None,
        axis: int | str | None = None,
        copy: bool = True,
        inplace: bool = False
    ) -> 'PandasMoveDataFrame' | DataFrame | None:
        """
        Alter axes labels.

        Function / dict values must be unique (1-to-1).
        Labels not contained in a dict / Series will be left as-is.
        Extra labels listed don’t throw an error.

        Parameters
        ----------
        mapper : dict or function, optional
            Dict-like or functions transformations to apply to that axis’ values.
            Use either mapper and axis to specify the axis to target
            with mapper, or index and columns, by default None
        index : dict or function, optional
            Alternative to specifying axis
            (mapper, axis=0 is equivalent to index=mapper), by default None
        columns : dict or function, optional
            Alternative to specifying axis
            (mapper, axis=1 is equivalent to columns=mapper), by default None
        axis : int or str, optional
            Axis to target with mapper.
            Can be either the axis name (‘index’, ‘columns’) or number (0, 1),
            by default None
        copy : bool, optional
            Also copy underlying data, by default True
        inplace : bool, optional
            Whether to return a new DataFrame.
            If True then value of copy is ignored, by default False

        Returns
        -------
        PandasMoveDataFrame, DataFrame
            DataFrame with the renamed axis labels or None

        Raises
        ------
        AttributeError
            If trying to rename a required column inplace

        """
        # Always delegate with the default (non-inplace) behavior so the
        # renamed result can be validated before it is adopted.
        rename_ = super().rename(
            mapper=mapper, index=index, columns=columns, axis=axis, copy=copy
        )
        if inplace:
            if MoveDataFrame.has_columns(rename_):
                # Adopt the renamed data by swapping in its block manager and
                # clearing the cached column accessors (pandas internals).
                self._mgr = rename_._mgr
                self._item_cache: dict = dict()
                rename_ = None
            else:
                # Renaming away a required column (lat/lon/datetime) would
                # break the MoveDataFrame invariant, so refuse it in place.
                raise AttributeError(
                    'Could not rename columns lat, lon, and datetime.'
                )
        # Non-inplace path: re-wrap as a PandasMoveDataFrame only when the
        # result still carries the required columns; otherwise return the
        # plain DataFrame produced by pandas.
        if rename_ is not None and MoveDataFrame.has_columns(rename_):
            rename_ = PandasMoveDataFrame(data=rename_)
        return rename_
[docs] def len(self) -> int:
"""
Returns the length/row numbers in trajectory data.
Returns
-------
int
Represents the trajectory data length.
"""
return self.shape[0]
def __getitem__(self, key):
"""Retrieves and item from this object."""
item = super().__getitem__(key)
if (
isinstance(item, DataFrame)
and MoveDataFrame.has_columns(item)
):
return PandasMoveDataFrame(item)
return item
[docs] def head(self, n: int = 5) -> 'PandasMoveDataFrame':
"""
Return the first n rows.
This function returns the first n rows for the object
based on position. It is useful for quickly testing if
your object has the right type of data in it.
Parameters
----------
n : int, optional
Number of rows to select, by default 5
Returns
-------
PandasMoveDataFrame
The first n rows of the caller object.
References
----------
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html
"""
head_ = super().head(n=n)
return PandasMoveDataFrame(data=head_)
[docs] def tail(self, n: int = 5) -> 'PandasMoveDataFrame':
"""
Return the last n rows.
This function returns the last n rows for the object
based on position. It is useful for quickly testing if
your object has the right type of data in it.
Parameters
----------
n : int, optional
Number of rows to select, by default 5
Returns
-------
PandasMoveDataFrame
The last n rows of the caller object.
References
----------
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tail.html
"""
tail_ = super().tail(n=n)
return PandasMoveDataFrame(data=tail_)
[docs] def get_users_number(self) -> int:
"""
Check and return number of users in trajectory data.
Returns
-------
int
Represents the number of users in trajectory data.
"""
operation = begin_operation('get_users_numbers')
if UID in self:
number_ = self[UID].nunique()
else:
number_ = 1
self.last_operation = end_operation(operation)
return number_
[docs] def to_grid(
self,
cell_size: float,
meters_by_degree: float | None = None
) -> Grid:
"""
Converts trajectory data to grid format.
Parameters
----------
cell_size : float
Represents grid cell size.
meters_by_degree : float, optional
Represents the corresponding meters of lat by degree,
by default lat_meters(-3.71839)
Returns
-------
Grid
Represents the trajectory in grid format
"""
operation = begin_operation('to_grid')
if meters_by_degree is None:
meters_by_degree = lat_meters(-3.71839)
grid_ = Grid(
data=self, cell_size=cell_size, meters_by_degree=meters_by_degree
)
self.last_operation = end_operation(operation)
return grid_
[docs] def to_data_frame(self) -> DataFrame:
"""
Converts trajectory data to DataFrame format.
Returns
-------
DataFrame
Represents the trajectory in DataFrame format.
"""
return DataFrame(self)
[docs] def to_dicrete_move_df(
self, local_label: str = LOCAL_LABEL
) -> 'PandasMoveDataFrame':
"""
Generate a discrete dataframe move.
Parameters
----------
local_label : str, optional
Represents the column name of feature local label, default LOCAL_LABEL
Returns
-------
PandasDiscreteMoveDataFrame
Represents an PandasMoveDataFrame discretized.
"""
operation = begin_operation('to_discrete_move_df')
if local_label not in self:
raise ValueError(
f'columns {local_label} not in df'
)
self.last_operation = end_operation(operation)
from pymove.core.pandas_discrete import PandasDiscreteMoveDataFrame
return PandasDiscreteMoveDataFrame(
self, LATITUDE, LONGITUDE, DATETIME, TRAJ_ID, local_label
)
[docs] def copy(self, deep: bool = True) -> 'PandasMoveDataFrame':
"""
Make a copy of this object’s indices and data.
When deep=True (default), a new object will be created with a copy
of the calling object data and indices. Modifications to the
data or indices of the copy will not be reflected in the original
object (see notes below).
When deep=False, a new object will be created without copying the calling
object data or index (only references to the data and index are copied).
Any changes to the data of the original will be reflected in the
shallow copy (and vice versa).
Parameters
----------
deep : bool, optional
Make a deep copy, including a copy of the data and the indices.
With deep=False neither the indices nor the data are copied, by default True
Returns
-------
PandasMoveDataFrame
Object type matches caller.
Notes
-----
When deep=True, data is copied but actual Python objects will not be
copied recursively, only the reference to the object.
This is in contrast to copy.deepcopy in the Standard Library, which
recursively copies object data (see examples below).
While Index objects are copied when deep=True, the underlying
numpy array is not copied for performance reasons. Since Index is
immutable, the underlying data can be safely shared and a
copy is not needed.
"""
copy_ = super().copy(deep=deep)
return PandasMoveDataFrame(data=copy_)
[docs] def generate_tid_based_on_id_datetime(
self,
str_format: str = '%Y%m%d%H',
sort: bool = True,
inplace: bool = True
) -> 'PandasMoveDataFrame' | None:
"""
Create or update trajectory id based on id and datetime.
Parameters
----------
str_format : str, optional
Format to consider the datetime, by default '%Y%m%d%H'
sort : bool, optional
Wether to sort the dataframe, by default True
inplace : bool, optional
Represents whether the operation will be performed on
the data provided or in a copy, by default True
Returns
-------
PandasMoveDataFrame
Object with new features or None
"""
operation = begin_operation('generate_tid_based_on_id_datetime')
if not inplace:
data = self.copy()
else:
data = self
logger.debug('\nCreating or updating tid feature...\n')
if sort is True:
logger.debug(
'...Sorting by %s and %s to increase performance\n'
% (TRAJ_ID, DATETIME)
)
data.sort_values([TRAJ_ID, DATETIME], inplace=True)
data[TID] = data[TRAJ_ID].astype(str) + data[
DATETIME
].dt.strftime(str_format)
logger.debug('\n...tid feature was created...\n')
data.last_operation = end_operation(operation)
if not inplace:
return data
[docs] def generate_date_features(
self, inplace: bool = True
) -> 'PandasMoveDataFrame' | None:
"""
Create or update date feature based on datetime.
Parameters
----------
inplace : bool, optional
Represents whether the operation will be performed
on the data provided or in a copy, by default True
Returns
-------
PandasMoveDataFrame
Object with new features or None
"""
operation = begin_operation('generate_date_features')
if not inplace:
data = self.copy()
else:
data = self
logger.debug('Creating date features...')
if DATETIME in self:
data[DATE] = data[DATETIME].dt.date
logger.debug('..Date features was created...\n')
data.last_operation = end_operation(operation)
if not inplace:
return data
[docs] def generate_hour_features(
self, inplace: bool = True
) -> 'PandasMoveDataFrame' | None:
"""
Create or update hour features based on datetime.
Parameters
----------
inplace : bool, optional
Represents whether the operation will be performed
on the data provided or in a copy, by default True
Returns
-------
PandasMoveDataFrame
Object with new features or None
"""
operation = begin_operation('generate_hour_features')
if not inplace:
data = self.copy()
else:
data = self
logger.debug('\nCreating or updating a feature for hour...\n')
if DATETIME in self:
data[HOUR] = data[DATETIME].dt.hour
logger.debug('...Hour feature was created...\n')
data.last_operation = end_operation(operation)
if not inplace:
return data
[docs] def generate_day_of_the_week_features(
self, inplace: bool = True
) -> 'PandasMoveDataFrame' | None:
"""
Create or update day of the week features based on datetime.
Parameters
----------
inplace : bool, optional
Represents whether the operation will be performed
on the data provided or in a copy, by default True
Returns
-------
PandasMoveDataFrame
Object with new features or None
"""
operation = begin_operation('generate_day_of_the_week_features')
if not inplace:
data = self.copy()
else:
data = self
logger.debug('\nCreating or updating day of the week feature...\n')
data[DAY] = data[DATETIME].dt.day_name()
logger.debug('...the day of the week feature was created...\n')
data.last_operation = end_operation(operation)
if not inplace:
return data
[docs] def generate_weekend_features(
self,
create_day_of_week: bool = False,
inplace: bool = True
) -> 'PandasMoveDataFrame' | None:
"""
Adds information to rows determining if it is a weekend day.
Create or update the feature weekend to the dataframe,
if this resource indicates that the given day is the
weekend, otherwise, it is a day of the week.
Parameters
----------
create_day_of_week : bool, optional
Indicates if the column day should be keeped in the dataframe.
If set to False the column will be dropped, by default False
inplace : bool, optional
Represents whether the operation will be performed
on the data provided or in a copy, by default True
Returns
-------
PandasMoveDataFrame
Object with new features or None
"""
operation = begin_operation('generate_weekend_features')
if not inplace:
data = self.copy()
else:
data = self
data.generate_day_of_the_week_features(inplace=True)
logger.debug('Creating or updating a feature for weekend\n')
if DAY in data:
fds = (data[DAY] == WEEK_DAYS[5]) | (data[DAY] == WEEK_DAYS[6])
index_fds = data[fds].index
data[WEEK_END] = 0
data.at[index_fds, WEEK_END] = 1
logger.debug('...Weekend was set as 1 or 0...\n')
if not create_day_of_week:
logger.debug('...dropping colum day\n')
del data[DAY]
data.last_operation = end_operation(operation)
if not inplace:
return data
[docs] def generate_time_of_day_features(
self, inplace: bool = True
) -> 'PandasMoveDataFrame' | None:
"""
Create or update time of day features based on datetime.
Parameters
----------
inplace : bool, optional
Represents whether the operation will be performed
on the data provided or in a copy, by default True
Returns
-------
PandasMoveDataFrame
Object with new features or None
Early morning from 0H to 6H
Morning from 6H to 12H
Afternoon from 12H to 18H
Evening from 18H to 24H
Examples
--------
- datetime1 = 2019-04-28 02:00:56 -> period = Early Morning
- datetime2 = 2019-04-28 08:00:56 -> period = Morning
- datetime3 = 2019-04-28 14:00:56 -> period = Afternoon
- datetime4 = 2019-04-28 20:00:56 -> period = Evening
"""
operation = begin_operation('generate_time_of_day_features')
if not inplace:
data = self.copy()
else:
data = self
periods = [
'\n' 'Creating or updating period feature',
'...Early morning from 0H to 6H',
'...Morning from 6H to 12H',
'...Afternoon from 12H to 18H',
'...Evening from 18H to 24H' '\n',
]
logger.debug('\n'.join(periods))
hours = data[DATETIME].dt.hour
conditions = [
(hours >= 0) & (hours < 6),
(hours >= 6) & (hours < 12),
(hours >= 12) & (hours < 18),
(hours >= 18) & (hours < 24),
]
data[PERIOD] = np.select(conditions, DAY_PERIODS, 'undefined')
logger.debug('...the period of day feature was created')
data.last_operation = end_operation(operation)
if not inplace:
return data
@staticmethod
def _prepare_generate_data(
data_: DataFrame, sort: bool, label_id: str
) -> tuple[Any, int, None]:
"""
Processes the data and create variables for generate methods.
Parameters
----------
data_ : DataFrame
Dataframe to be processed.
sort : bool
Whether to sort the data.
label_id : str
Name of the label feature.
Returns
-------
Tuple[List, int, int, int]
data_ unique ids.
size of id.
starting index
"""
if sort is True:
logger.debug(
'...Sorting by %s and %s to increase performance\n'
% (label_id, DATETIME)
)
data_.sort_values([label_id, DATETIME])
if data_.index.name is None:
logger.debug(
'...Set %s as index to a higher performance\n'
% label_id
)
data_.set_index(label_id, inplace=True)
ids = data_.index.unique()
size_id = 0
idx = None
return ids, size_id, idx
    def generate_dist_time_speed_features(
        self,
        label_id: str = TRAJ_ID,
        label_dtype: Callable = np.float64,
        sort: bool = True,
        inplace: bool = True
    ) -> 'PandasMoveDataFrame' | None:
        """
        Adds distance, time and speed information to the dataframe.

        Firstly, create the three distance to an GPS point P (lat, lon). After,
        create two time features to point P: time to previous and time to next.
        Lastly, create two features to speed using time and distance features.

        Parameters
        ----------
        label_id : str, optional
            Represents name of column of trajectories id, by default TRAJ_ID
        label_dtype : callable, optional
            Represents column id type, by default np.float64
        sort : bool, optional
            If sort == True the dataframe will be sorted, by True
        inplace : bool, optional
            Represents whether the operation will be performed on
            the data provided or in a copy, by default True

        Returns
        -------
        PandasMoveDataFrame
            Object with new features or None

        Examples
        --------
        - dist_to_prev = 248.33 meters, dist_to_prev 536.57 meters
        - time_to_prev = 60 seconds, time_prev = 60.0 seconds
        - speed_to_prev = 4.13 m/srs, speed_prev = 8.94 m/srs.

        """
        operation = begin_operation('generate_dist_time_speed_features')
        if not inplace:
            data = self.copy()
        else:
            data = self
        # Sets label_id as the index, so `data.at[idx, col]` below addresses
        # all rows of one trajectory at once.
        ids, size_id, idx = self._prepare_generate_data(
            data, sort, label_id
        )
        message = '\nCreating or updating distance, time and speed features'
        message += ' in meters by seconds\n'
        logger.debug(
            message
        )
        # create new feature to distance
        data[DIST_TO_PREV] = label_dtype(-1.0)
        # create new feature to time
        data[TIME_TO_PREV] = label_dtype(-1.0)
        # create new feature to speed
        data[SPEED_TO_PREV] = label_dtype(-1.0)
        for idx in progress_bar(
            ids, desc='Generating distance, time and speed features'
        ):
            curr_lat = data.at[idx, LATITUDE]
            curr_lon = data.at[idx, LONGITUDE]
            size_id = curr_lat.size
            if size_id <= 1:
                # Single-point trajectory: no previous point to compare with.
                data.at[idx, DIST_TO_PREV] = np.nan
                data.at[idx, TIME_TO_PREV] = np.nan
                data.at[idx, SPEED_TO_PREV] = np.nan
            else:
                prev_lat = shift(curr_lat, 1)
                prev_lon = shift(curr_lon, 1)
                # compute distance from previous to current point
                data.at[idx, DIST_TO_PREV] = haversine(
                    prev_lat, prev_lon, curr_lat, curr_lon
                )
                # Datetime values cast to numeric are nanoseconds; the
                # 10**-9 factor converts the deltas to seconds.
                time_ = data.at[idx, DATETIME].values.astype(label_dtype)
                time_prev = (time_ - shift(time_, 1)) * (10 ** -9)
                data.at[idx, TIME_TO_PREV] = time_prev
                # set speed features
                data.at[idx, SPEED_TO_PREV] = (
                    data.at[idx, DIST_TO_PREV] / time_prev
                )  # unit: m/srs
        # Restore the original flat index after the label-indexed pass.
        data.reset_index(inplace=True)
        data.last_operation = end_operation(operation)
        if not inplace:
            return data
    def generate_dist_features(
        self,
        label_id: str = TRAJ_ID,
        label_dtype: Callable = np.float64,
        sort: bool = True,
        inplace: bool = True
    ) -> 'PandasMoveDataFrame' | None:
        """
        Create the three distance in meters to an GPS point P.

        Parameters
        ----------
        label_id : str, optional
            Represents name of column of trajectories id, by default TRAJ_ID
        label_dtype : callable, optional
            Represents column id type, by default np.float64
        sort : bool, optional
            If sort == True the dataframe will be sorted, by True
        inplace : bool, optional
            Represents whether the operation will be performed on
            the data provided or in a copy, by default True

        Returns
        -------
        PandasMoveDataFrame
            Object with new features or None

        Examples
        --------
        - P to P.next = 2 meters
        - P to P.previous = 1 meter
        - P.previous to P.next = 1 meters

        """
        operation = begin_operation('generate_dist_features')
        if not inplace:
            data = self.copy()
        else:
            data = self
        # Sets label_id as the index, so `data.at[idx, col]` below addresses
        # all rows of one trajectory at once.
        ids, size_id, idx = self._prepare_generate_data(
            data, sort, label_id
        )
        logger.debug('\nCreating or updating distance features in meters...\n')
        # create ou update columns
        data[DIST_TO_PREV] = label_dtype(-1.0)
        data[DIST_TO_NEXT] = label_dtype(-1.0)
        data[DIST_PREV_TO_NEXT] = label_dtype(-1.0)
        for idx in progress_bar(ids, desc='Generating distance features'):
            curr_lat = data.at[idx, LATITUDE]
            curr_lon = data.at[idx, LONGITUDE]
            size_id = curr_lat.size
            if size_id <= 1:
                # Single-point trajectory: no neighbors to measure against.
                data.at[idx, DIST_TO_PREV] = np.nan
            else:
                prev_lat = shift(curr_lat, 1)
                prev_lon = shift(curr_lon, 1)
                # compute distance from previous to current point
                data.at[idx, DIST_TO_PREV] = haversine(
                    prev_lat, prev_lon, curr_lat, curr_lon
                )
                next_lat = shift(curr_lat, -1)
                next_lon = shift(curr_lon, -1)
                # compute distance to next point
                data.at[idx, DIST_TO_NEXT] = haversine(
                    curr_lat, curr_lon, next_lat, next_lon
                )
                # using pandas shift in a large dataset: 7min 21s
                # using numpy shift above: 33.6 srs
                # use distance from previous to next
                data.at[idx, DIST_PREV_TO_NEXT] = haversine(
                    prev_lat, prev_lon, next_lat, next_lon
                )
        # Restore the original flat index after the label-indexed pass.
        data.reset_index(inplace=True)
        data.last_operation = end_operation(operation)
        if not inplace:
            return data
    def generate_time_features(
        self,
        label_id: str = TRAJ_ID,
        label_dtype: Callable = np.float64,
        sort: bool = True,
        inplace: bool = True
    ) -> 'PandasMoveDataFrame' | None:
        """
        Create the three time in seconds to an GPS point P.

        Parameters
        ----------
        label_id : str, optional
            Represents name of column of trajectories id, by default TRAJ_ID
        label_dtype : callable, optional
            Represents column id type, by default np.float64
        sort : bool, optional
            If sort == True the dataframe will be sorted, by True
        inplace : bool, optional
            Represents whether the operation will be performed on
            the data provided or in a copy, by default True

        Returns
        -------
        PandasMoveDataFrame
            Object with new features or None

        Examples
        --------
        - P to P.next = 5 seconds
        - P to P.previous = 15 seconds
        - P.previous to P.next = 20 seconds

        """
        operation = begin_operation('generate_time_features')
        if not inplace:
            data = self.copy()
        else:
            data = self
        # Sets label_id as the index, so `data.at[idx, col]` below addresses
        # all rows of one trajectory at once.
        ids, size_id, idx = self._prepare_generate_data(
            data, sort, label_id
        )
        logger.debug(
            '\nCreating or updating time features seconds\n'
        )
        # create new feature to time
        data[TIME_TO_PREV] = label_dtype(-1.0)
        data[TIME_TO_NEXT] = label_dtype(-1.0)
        data[TIME_PREV_TO_NEXT] = label_dtype(-1.0)
        for idx in progress_bar(
            ids, desc='Generating time features'
        ):
            # Datetime values cast to numeric are nanoseconds; the 10**-9
            # factor below converts every delta to seconds.
            curr_time = data.at[idx, DATETIME].values.astype(label_dtype)
            size_id = curr_time.size
            if size_id <= 1:
                # Single-point trajectory: no neighbors to measure against.
                data.at[idx, TIME_TO_PREV] = np.nan
            else:
                prev_time = shift(curr_time, 1)
                time_prev = (curr_time - prev_time) * (10 ** -9)
                data.at[idx, TIME_TO_PREV] = time_prev
                next_time = shift(curr_time, -1)
                # NOTE: `time_prev` is reused here for the to-next delta.
                time_prev = (next_time - curr_time) * (10 ** -9)
                data.at[idx, TIME_TO_NEXT] = time_prev
                time_prev_to_next = (next_time - prev_time) * (10 ** -9)
                data.at[idx, TIME_PREV_TO_NEXT] = time_prev_to_next
        # Restore the original flat index after the label-indexed pass.
        data.reset_index(inplace=True)
        data.last_operation = end_operation(operation)
        if not inplace:
            return data
[docs] def generate_speed_features(
self,
label_id: str = TRAJ_ID,
label_dtype: Callable = np.float64,
sort: bool = True,
inplace: bool = True
) -> 'PandasMoveDataFrame' | None:
"""
Create the three speed in meter by seconds to an GPS point P.
Parameters
----------
label_id : str, optional
Represents name of column of trajectories id, by default TRAJ_ID
label_dtype : callable, optional
Represents column id type, by default np.float64
sort : bool, optional
If sort == True the dataframe will be sorted, by True
inplace : bool, optional
Represents whether the operation will be performed on
the data provided or in a copy, by default True
Returns
-------
PandasMoveDataFrame
Object with new features or None
Raises
------
ValueError
If feature generation fails
Examples
--------
- P to P.next = 1 meter/seconds
- P to P.previous = 3 meter/seconds
- P.previous to P.next = 2 meter/seconds
"""
operation = begin_operation('generate_speed_features')
if not inplace:
data = self.copy()
else:
data = self
logger.debug(
'\nCreating or updating speed features meters by seconds\n'
)
dists = data.generate_dist_features(
label_id, label_dtype, sort, inplace=False
)
times = data.generate_time_features(
label_id, label_dtype, sort, inplace=False
)
if dists is None or times is None:
raise ValueError('Geretating dist or time feature failed')
data[SPEED_TO_PREV] = dists[DIST_TO_PREV] / times[TIME_TO_PREV]
data[SPEED_TO_NEXT] = dists[DIST_TO_NEXT] / times[TIME_TO_NEXT]
d_prev_next = dists[DIST_TO_PREV] + dists[DIST_TO_NEXT]
data[SPEED_PREV_TO_NEXT] = d_prev_next / times[TIME_PREV_TO_NEXT]
self._prepare_generate_data(
data, sort, label_id
)
data.reset_index(inplace=True)
data.last_operation = end_operation(operation)
if not inplace:
return data
[docs] def generate_move_and_stop_by_radius(
self,
radius: float = 0,
target_label: str = DIST_TO_PREV,
inplace: bool = True
):
"""
Create or update column with move and stop points by radius.
Parameters
----------
radius : float, optional
Represents radius, by default 0
target_label : str, optional
Represents column to compute, by default DIST_TO_PREV
inplace : bool, optional
Represents whether the operation will be performed on
the data provided or in a copy, by default True
Returns
-------
PandasMoveDataFrame
Object with new features or None
"""
operation = begin_operation('generate_move_and_stop_by_radius')
if not inplace:
data = self.copy()
else:
data = self
data.generate_dist_features(inplace=True)
logger.debug('\nCreating or updating features MOVE and STOPS...\n')
conditions = (
(data[target_label] > radius),
(data[target_label] <= radius),
)
choices = [MOVE, STOP]
data[SITUATION] = np.select(conditions, choices, np.nan)
logger.debug(
'\n....There are %s stops to this parameters\n'
% (data[data[SITUATION] == STOP].shape[0])
)
data.last_operation = end_operation(operation)
if not inplace:
return data
[docs] def time_interval(self) -> Timedelta:
"""
Get time difference between max and min datetime in trajectory data.
Returns
-------
Timedelta
Represents the time difference.
"""
operation = begin_operation('time_interval')
time_diff = self[DATETIME].max() - self[DATETIME].min()
self.last_operation = end_operation(operation)
return time_diff
[docs] def get_bbox(self) -> tuple[float, float, float, float]:
"""
Returns the bounding box of the dataframe.
A bounding box (usually shortened to bbox) is an area defined by two
longitudes and two latitudes, where:
- Latitude is a decimal number between -90.0 and 90.0.
- Longitude is a decimal number between -180.0 and 180.0.
They usually follow the standard format of:
- bbox = left, bottom, right, top
- bbox = min Longitude , min Latitude , max Longitude , max Latitude
Returns
-------
Tuple[float, float, float, float]:
Represents a bound box, that is a tuple of 4 values with
the min and max limits of latitude e longitude.
lat_min, lon_min, lat_max, lon_max
Examples
--------
(22.147577, 113.54884299999999, 41.132062, 121.156224)
"""
operation = begin_operation('get_bbox')
bbox_ = (
self[LATITUDE].min(),
self[LONGITUDE].min(),
self[LATITUDE].max(),
self[LONGITUDE].max(),
)
self.last_operation = end_operation(operation)
return bbox_
[docs] def show_trajectories_info(self):
"""
Show dataset information from dataframe.
Displays the number of rows, datetime interval, and bounding box.
Examples
--------
====================== INFORMATION ABOUT DATASET ======================
Number of Points: 217654
Number of IDs objects: 2
Start Date:2008-10-23 05:53:05 End Date:2009-03-19 05:46:37
Bounding Box:(22.147577, 113.54884299999999, 41.132062, 121.156224)
=======================================================================
"""
operation = begin_operation('show_trajectories_info')
message = ('=' * 22) + ' INFORMATION ABOUT DATASET ' + ('=' * 22)
print(
'\n%s\n' % message
)
print('Number of Points: %s\n' % self.shape[0])
if TRAJ_ID in self:
print(
'Number of IDs objects: %s\n'
% self[TRAJ_ID].nunique()
)
if TID in self:
print(
'Number of TIDs trajectory: %s\n'
% self[TID].nunique()
)
if DATETIME in self:
dt_max = self[DATETIME].max()
dt_min = self[DATETIME].min()
print(
'Start Date:%s End Date:%s\n'
% (dt_min, dt_max)
)
if LATITUDE and LONGITUDE in self:
print(
f'Bounding Box:{self.get_bbox()}\n'
) # bbox return = Lat_min , Long_min, Lat_max, Long_max
if TIME_TO_PREV in self:
t_max = round(self[TIME_TO_PREV].max(), 3)
t_min = round(self[TIME_TO_PREV].min(), 3)
print(
'Gap time MAX:%s Gap time MIN:%s\n'
% (t_max, t_min)
)
if SPEED_TO_PREV in self:
s_max = round(self[SPEED_TO_PREV].max(), 3)
s_min = round(self[SPEED_TO_PREV].min(), 3)
print(
'Speed MAX:%s Speed MIN:%s\n'
% (s_max, s_min)
)
if DIST_TO_PREV in self:
d_max = round(self[DIST_TO_PREV].max(), 3)
d_min = round(self[DIST_TO_PREV].min(), 3)
print(
'Distance MAX:%s Distance MIN:%s\n'
% (d_max, d_min)
)
print(
'\n%s\n' % ('=' * len(message))
)
self.last_operation = end_operation(operation)
[docs] def astype(
self,
dtype: Callable | dict,
copy: bool = True,
errors: str = 'raise'
) -> DataFrame:
"""
Cast a pandas object to a specified dtype.
Parameters
----------
dtype: callable, dict
Use a numpy.dtype or Python type to cast entire pandas object
to the same type. Alternatively, use {col: dtype, …},
where col is a column label and dtype is a numpy.dtype
or Python type to cast one or more of the DataFrame
columns to column-specific types.
copy: bool, optional
Return a copy when copy=True (be very careful setting
copy=False as changes to values then
may propagate to other pandas objects), by default True
errors: str, optional
Control raising of exceptions on invalid data for provided dtype,
by default 'raise
- raise : allow exceptions to be raised
- ignore : suppress exceptions. On error return original object
Returns
-------
DataFrame
Casted object to specified type.
References
----------
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html
Raises
------
AttributeError
If trying to change required types inplace
"""
if not copy and isinstance(dtype, str):
raise AttributeError(
'Could not change lat, lon, and datetime type.'
)
elif not copy and isinstance(dtype, dict):
keys = set(list(dtype.keys()))
columns = {LATITUDE, LONGITUDE, DATETIME}
if keys & columns:
raise AttributeError(
'Could not change lat, lon, and datetime type.'
)
return super().astype(dtype=dtype, copy=copy, errors=errors)
    def sort_values(
        self,
        by: str | list[str],
        axis: int = 0,
        ascending: bool = True,
        inplace: bool = False,
        kind: str = 'quicksort',
        na_position: str = 'last',
    ) -> 'PandasMoveDataFrame' | None:
        """
        Sorts the values of the _data, along an axis.

        Parameters
        ----------
        by: str, list
            Name or list of names to sort the _data by
        axis: int, optional
            if set to 0 or 'index', will count for each column.
            if set to 1 or 'columns', will count for each row
            by default 0
        ascending: bool, optional
            Sort ascending vs. descending. Specify list for
            multiple sort orders.
            If this is a list of bool, must match the length, by default True
        inplace: bool, optional
            if set to true the original dataframe will be altered,
            the duplicates will be dropped in place,
            otherwise the operation will be made in a copy,
            that will be returned, by default False
        kind: str, optional
            Choice of sorting algorithm, 'quicksort', 'mergesort', 'heapsort'
            For DataFrames, this option is only applied when sorting
            on a single column or label, by default 'quicksort'
        na_position: str, optional
            'first', 'last', by default 'last'
            If 'first' puts NaNs at the beginning;
            If last puts NaNs at the end.

        Returns
        -------
        PandasMoveDataFrame
            The sorted dataframe or None

        References
        ----------
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html

        """
        # Always delegate with inplace=False, then adopt the result when
        # the caller asked for in-place behavior.
        _sort_values = super().sort_values(
            by=by, axis=axis, ascending=ascending,
            inplace=False, kind=kind, na_position=na_position
        )
        if inplace:
            # Swap in the sorted block manager and clear cached column
            # accessors (pandas internals); in-place ops return None.
            self._mgr = _sort_values._mgr
            self._item_cache = dict()
            return None
        return PandasMoveDataFrame(data=_sort_values)
def reset_index(
    self,
    level: int | str | tuple | list | None = None,
    drop: bool = False,
    inplace: bool = False,
    col_level: int | str = 0,
    col_fill: str = ''
) -> 'PandasMoveDataFrame' | None:
    """
    Resets the DataFrame's index, and use the default one.

    One or more levels can be removed, if the DataFrame has a MultiIndex.

    Parameters
    ----------
    level: int or str or tuple or list, optional
        Only the levels specified will be removed from the index
        If set to None, all levels are removed, by default None
    drop: bool, optional
        Do not try to insert index into dataframe columns
        This resets the index to the default integer index, by default False
    inplace: bool, optional
        Modify the DataFrame in place (do not create a new object), by default False
    col_level: int or str, optional
        If the columns have multiple levels, determines which level
        the labels are inserted into, by default 0
    col_fill: str, optional
        If the columns have multiple levels, determines how the other levels are named
        If None then the index name is repeated, by default ''

    Returns
    -------
    PandasMoveDataFrame
        The dataframe with the index reset or None

    References
    ----------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html
    """
    # Delegate to pandas on a copy; in-place semantics are emulated by
    # swapping the internal manager so `self` keeps its subclass type.
    _reset_index = super().reset_index(
        level=level, drop=drop, inplace=False, col_level=col_level, col_fill=col_fill
    )
    if inplace:
        self._mgr = _reset_index._mgr
        self._item_cache = dict()
        return None
    return PandasMoveDataFrame(data=_reset_index)
def set_index(
    self,
    keys: str | list[str],
    drop: bool = True,
    append: bool = False,
    inplace: bool = False,
    verify_integrity: bool = False,
) -> 'PandasMoveDataFrame' | DataFrame | None:
    """
    Set the DataFrame index (row labels) using one or more existing columns or arrays.

    Parameters
    ----------
    keys: str, list
        label or array-like or list of labels/arrays
        This parameter can be either a single column key, a single
        array of the same length as the calling DataFrame,
        or a list containing an arbitrary combination of
        column keys and arrays
    drop: bool, optional
        Delete columns to be used as the new index, by default True
    append: bool, optional
        Whether to append columns to existing index, by default False
    inplace: bool, optional
        Modify the DataFrame in place (do not create a new object), by default False
    verify_integrity: bool, optional
        Check the new index for duplicates
        Otherwise defer the check until necessary
        Setting to False will improve the performance of this method,
        by default False

    Returns
    -------
    PandasMoveDataFrame, DataFrame
        Object with a new index or None

    References
    ----------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.set_index.html

    Raises
    ------
    AttributeError
        If trying to change required columns types
    """
    # Moving a required column into the index with drop=True would
    # remove it from the columns, breaking the MoveDataFrame contract.
    if inplace and drop:
        if isinstance(keys, str):
            aux = {keys}
        else:
            aux = set(keys)
        columns = {LATITUDE, LONGITUDE, DATETIME}
        if aux & columns:
            raise AttributeError(
                'Could not change lat, lon, and datetime type.'
            )
    _set_index = super().set_index(
        keys=keys, drop=drop, append=append,
        inplace=False, verify_integrity=verify_integrity
    )
    if inplace:
        self._mgr = _set_index._mgr
        self._item_cache = dict()
        _set_index = None
    # Only keep the subclass type when the required columns survived.
    if _set_index is not None and MoveDataFrame.has_columns(_set_index):
        _set_index = PandasMoveDataFrame(data=_set_index)
    return _set_index
def drop(
    self,
    labels: str | list[str] | None = None,
    axis: int | str = 0,
    index: str | list[str] | None = None,
    columns: str | list[str] | None = None,
    level: int | str | None = None,
    inplace: bool = False,
    errors: str = 'raise',
) -> 'PandasMoveDataFrame' | DataFrame | None:
    """
    Removes rows or columns.

    By specifying label names and corresponding axis,
    or by specifying directly index or column names.
    When using a multiindex, labels on different levels
    can be removed by specifying the level.

    Parameters
    ----------
    labels: str or list, optional
        Index or column labels to drop, by default None
    axis: int or str, optional
        Whether to drop labels from the index (0 or 'index')
        or columns (1 or 'columns'), by default 0
    index: str or list, optional
        Alternative to specifying axis
        (labels, axis=0 is equivalent to index=labels), by default None
    columns: str or list, optional
        Alternative to specifying axis
        (labels, axis=1 is equivalent to columns=labels), by default None
    level: str or int, optional
        For MultiIndex, level from which the labels will be removed, by default None
    inplace: bool, optional
        If True, do operation inplace and return None
        Otherwise, make a copy, do operations and return, by default False
    errors: str, optional
        'ignore', 'raise', by default 'raise'
        If 'ignore', suppress error and only existing labels are dropped.

    Returns
    -------
    PandasMoveDataFrame, DataFrame
        Object without the removed index or column labels or None

    Raises
    ------
    AttributeError
        If trying to drop a required column inplace
    KeyError
        If any of the labels is not found in the selected axis.

    References
    ----------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
    """
    if inplace:
        # Collect the labels targeted at columns to guard the required ones.
        _labels1 = set()
        _labels2 = set()
        if labels is not None:
            if isinstance(labels, str):
                _labels1 = {labels}
            else:
                _labels1 = set(labels)
        elif columns is not None:
            if isinstance(columns, str):
                _labels2 = {columns}
            else:
                _labels2 = set(columns)
        _columns = {LATITUDE, LONGITUDE, DATETIME}
        if (
            (axis == 1 or axis == 'columns' or columns)
            and (_labels1.union(_labels2) & _columns)
        ):
            raise AttributeError(
                'Could not drop columns lat, lon, and datetime.'
            )
    _drop = super().drop(
        labels=labels, axis=axis, index=index, columns=columns,
        level=level, inplace=False, errors=errors
    )
    if inplace:
        self._mgr = _drop._mgr
        self._item_cache = dict()
        _drop = None
    # Downgrade to a plain DataFrame when required columns were dropped.
    if _drop is not None and MoveDataFrame.has_columns(_drop):
        _drop = PandasMoveDataFrame(data=_drop)
    return _drop
def drop_duplicates(
    self,
    subset: int | str | None = None,
    keep: str | bool = 'first',
    inplace: bool = False
) -> 'PandasMoveDataFrame' | None:
    """
    Uses the pandas's function drop_duplicates, to remove duplicated rows from data.

    Parameters
    ----------
    subset: int or str, optional
        Only consider certain columns for identifying duplicates,
        by default use all of the columns, by default None
    keep: str, optional
        - first : Drop duplicates except for the first occurrence.
        - last : Drop duplicates except for the last occurrence.
        - False : Drop all duplicates.
        by default 'first'
    inplace: bool, optional
        Whether to drop duplicates in place or to return a copy, by default False

    Returns
    -------
    PandasMoveDataFrame
        Object with duplicated rows removed or None

    References
    ----------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html
    """
    _drop_duplicates = super().drop_duplicates(
        subset=subset, keep=keep, inplace=False
    )
    if inplace:
        self._mgr = _drop_duplicates._mgr
        self._item_cache = dict()
        return None
    return PandasMoveDataFrame(data=_drop_duplicates)
def shift(
    self,
    periods: int = 1,
    freq: DateOffset | Timedelta | str | None = None,
    axis: int | str = 0,
    fill_value: Any | None = None
) -> 'PandasMoveDataFrame':
    """
    Shift index by desired number of periods with an optional time freq.

    Parameters
    ----------
    periods: int, optional, default 1
        Number of periods to shift, positive or negative.
    freq: DateOffset or Timedelta or str, optional, default None
        Offset to use from the series module or time rule (e.g. 'EOM').
        When freq is given, the index values are shifted but the data
        is not realigned, extending the index while preserving the
        original data; this requires a date or datetime index
        (otherwise a NotImplementedError is raised).
        When freq is not passed, the index is shifted without
        realigning the data.
    axis: 0 or 'index', 1 or 'columns', None, optional, default 0
        Shift direction.
    fill_value: object, optional, default None
        Scalar value used for newly introduced missing values.
        The default depends on the dtype: np.nan for numeric data,
        NaT for datetime/timedelta/period data, and
        self.dtype.na_value for extension dtypes.

    Returns
    -------
    PandasMoveDataFrame
        A copy of the original object, shifted.

    References
    ----------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.shift.html
    """
    shifted = super().shift(
        periods=periods,
        freq=freq,
        axis=axis,
        fill_value=fill_value,
    )
    return PandasMoveDataFrame(data=shifted)
def fillna(
    self,
    value: Any | None = None,
    method: str | None = None,
    axis: int | str | None = None,
    inplace: bool = False,
    limit: int | None = None,
    downcast: dict | None = None,
):
    """
    Fill NA/NaN values using the specified method.

    Parameters
    ----------
    value : scalar, dict, Series, or DataFrame
        Value to use to fill holes (e.g. 0), alternately a
        dict/Series/DataFrame of values specifying which value to use
        for each index (for a Series) or column (for a DataFrame).
        Values not in the dict/Series/DataFrame will not be filled.
        This value cannot be a list.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series:
        pad / ffill propagate the last valid observation forward,
        backfill / bfill use the next valid observation to fill the gap.
    axis : {0 or 'index', 1 or 'columns'}
        Axis along which to fill missing values.
    inplace : bool, default False
        If True, fill in-place. Note: this will modify any
        other views on this object (e.g., a no-copy slice for a
        column in a DataFrame).
    limit : int, default None
        If method is specified, the maximum number of consecutive
        NaN values to forward/backward fill; larger gaps are only
        partially filled. If method is not specified, the maximum
        number of entries along the entire axis where NaNs will be
        filled. Must be greater than 0 if not None.
    downcast : dict, default is None
        A dict of item->dtype of what to downcast if possible,
        or the str 'infer' which will try to downcast to an
        appropriate equal type (e.g. float64 to int64 if possible).

    Returns
    -------
    PandasMoveDataFrame
        Object with missing values filled or None

    References
    ----------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html
    """
    # Fill on a copy; in-place behavior is emulated by swapping the
    # internal manager so `self` keeps its subclass type.
    filled = super().fillna(
        value=value,
        method=method,
        axis=axis,
        inplace=False,
        limit=limit,
        downcast=downcast,
    )
    if not inplace:
        return PandasMoveDataFrame(data=filled)
    self._mgr = filled._mgr
    self._item_cache = {}
def dropna(
    self,
    axis: int | str = 0,
    how: str = 'any',
    thresh: float | None = None,
    subset: list | None = None,
    inplace: bool = False
):
    """
    Removes missing data.

    Parameters
    ----------
    axis: 0 or 'index', 1 or 'columns', None, optional
        Determine if rows or columns are removed, by default 0
        - 0, or 'index' : Drop rows which contain missing values.
        - 1, or 'columns' : Drop columns which contain missing value.
    how: str, optional
        Determine if row or column is removed from DataFrame,
        by default 'any'
        - 'any' : If any NA values are present, drop that row or column.
        - 'all' : If all values are NA, drop that row or column.
    thresh: float, optional
        Require that many non-NA values, by default None
    subset: array-like, optional
        Labels along other axis to consider, by default None
        e.g. if you are dropping rows these would be a
        list of columns to include.
    inplace: bool, optional
        If True, do operation inplace and return None, by default False

    Returns
    -------
    PandasMoveDataFrame
        Object with NA entries dropped or None

    References
    ----------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html

    Raises
    ------
    AttributeError
        If trying to drop required columns inplace
    """
    if inplace:
        if axis == 1 or axis == 'columns':
            # Dropping columns in place would remove any required column
            # that contains NaN, breaking the MoveDataFrame contract.
            columns = [LATITUDE, LONGITUDE, DATETIME]
            data = self[columns]
            if data.isnull().values.any():
                raise AttributeError(
                    'Could not drop columns lat, lon, and datetime.'
                )
    _dropna = super().dropna(
        axis=axis, how=how, thresh=thresh, subset=subset, inplace=False)
    if inplace:
        self._mgr = _dropna._mgr
        self._item_cache = dict()
        _dropna = None
    # Downgrade to a plain DataFrame when required columns were dropped.
    if _dropna is not None and MoveDataFrame.has_columns(_dropna):
        _dropna = PandasMoveDataFrame(data=_dropna)
    return _dropna
def sample(
    self,
    n: int | None = None,
    frac: float | None = None,
    replace: bool = False,
    weights: str | list | None = None,
    random_state: int | None = None,
    axis: int | str | None = None
) -> 'PandasMoveDataFrame':
    """
    Return a random sample of items from an axis of object.

    You can use `random_state` for reproducibility.

    Parameters
    ----------
    n : int, optional
        Number of items from axis to return. Cannot be used with
        `frac`, by default None
    frac : float, optional
        Fraction of axis items to return. Cannot be used with `n`,
        by default None
    replace : bool, optional
        Allow or disallow sampling of the same row more than once,
        by default False
    weights : str or ndarray-like, optional
        If None, results in equal probability weighting.
        If passed a Series, will align with target object on index;
        index values in weights not found in the sampled object are
        ignored, and index values in the sampled object not in weights
        are assigned weights of zero.
        If called on a DataFrame, will accept the name of a column
        when axis = 0.
        Unless weights are a Series, weights must be the same length
        as the axis being sampled.
        If weights do not sum to 1, they will be normalized to sum
        to 1. Missing values in the weights column will be treated as
        zero; infinite values are not allowed. By default None
    random_state : int or numpy.random.RandomState, optional
        Seed for the random number generator (if int), or numpy
        RandomState object, by default None
    axis : {0 or 'index', 1 or 'columns', None}, optional
        Axis to sample. Accepts axis number or name. Default is stat
        axis for given data type (0 for Series and DataFrames),
        by default None

    Returns
    -------
    PandasMoveDataFrame
        A new object of same type as caller containing `n` items
        randomly sampled from the caller object.

    See Also
    --------
    numpy.random.choice: Generates a random sample from a given
    1-D numpy array.

    Notes
    -----
    If `frac` > 1, `replacement` should be set to `True`.

    References
    ----------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html
    """
    picked = super().sample(
        n=n,
        frac=frac,
        replace=replace,
        weights=weights,
        random_state=random_state,
        axis=axis,
    )
    return PandasMoveDataFrame(data=picked)
def isin(self, values: list | Series | DataFrame | dict) -> DataFrame:
    """
    Determines whether each element in the DataFrame is contained in values.

    Parameters
    ----------
    values : iterable, Series, DataFrame or dict
        The result will only be true at a location if all the
        labels match.
        If values is a Series, that is the index.
        If values is a dict, the keys must be the column names,
        which must match.
        If values is a DataFrame, then both the index and column
        labels must match.

    Returns
    -------
    DataFrame:
        DataFrame of booleans showing whether each element in the
        DataFrame is contained in values

    References
    ----------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.isin.html
    """
    # Operate on a plain DataFrame view so the boolean result is not
    # forced through the MoveDataFrame column validation.
    as_plain_frame = DataFrame(self)
    return as_plain_frame.isin(values)
def append(
    self,
    other: 'PandasMoveDataFrame' | DataFrame,
    ignore_index: bool = False,
    verify_integrity: bool = False,
    sort: bool = False
) -> 'PandasMoveDataFrame':
    """
    Append rows of other to the end of caller, returning a new object.

    Columns in other that are not in the caller are added as new columns.

    Parameters
    ----------
    other : DataFrame or Series/dict-like object, or list of these
        The data to append.
    ignore_index : bool, optional
        If True, do not use the index labels, by default False
    verify_integrity : bool, optional
        If True, raise ValueError on creating index with duplicates,
        by default False
    sort : bool, optional
        Sort columns if the columns of self and other are not aligned.
        The default sorting is deprecated and will change to
        not-sorting in a future version of pandas, by default False

    Returns
    -------
    PandasMoveDataFrame
        A dataframe containing rows from both the caller and `other`.

    References
    ----------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html
    """
    # Downcast to a plain DataFrame so pandas does not re-run the
    # MoveDataFrame column validation while concatenating.
    if isinstance(other, PandasMoveDataFrame):
        other = DataFrame(other)
    combined = super().append(
        other=other,
        ignore_index=ignore_index,
        verify_integrity=verify_integrity,
        sort=sort,
    )
    return PandasMoveDataFrame(data=combined)
def join(
    self,
    other: 'PandasMoveDataFrame' | DataFrame,
    on: str | list | None = None,
    how: str = 'left',
    lsuffix: str = '',
    rsuffix: str = '',
    sort: bool = False
) -> 'PandasMoveDataFrame':
    """
    Join columns of other, returning a new object.

    Join columns with `other` PandasMoveDataFrame either on index or
    on a key column. Efficiently join multiple DataFrame objects
    by index at once by passing a list.

    Parameters
    ----------
    other : DataFrame, Series, or list of DataFrame
        Index should be similar to one of the columns in this one. If a
        Series is passed, its name attribute must be set, and that will be
        used as the column name in the resulting joined DataFrame.
    on : str or list of str or array-like, optional
        Column or index level name(s) in the caller to join on the index
        in `other`, otherwise joins index-on-index. If multiple
        values given, the `other` DataFrame must have a MultiIndex. Can
        pass an array as the join key if it is not already contained in
        the calling DataFrame. Like an Excel VLOOKUP operation.
    how : {'left', 'right', 'outer', 'inner'}, optional
        How to handle the operation of the two objects, by default 'left'
        * left: use calling frame index (or column if on is specified)
        * right: use `other` index.
        * outer: form union of calling frame index (or column if on is
          specified) with `other` index, and sort it lexicographically.
        * inner: form intersection of calling frame index (or column if
          on is specified) with `other` index, preserving the order
          of the calling one.
    lsuffix : str, optional
        Suffix to use from left frame overlapping columns, by default ''
    rsuffix : str, optional
        Suffix to use from right frame overlapping columns, by default ''
    sort : bool, optional
        Order result DataFrame lexicographically by the join key. If False,
        the order of the join key depends on the join type (how keyword)

    Returns
    -------
    PandasMoveDataFrame
        A dataframe containing columns from both the caller and `other`.

    Notes
    -----
    Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
    passing a list of `DataFrame` objects.

    References
    ----------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.join.html
    """
    # Fix: `other._data` was a leftover from the old wrapper design;
    # on this DataFrame subclass `_data` is the (deprecated) internal
    # block manager, which DataFrame.join cannot consume. Downcast to
    # a plain DataFrame instead, as `append` does.
    if isinstance(other, PandasMoveDataFrame):
        other = DataFrame(other)
    _join = super().join(
        other=other, on=on, how=how,
        lsuffix=lsuffix, rsuffix=rsuffix, sort=sort)
    return PandasMoveDataFrame(data=_join)
def merge(
    self,
    right: 'PandasMoveDataFrame' | DataFrame | Series,
    how: str = 'inner',
    on: str | list | None = None,
    left_on: str | list | None = None,
    right_on: str | list | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: tuple[str, str] = ('_x', '_y'),
    copy: bool = True,
    indicator: bool | str = False,
    validate: str | None = None
) -> 'PandasMoveDataFrame':
    """
    Merge DataFrame or named Series objects with a database-style join.

    The join is done on columns or indexes. If joining columns on columns,
    the DataFrame indexes will be ignored. Otherwise if joining indexes
    on indexes or indexes on a column or columns, the index will be passed on.

    Parameters
    ----------
    right: DataFrame or named Series
        Object to merge with.
    how: {'left', 'right', 'outer', 'inner'}, optional
        Type of merge to be performed, by default 'inner'
        left: use only keys from left frame, similar to a SQL left
        outer join; preserve key order.
        right: use only keys from right frame, similar to a SQL right
        outer join; preserve key order.
        outer: use union of keys from both frames, similar to a SQL
        full outer join; sort keys lexicographically.
        inner: use intersection of keys from both frames, similar to
        a SQL inner join; preserve the order of the left keys.
    on: label or list, optional
        Column or index level names to join on. These must be found in both
        DataFrames. If on is None and not merging on indexes then this defaults
        to the intersection of the columns in both DataFrames, by default None
    left_on: str or list or array-like, optional
        Column or index level names to join on in the left DataFrame. Can
        also be an array or list of arrays of the length of the left DataFrame.
        These arrays are treated as if they are columns, by default None
    right_on: str or list or array-like, optional
        Column or index level names to join on in the right DataFrame.
        Can also be an array or list of arrays of the length of the right
        DataFrame. These arrays are treated as if they are columns,
        by default None
    left_index: bool, optional
        Use the index from the left DataFrame as the join key(s), by default False
        If it is a MultiIndex, the number of keys in the other DataFrame
        (either the index or a number of columns) must match the number of levels.
    right_index: bool, optional
        Use the index from the right DataFrame as the join key, by default False
        Same caveats as left_index.
    sort: bool, optional
        Sort the join keys lexicographically in the result DataFrame,
        by default False. If False, the order of the join keys depends
        on the join type (how keyword).
    suffixes: tuple of (str, str), optional
        Suffix to apply to overlapping column names in the left and right side
        respectively. To raise an exception on overlapping columns use
        (False, False), by default ('_x', '_y')
    copy: bool, optional
        If False, avoid copy if possible, by default True
    indicator: bool or str, optional
        If True, adds a column to output DataFrame called '_merge' with
        information on the source of each row. If string, column with
        information on source of each row will be added to output DataFrame,
        and column will be named value of string. Information column is
        Categorical-type and takes on a value of 'left_only' for observations
        whose merge key only appears in 'left' DataFrame, 'right_only' for
        observations whose merge key only appears in 'right' DataFrame,
        and 'both' if the observation's merge key is found in both.
        by default False
    validate: str, optional
        If specified, checks if merge is of specified type, by default None
        'one_to_one' or '1:1': check if merge keys are unique in both
        left and right datasets.
        'one_to_many' or '1:m': check if merge keys are unique in left dataset.
        'many_to_one' or 'm:1': check if merge keys are unique in right dataset.
        'many_to_many' or 'm:m': allowed, but does not result in checks.

    Returns
    -------
    PandasMoveDataFrame
        A DataFrame of the two merged objects.

    References
    ----------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html?highlight=merge#pandas.DataFrame.merge
    """
    # Fix: `right._data` was a leftover from the old wrapper design;
    # on this DataFrame subclass `_data` is the (deprecated) internal
    # block manager, which DataFrame.merge cannot consume. Downcast to
    # a plain DataFrame instead, as `append` does.
    if isinstance(right, PandasMoveDataFrame):
        right = DataFrame(right)
    _merge = super().merge(
        right=right, how=how, on=on, left_on=left_on, right_on=right_on,
        left_index=left_index, right_index=right_index, sort=sort,
        suffixes=suffixes, copy=copy, indicator=indicator, validate=validate
    )
    return PandasMoveDataFrame(data=_merge)
def write_file(self, file_name: str, separator: str = ','):
    """
    Write trajectory data to a new file.

    Parameters
    ----------
    file_name : str
        Represents the filename.
    separator : str, optional
        Represents the information separator in a new file, by default ','
    """
    # Persist as CSV without the index so the file round-trips cleanly.
    self.to_csv(file_name, sep=separator, index=False, encoding='utf-8')
def convert_to(
    self, new_type: str
) -> MoveDataFrame | 'PandasMoveDataFrame' | 'DaskMoveDataFrame':
    """
    Convert an object from one type to another specified by the user.

    Parameters
    ----------
    new_type: 'pandas' or 'dask'
        The type for which the object will be converted.

    Returns
    -------
    A subclass of MoveDataFrameAbstractModel
        The converted object.
    """
    # Fix: operation label was misspelled 'convet_to', polluting the
    # recorded memory/time statistics with a wrong name.
    operation = begin_operation('convert_to')
    if new_type == TYPE_DASK:
        _dask = MoveDataFrame(
            self,
            latitude=LATITUDE,
            longitude=LONGITUDE,
            datetime=DATETIME,
            traj_id=TRAJ_ID,
            type_=TYPE_DASK,
            n_partitions=1,
        )
        self.last_operation = end_operation(operation)
        return _dask
    # Any other target type (including 'pandas') is a no-op: the data
    # is already pandas-backed.
    self.last_operation = end_operation(operation)
    return self
def get_type(self) -> str:
    """
    Returns the type of the object.

    Returns
    -------
    str
        A string representing the type of the object.
    """
    # The backing-type tag is stored on the instance at construction time.
    return self._type