# Source code for pymove.core.pandas

"""PandasMoveDataFrame class."""
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Callable

import numpy as np
from pandas import DataFrame, DateOffset, Series, Timedelta

from pymove.core.dataframe import MoveDataFrame
from pymove.core.grid import Grid
from pymove.utils.constants import (
    DATE,
    DATETIME,
    DAY,
    DAY_PERIODS,
    DIST_PREV_TO_NEXT,
    DIST_TO_NEXT,
    DIST_TO_PREV,
    HOUR,
    HOUR_COS,
    HOUR_SIN,
    LATITUDE,
    LOCAL_LABEL,
    LONGITUDE,
    MOVE,
    PERIOD,
    SITUATION,
    SPEED_PREV_TO_NEXT,
    SPEED_TO_NEXT,
    SPEED_TO_PREV,
    STOP,
    TID,
    TIME_PREV_TO_NEXT,
    TIME_TO_NEXT,
    TIME_TO_PREV,
    TRAJ_ID,
    TYPE_DASK,
    TYPE_PANDAS,
    UID,
    WEEK_DAYS,
    WEEK_END,
)
from pymove.utils.conversions import lat_meters
from pymove.utils.distances import haversine
from pymove.utils.log import logger, progress_bar
from pymove.utils.mem import begin_operation, end_operation
from pymove.utils.trajectories import shift

if TYPE_CHECKING:
    from pymove.core.dask import DaskMoveDataFrame


class PandasMoveDataFrame(DataFrame):
    """PyMove dataframe extending Pandas DataFrame."""

    def __init__(
        self,
        data: DataFrame | list | dict,
        latitude: str = LATITUDE,
        longitude: str = LONGITUDE,
        datetime: str = DATETIME,
        traj_id: str = TRAJ_ID,
    ):
        """
        Checks whether past data has 'lat', 'lon', 'datetime' columns.

        Renames it with the PyMove lib standard. After starts the attributes:

        - self._mgr : Represents trajectory data.
        - self._type : Represents the type of layer below the data structure.
        - self.last_operation : Represents the last operation performed.

        Parameters
        ----------
        data : DataFrame or list or dict
            Input trajectory data
        latitude : str, optional
            Represents column name latitude, by default LATITUDE
        longitude : str, optional
            Represents column name longitude, by default LONGITUDE
        datetime : str, optional
            Represents column name datetime, by default DATETIME
        traj_id : str, optional
            Represents column name trajectory id, by default TRAJ_ID

        Raises
        ------
        KeyError
            If missing one of lat, lon, datetime columns
        ValueError, ParserError
            If the data types can't be converted
        """
        if isinstance(data, dict):
            data = DataFrame.from_dict(data)
        elif isinstance(data, DataFrame):
            data = DataFrame(data)
        elif isinstance(data, (list, np.ndarray)):
            # Map positional columns to the four default labels; any extra
            # columns are labeled by their position.
            zip_list = [LATITUDE, LONGITUDE, DATETIME, TRAJ_ID]
            for i in range(len(data[0])):
                try:
                    zip_list[i] = zip_list[i]
                except IndexError:
                    # BUG FIX: list indexing raises IndexError, not KeyError.
                    # The original `except KeyError` never fired, so rows with
                    # more than four columns crashed the constructor instead
                    # of getting positional column names.
                    zip_list.append(str(i))
            data = DataFrame(data, columns=zip_list)

        columns = MoveDataFrame.format_labels(
            traj_id, latitude, longitude, datetime
        )
        tdf = data.rename(columns=columns)

        if MoveDataFrame.has_columns(tdf):
            MoveDataFrame.validate_move_data_frame(tdf)
            super().__init__(tdf)
            self._type = TYPE_PANDAS
            self.last_operation: dict = None  # type: ignore[assignment]
        else:
            raise KeyError(
                'Couldn\'t instantiate MoveDataFrame because data has missing columns.'
            )

    @property
    def lat(self) -> Series:
        """
        Checks for the LATITUDE column and returns its value.

        Returns
        -------
        Series
            LATITUDE column

        Raises
        ------
        AttributeError
            If the LATITUDE column is not present in the DataFrame
        """
        if LATITUDE not in self:
            raise AttributeError(
                "The MoveDataFrame does not contain the column '%s.'"
                % LATITUDE
            )
        return self[LATITUDE]

    @property
    def lng(self) -> Series:
        """
        Checks for the LONGITUDE column and returns its value.

        Returns
        -------
        Series
            LONGITUDE column

        Raises
        ------
        AttributeError
            If the LONGITUDE column is not present in the DataFrame
        """
        if LONGITUDE not in self:
            raise AttributeError(
                "The MoveDataFrame does not contain the column '%s.'"
                % LONGITUDE
            )
        return self[LONGITUDE]

    @property
    def datetime(self) -> Series:
        """
        Checks for the DATETIME column and returns its value.

        Returns
        -------
        Series
            DATETIME column

        Raises
        ------
        AttributeError
            If the DATETIME column is not present in the DataFrame
        """
        if DATETIME not in self:
            raise AttributeError(
                "The MoveDataFrame does not contain the column '%s.'"
                % DATETIME
            )
        return self[DATETIME]
[docs] def rename( self, mapper: dict | Callable | None = None, index: dict | Callable | None = None, columns: dict | Callable | None = None, axis: int | str | None = None, copy: bool = True, inplace: bool = False ) -> 'PandasMoveDataFrame' | DataFrame | None: """ Alter axes labels. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left as-is. Extra labels listed don’t throw an error. Parameters ---------- mapper : dict or function, optional Dict-like or functions transformations to apply to that axis’ values. Use either mapper and axis to specify the axis to target with mapper, or index and columns, by default None index : dict or function, optional Alternative to specifying axis (mapper, axis=0 is equivalent to index=mapper), by default None columns : dict or function, optional Alternative to specifying axis (mapper, axis=1 is equivalent to columns=mapper), by default None axis : int or str, optional Axis to target with mapper. Can be either the axis name (‘index’, ‘columns’) or number (0, 1), by default None copy : bool, optional Also copy underlying data, by default True inplace : bool, optional Whether to return a new DataFrame. If True then value of copy is ignored, by default False Returns ------- PandasMoveDataFrame, DataFrame DataFrame with the renamed axis labels or None Raises ------ AttributeError If trying to rename a required column inplace """ rename_ = super().rename( mapper=mapper, index=index, columns=columns, axis=axis, copy=copy ) if inplace: if MoveDataFrame.has_columns(rename_): self._mgr = rename_._mgr self._item_cache: dict = dict() rename_ = None else: raise AttributeError( 'Could not rename columns lat, lon, and datetime.' ) if rename_ is not None and MoveDataFrame.has_columns(rename_): rename_ = PandasMoveDataFrame(data=rename_) return rename_
[docs] def len(self) -> int: """ Returns the length/row numbers in trajectory data. Returns ------- int Represents the trajectory data length. """ return self.shape[0]
def __getitem__(self, key): """Retrieves and item from this object.""" item = super().__getitem__(key) if ( isinstance(item, DataFrame) and MoveDataFrame.has_columns(item) ): return PandasMoveDataFrame(item) return item
[docs] def head(self, n: int = 5) -> 'PandasMoveDataFrame': """ Return the first n rows. This function returns the first n rows for the object based on position. It is useful for quickly testing if your object has the right type of data in it. Parameters ---------- n : int, optional Number of rows to select, by default 5 Returns ------- PandasMoveDataFrame The first n rows of the caller object. References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html """ head_ = super().head(n=n) return PandasMoveDataFrame(data=head_)
[docs] def tail(self, n: int = 5) -> 'PandasMoveDataFrame': """ Return the last n rows. This function returns the last n rows for the object based on position. It is useful for quickly testing if your object has the right type of data in it. Parameters ---------- n : int, optional Number of rows to select, by default 5 Returns ------- PandasMoveDataFrame The last n rows of the caller object. References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tail.html """ tail_ = super().tail(n=n) return PandasMoveDataFrame(data=tail_)
[docs] def get_users_number(self) -> int: """ Check and return number of users in trajectory data. Returns ------- int Represents the number of users in trajectory data. """ operation = begin_operation('get_users_numbers') if UID in self: number_ = self[UID].nunique() else: number_ = 1 self.last_operation = end_operation(operation) return number_
[docs] def to_grid( self, cell_size: float, meters_by_degree: float | None = None ) -> Grid: """ Converts trajectory data to grid format. Parameters ---------- cell_size : float Represents grid cell size. meters_by_degree : float, optional Represents the corresponding meters of lat by degree, by default lat_meters(-3.71839) Returns ------- Grid Represents the trajectory in grid format """ operation = begin_operation('to_grid') if meters_by_degree is None: meters_by_degree = lat_meters(-3.71839) grid_ = Grid( data=self, cell_size=cell_size, meters_by_degree=meters_by_degree ) self.last_operation = end_operation(operation) return grid_
[docs] def to_data_frame(self) -> DataFrame: """ Converts trajectory data to DataFrame format. Returns ------- DataFrame Represents the trajectory in DataFrame format. """ return DataFrame(self)
[docs] def to_dicrete_move_df( self, local_label: str = LOCAL_LABEL ) -> 'PandasMoveDataFrame': """ Generate a discrete dataframe move. Parameters ---------- local_label : str, optional Represents the column name of feature local label, default LOCAL_LABEL Returns ------- PandasDiscreteMoveDataFrame Represents an PandasMoveDataFrame discretized. """ operation = begin_operation('to_discrete_move_df') if local_label not in self: raise ValueError( f'columns {local_label} not in df' ) self.last_operation = end_operation(operation) from pymove.core.pandas_discrete import PandasDiscreteMoveDataFrame return PandasDiscreteMoveDataFrame( self, LATITUDE, LONGITUDE, DATETIME, TRAJ_ID, local_label )
[docs] def copy(self, deep: bool = True) -> 'PandasMoveDataFrame': """ Make a copy of this object’s indices and data. When deep=True (default), a new object will be created with a copy of the calling object data and indices. Modifications to the data or indices of the copy will not be reflected in the original object (see notes below). When deep=False, a new object will be created without copying the calling object data or index (only references to the data and index are copied). Any changes to the data of the original will be reflected in the shallow copy (and vice versa). Parameters ---------- deep : bool, optional Make a deep copy, including a copy of the data and the indices. With deep=False neither the indices nor the data are copied, by default True Returns ------- PandasMoveDataFrame Object type matches caller. Notes ----- When deep=True, data is copied but actual Python objects will not be copied recursively, only the reference to the object. This is in contrast to copy.deepcopy in the Standard Library, which recursively copies object data (see examples below). While Index objects are copied when deep=True, the underlying numpy array is not copied for performance reasons. Since Index is immutable, the underlying data can be safely shared and a copy is not needed. """ copy_ = super().copy(deep=deep) return PandasMoveDataFrame(data=copy_)
[docs] def generate_tid_based_on_id_datetime( self, str_format: str = '%Y%m%d%H', sort: bool = True, inplace: bool = True ) -> 'PandasMoveDataFrame' | None: """ Create or update trajectory id based on id and datetime. Parameters ---------- str_format : str, optional Format to consider the datetime, by default '%Y%m%d%H' sort : bool, optional Wether to sort the dataframe, by default True inplace : bool, optional Represents whether the operation will be performed on the data provided or in a copy, by default True Returns ------- PandasMoveDataFrame Object with new features or None """ operation = begin_operation('generate_tid_based_on_id_datetime') if not inplace: data = self.copy() else: data = self logger.debug('\nCreating or updating tid feature...\n') if sort is True: logger.debug( '...Sorting by %s and %s to increase performance\n' % (TRAJ_ID, DATETIME) ) data.sort_values([TRAJ_ID, DATETIME], inplace=True) data[TID] = data[TRAJ_ID].astype(str) + data[ DATETIME ].dt.strftime(str_format) logger.debug('\n...tid feature was created...\n') data.last_operation = end_operation(operation) if not inplace: return data
[docs] def generate_date_features( self, inplace: bool = True ) -> 'PandasMoveDataFrame' | None: """ Create or update date feature based on datetime. Parameters ---------- inplace : bool, optional Represents whether the operation will be performed on the data provided or in a copy, by default True Returns ------- PandasMoveDataFrame Object with new features or None """ operation = begin_operation('generate_date_features') if not inplace: data = self.copy() else: data = self logger.debug('Creating date features...') if DATETIME in self: data[DATE] = data[DATETIME].dt.date logger.debug('..Date features was created...\n') data.last_operation = end_operation(operation) if not inplace: return data
[docs] def generate_hour_features( self, inplace: bool = True ) -> 'PandasMoveDataFrame' | None: """ Create or update hour features based on datetime. Parameters ---------- inplace : bool, optional Represents whether the operation will be performed on the data provided or in a copy, by default True Returns ------- PandasMoveDataFrame Object with new features or None """ operation = begin_operation('generate_hour_features') if not inplace: data = self.copy() else: data = self logger.debug('\nCreating or updating a feature for hour...\n') if DATETIME in self: data[HOUR] = data[DATETIME].dt.hour logger.debug('...Hour feature was created...\n') data.last_operation = end_operation(operation) if not inplace: return data
[docs] def generate_day_of_the_week_features( self, inplace: bool = True ) -> 'PandasMoveDataFrame' | None: """ Create or update day of the week features based on datetime. Parameters ---------- inplace : bool, optional Represents whether the operation will be performed on the data provided or in a copy, by default True Returns ------- PandasMoveDataFrame Object with new features or None """ operation = begin_operation('generate_day_of_the_week_features') if not inplace: data = self.copy() else: data = self logger.debug('\nCreating or updating day of the week feature...\n') data[DAY] = data[DATETIME].dt.day_name() logger.debug('...the day of the week feature was created...\n') data.last_operation = end_operation(operation) if not inplace: return data
[docs] def generate_weekend_features( self, create_day_of_week: bool = False, inplace: bool = True ) -> 'PandasMoveDataFrame' | None: """ Adds information to rows determining if it is a weekend day. Create or update the feature weekend to the dataframe, if this resource indicates that the given day is the weekend, otherwise, it is a day of the week. Parameters ---------- create_day_of_week : bool, optional Indicates if the column day should be keeped in the dataframe. If set to False the column will be dropped, by default False inplace : bool, optional Represents whether the operation will be performed on the data provided or in a copy, by default True Returns ------- PandasMoveDataFrame Object with new features or None """ operation = begin_operation('generate_weekend_features') if not inplace: data = self.copy() else: data = self data.generate_day_of_the_week_features(inplace=True) logger.debug('Creating or updating a feature for weekend\n') if DAY in data: fds = (data[DAY] == WEEK_DAYS[5]) | (data[DAY] == WEEK_DAYS[6]) index_fds = data[fds].index data[WEEK_END] = 0 data.at[index_fds, WEEK_END] = 1 logger.debug('...Weekend was set as 1 or 0...\n') if not create_day_of_week: logger.debug('...dropping colum day\n') del data[DAY] data.last_operation = end_operation(operation) if not inplace: return data
[docs] def generate_time_of_day_features( self, inplace: bool = True ) -> 'PandasMoveDataFrame' | None: """ Create or update time of day features based on datetime. Parameters ---------- inplace : bool, optional Represents whether the operation will be performed on the data provided or in a copy, by default True Returns ------- PandasMoveDataFrame Object with new features or None Early morning from 0H to 6H Morning from 6H to 12H Afternoon from 12H to 18H Evening from 18H to 24H Examples -------- - datetime1 = 2019-04-28 02:00:56 -> period = Early Morning - datetime2 = 2019-04-28 08:00:56 -> period = Morning - datetime3 = 2019-04-28 14:00:56 -> period = Afternoon - datetime4 = 2019-04-28 20:00:56 -> period = Evening """ operation = begin_operation('generate_time_of_day_features') if not inplace: data = self.copy() else: data = self periods = [ '\n' 'Creating or updating period feature', '...Early morning from 0H to 6H', '...Morning from 6H to 12H', '...Afternoon from 12H to 18H', '...Evening from 18H to 24H' '\n', ] logger.debug('\n'.join(periods)) hours = data[DATETIME].dt.hour conditions = [ (hours >= 0) & (hours < 6), (hours >= 6) & (hours < 12), (hours >= 12) & (hours < 18), (hours >= 18) & (hours < 24), ] data[PERIOD] = np.select(conditions, DAY_PERIODS, 'undefined') logger.debug('...the period of day feature was created') data.last_operation = end_operation(operation) if not inplace: return data
[docs] def generate_datetime_in_format_cyclical( self, label_datetime: str = DATETIME, inplace: bool = True ) -> 'PandasMoveDataFrame' | None: """ Create or update column with cyclical datetime feature. Parameters ---------- label_datetime : str, optional Represents column id type, by default DATETIME inplace : bool, optional Represents whether the operation will be performed on the data provided or in a copy, by default True Returns ------- PandasMoveDataFrame Object with new features or None References ---------- https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/ https://www.avanwyk.com/encoding-cyclical-features-for-deep-learning/ """ operation = begin_operation('generate_datetime_in_format_cyclical') if not inplace: data = self.copy() else: data = self logger.debug('Encoding cyclical continuous features - 24-hour time') if label_datetime in data: hours = self[label_datetime].dt.hour data[HOUR_SIN] = np.sin(2 * np.pi * hours / 23.0) data[HOUR_COS] = np.cos(2 * np.pi * hours / 23.0) logger.debug('...hour_sin and hour_cos features were created...\n') data.last_operation = end_operation(operation) if not inplace: return data
@staticmethod def _prepare_generate_data( data_: DataFrame, sort: bool, label_id: str ) -> tuple[Any, int, None]: """ Processes the data and create variables for generate methods. Parameters ---------- data_ : DataFrame Dataframe to be processed. sort : bool Whether to sort the data. label_id : str Name of the label feature. Returns ------- Tuple[List, int, int, int] data_ unique ids. size of id. starting index """ if sort is True: logger.debug( '...Sorting by %s and %s to increase performance\n' % (label_id, DATETIME) ) data_.sort_values([label_id, DATETIME]) if data_.index.name is None: logger.debug( '...Set %s as index to a higher performance\n' % label_id ) data_.set_index(label_id, inplace=True) ids = data_.index.unique() size_id = 0 idx = None return ids, size_id, idx
[docs] def generate_dist_time_speed_features( self, label_id: str = TRAJ_ID, label_dtype: Callable = np.float64, sort: bool = True, inplace: bool = True ) -> 'PandasMoveDataFrame' | None: """ Adds distance, time and speed information to the dataframe. Firstly, create the three distance to an GPS point P (lat, lon). After, create two time features to point P: time to previous and time to next. Lastly, create two features to speed using time and distance features. Parameters ---------- label_id : str, optional Represents name of column of trajectories id, by default TRAJ_ID label_dtype : callable, optional Represents column id type, by default np.float64 sort : bool, optional If sort == True the dataframe will be sorted, by True inplace : bool, optional Represents whether the operation will be performed on the data provided or in a copy, by default True Returns ------- PandasMoveDataFrame Object with new features or None Examples -------- - dist_to_prev = 248.33 meters, dist_to_prev 536.57 meters - time_to_prev = 60 seconds, time_prev = 60.0 seconds - speed_to_prev = 4.13 m/srs, speed_prev = 8.94 m/srs. 
""" operation = begin_operation('generate_dist_time_speed_features') if not inplace: data = self.copy() else: data = self ids, size_id, idx = self._prepare_generate_data( data, sort, label_id ) message = '\nCreating or updating distance, time and speed features' message += ' in meters by seconds\n' logger.debug( message ) # create new feature to distance data[DIST_TO_PREV] = label_dtype(-1.0) # create new feature to time data[TIME_TO_PREV] = label_dtype(-1.0) # create new feature to speed data[SPEED_TO_PREV] = label_dtype(-1.0) for idx in progress_bar( ids, desc='Generating distance, time and speed features' ): curr_lat = data.at[idx, LATITUDE] curr_lon = data.at[idx, LONGITUDE] size_id = curr_lat.size if size_id <= 1: data.at[idx, DIST_TO_PREV] = np.nan data.at[idx, TIME_TO_PREV] = np.nan data.at[idx, SPEED_TO_PREV] = np.nan else: prev_lat = shift(curr_lat, 1) prev_lon = shift(curr_lon, 1) # compute distance from previous to current point data.at[idx, DIST_TO_PREV] = haversine( prev_lat, prev_lon, curr_lat, curr_lon ) time_ = data.at[idx, DATETIME].values.astype(label_dtype) time_prev = (time_ - shift(time_, 1)) * (10 ** -9) data.at[idx, TIME_TO_PREV] = time_prev # set speed features data.at[idx, SPEED_TO_PREV] = ( data.at[idx, DIST_TO_PREV] / time_prev ) # unit: m/srs data.reset_index(inplace=True) data.last_operation = end_operation(operation) if not inplace: return data
[docs] def generate_dist_features( self, label_id: str = TRAJ_ID, label_dtype: Callable = np.float64, sort: bool = True, inplace: bool = True ) -> 'PandasMoveDataFrame' | None: """ Create the three distance in meters to an GPS point P. Parameters ---------- label_id : str, optional Represents name of column of trajectories id, by default TRAJ_ID label_dtype : callable, optional Represents column id type, by default np.float64 sort : bool, optional If sort == True the dataframe will be sorted, by True inplace : bool, optional Represents whether the operation will be performed on the data provided or in a copy, by default True Returns ------- PandasMoveDataFrame Object with new features or None Examples -------- - P to P.next = 2 meters - P to P.previous = 1 meter - P.previous to P.next = 1 meters """ operation = begin_operation('generate_dist_features') if not inplace: data = self.copy() else: data = self ids, size_id, idx = self._prepare_generate_data( data, sort, label_id ) logger.debug('\nCreating or updating distance features in meters...\n') # create ou update columns data[DIST_TO_PREV] = label_dtype(-1.0) data[DIST_TO_NEXT] = label_dtype(-1.0) data[DIST_PREV_TO_NEXT] = label_dtype(-1.0) for idx in progress_bar(ids, desc='Generating distance features'): curr_lat = data.at[idx, LATITUDE] curr_lon = data.at[idx, LONGITUDE] size_id = curr_lat.size if size_id <= 1: data.at[idx, DIST_TO_PREV] = np.nan else: prev_lat = shift(curr_lat, 1) prev_lon = shift(curr_lon, 1) # compute distance from previous to current point data.at[idx, DIST_TO_PREV] = haversine( prev_lat, prev_lon, curr_lat, curr_lon ) next_lat = shift(curr_lat, -1) next_lon = shift(curr_lon, -1) # compute distance to next point data.at[idx, DIST_TO_NEXT] = haversine( curr_lat, curr_lon, next_lat, next_lon ) # using pandas shift in a large dataset: 7min 21s # using numpy shift above: 33.6 srs # use distance from previous to next data.at[idx, DIST_PREV_TO_NEXT] = haversine( prev_lat, prev_lon, next_lat, 
next_lon ) data.reset_index(inplace=True) data.last_operation = end_operation(operation) if not inplace: return data
[docs] def generate_time_features( self, label_id: str = TRAJ_ID, label_dtype: Callable = np.float64, sort: bool = True, inplace: bool = True ) -> 'PandasMoveDataFrame' | None: """ Create the three time in seconds to an GPS point P. Parameters ---------- label_id : str, optional Represents name of column of trajectories id, by default TRAJ_ID label_dtype : callable, optional Represents column id type, by default np.float64 sort : bool, optional If sort == True the dataframe will be sorted, by True inplace : bool, optional Represents whether the operation will be performed on the data provided or in a copy, by default True Returns ------- PandasMoveDataFrame Object with new features or None Examples -------- - P to P.next = 5 seconds - P to P.previous = 15 seconds - P.previous to P.next = 20 seconds """ operation = begin_operation('generate_time_features') if not inplace: data = self.copy() else: data = self ids, size_id, idx = self._prepare_generate_data( data, sort, label_id ) logger.debug( '\nCreating or updating time features seconds\n' ) # create new feature to time data[TIME_TO_PREV] = label_dtype(-1.0) data[TIME_TO_NEXT] = label_dtype(-1.0) data[TIME_PREV_TO_NEXT] = label_dtype(-1.0) for idx in progress_bar( ids, desc='Generating time features' ): curr_time = data.at[idx, DATETIME].values.astype(label_dtype) size_id = curr_time.size if size_id <= 1: data.at[idx, TIME_TO_PREV] = np.nan else: prev_time = shift(curr_time, 1) time_prev = (curr_time - prev_time) * (10 ** -9) data.at[idx, TIME_TO_PREV] = time_prev next_time = shift(curr_time, -1) time_prev = (next_time - curr_time) * (10 ** -9) data.at[idx, TIME_TO_NEXT] = time_prev time_prev_to_next = (next_time - prev_time) * (10 ** -9) data.at[idx, TIME_PREV_TO_NEXT] = time_prev_to_next data.reset_index(inplace=True) data.last_operation = end_operation(operation) if not inplace: return data
[docs] def generate_speed_features( self, label_id: str = TRAJ_ID, label_dtype: Callable = np.float64, sort: bool = True, inplace: bool = True ) -> 'PandasMoveDataFrame' | None: """ Create the three speed in meter by seconds to an GPS point P. Parameters ---------- label_id : str, optional Represents name of column of trajectories id, by default TRAJ_ID label_dtype : callable, optional Represents column id type, by default np.float64 sort : bool, optional If sort == True the dataframe will be sorted, by True inplace : bool, optional Represents whether the operation will be performed on the data provided or in a copy, by default True Returns ------- PandasMoveDataFrame Object with new features or None Raises ------ ValueError If feature generation fails Examples -------- - P to P.next = 1 meter/seconds - P to P.previous = 3 meter/seconds - P.previous to P.next = 2 meter/seconds """ operation = begin_operation('generate_speed_features') if not inplace: data = self.copy() else: data = self logger.debug( '\nCreating or updating speed features meters by seconds\n' ) dists = data.generate_dist_features( label_id, label_dtype, sort, inplace=False ) times = data.generate_time_features( label_id, label_dtype, sort, inplace=False ) if dists is None or times is None: raise ValueError('Geretating dist or time feature failed') data[SPEED_TO_PREV] = dists[DIST_TO_PREV] / times[TIME_TO_PREV] data[SPEED_TO_NEXT] = dists[DIST_TO_NEXT] / times[TIME_TO_NEXT] d_prev_next = dists[DIST_TO_PREV] + dists[DIST_TO_NEXT] data[SPEED_PREV_TO_NEXT] = d_prev_next / times[TIME_PREV_TO_NEXT] self._prepare_generate_data( data, sort, label_id ) data.reset_index(inplace=True) data.last_operation = end_operation(operation) if not inplace: return data
[docs] def generate_move_and_stop_by_radius( self, radius: float = 0, target_label: str = DIST_TO_PREV, inplace: bool = True ): """ Create or update column with move and stop points by radius. Parameters ---------- radius : float, optional Represents radius, by default 0 target_label : str, optional Represents column to compute, by default DIST_TO_PREV inplace : bool, optional Represents whether the operation will be performed on the data provided or in a copy, by default True Returns ------- PandasMoveDataFrame Object with new features or None """ operation = begin_operation('generate_move_and_stop_by_radius') if not inplace: data = self.copy() else: data = self data.generate_dist_features(inplace=True) logger.debug('\nCreating or updating features MOVE and STOPS...\n') conditions = ( (data[target_label] > radius), (data[target_label] <= radius), ) choices = [MOVE, STOP] data[SITUATION] = np.select(conditions, choices, np.nan) logger.debug( '\n....There are %s stops to this parameters\n' % (data[data[SITUATION] == STOP].shape[0]) ) data.last_operation = end_operation(operation) if not inplace: return data
[docs] def time_interval(self) -> Timedelta: """ Get time difference between max and min datetime in trajectory data. Returns ------- Timedelta Represents the time difference. """ operation = begin_operation('time_interval') time_diff = self[DATETIME].max() - self[DATETIME].min() self.last_operation = end_operation(operation) return time_diff
[docs] def get_bbox(self) -> tuple[float, float, float, float]: """ Returns the bounding box of the dataframe. A bounding box (usually shortened to bbox) is an area defined by two longitudes and two latitudes, where: - Latitude is a decimal number between -90.0 and 90.0. - Longitude is a decimal number between -180.0 and 180.0. They usually follow the standard format of: - bbox = left, bottom, right, top - bbox = min Longitude , min Latitude , max Longitude , max Latitude Returns ------- Tuple[float, float, float, float]: Represents a bound box, that is a tuple of 4 values with the min and max limits of latitude e longitude. lat_min, lon_min, lat_max, lon_max Examples -------- (22.147577, 113.54884299999999, 41.132062, 121.156224) """ operation = begin_operation('get_bbox') bbox_ = ( self[LATITUDE].min(), self[LONGITUDE].min(), self[LATITUDE].max(), self[LONGITUDE].max(), ) self.last_operation = end_operation(operation) return bbox_
[docs] def show_trajectories_info(self): """ Show dataset information from dataframe. Displays the number of rows, datetime interval, and bounding box. Examples -------- ====================== INFORMATION ABOUT DATASET ====================== Number of Points: 217654 Number of IDs objects: 2 Start Date:2008-10-23 05:53:05 End Date:2009-03-19 05:46:37 Bounding Box:(22.147577, 113.54884299999999, 41.132062, 121.156224) ======================================================================= """ operation = begin_operation('show_trajectories_info') message = ('=' * 22) + ' INFORMATION ABOUT DATASET ' + ('=' * 22) print( '\n%s\n' % message ) print('Number of Points: %s\n' % self.shape[0]) if TRAJ_ID in self: print( 'Number of IDs objects: %s\n' % self[TRAJ_ID].nunique() ) if TID in self: print( 'Number of TIDs trajectory: %s\n' % self[TID].nunique() ) if DATETIME in self: dt_max = self[DATETIME].max() dt_min = self[DATETIME].min() print( 'Start Date:%s End Date:%s\n' % (dt_min, dt_max) ) if LATITUDE and LONGITUDE in self: print( f'Bounding Box:{self.get_bbox()}\n' ) # bbox return = Lat_min , Long_min, Lat_max, Long_max if TIME_TO_PREV in self: t_max = round(self[TIME_TO_PREV].max(), 3) t_min = round(self[TIME_TO_PREV].min(), 3) print( 'Gap time MAX:%s Gap time MIN:%s\n' % (t_max, t_min) ) if SPEED_TO_PREV in self: s_max = round(self[SPEED_TO_PREV].max(), 3) s_min = round(self[SPEED_TO_PREV].min(), 3) print( 'Speed MAX:%s Speed MIN:%s\n' % (s_max, s_min) ) if DIST_TO_PREV in self: d_max = round(self[DIST_TO_PREV].max(), 3) d_min = round(self[DIST_TO_PREV].min(), 3) print( 'Distance MAX:%s Distance MIN:%s\n' % (d_max, d_min) ) print( '\n%s\n' % ('=' * len(message)) ) self.last_operation = end_operation(operation)
[docs] def astype( self, dtype: Callable | dict, copy: bool = True, errors: str = 'raise' ) -> DataFrame: """ Cast a pandas object to a specified dtype. Parameters ---------- dtype: callable, dict Use a numpy.dtype or Python type to cast entire pandas object to the same type. Alternatively, use {col: dtype, …}, where col is a column label and dtype is a numpy.dtype or Python type to cast one or more of the DataFrame columns to column-specific types. copy: bool, optional Return a copy when copy=True (be very careful setting copy=False as changes to values then may propagate to other pandas objects), by default True errors: str, optional Control raising of exceptions on invalid data for provided dtype, by default 'raise - raise : allow exceptions to be raised - ignore : suppress exceptions. On error return original object Returns ------- DataFrame Casted object to specified type. References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html Raises ------ AttributeError If trying to change required types inplace """ if not copy and isinstance(dtype, str): raise AttributeError( 'Could not change lat, lon, and datetime type.' ) elif not copy and isinstance(dtype, dict): keys = set(list(dtype.keys())) columns = {LATITUDE, LONGITUDE, DATETIME} if keys & columns: raise AttributeError( 'Could not change lat, lon, and datetime type.' ) return super().astype(dtype=dtype, copy=copy, errors=errors)
[docs] def sort_values( self, by: str | list[str], axis: int = 0, ascending: bool = True, inplace: bool = False, kind: str = 'quicksort', na_position: str = 'last', ) -> 'PandasMoveDataFrame' | None: """ Sorts the values of the _data, along an axis. Parameters ---------- by: str, list Name or list of names to sort the _data by axis: int, optional if set to 0 or 'index', will count for each column. if set to 1 or 'columns', will count for each row by default 0 ascending: bool, optional Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bool, must match the length, by default True inplace: bool, optional if set to true the original dataframe will be altered, the duplicates will be dropped in place, otherwise the operation will be made in a copy, that will be returned, by default False kind: str, optional Choice of sorting algorithm, 'quicksort', 'mergesort', 'heapsort' For DataFrames, this option is only applied when sorting on a single column or label, by default 'quicksort' na_position: str, optional 'first', 'last', by default 'last If 'first' puts NaNs at the beginning; If last puts NaNs at the end. Returns ------- PandasMoveDataFrame The sorted dataframe or None References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html """ _sort_values = super().sort_values( by=by, axis=axis, ascending=ascending, inplace=False, kind=kind, na_position=na_position ) if inplace: self._mgr = _sort_values._mgr self._item_cache = dict() return None return PandasMoveDataFrame(data=_sort_values)
    def reset_index(
        self,
        level: int | str | tuple | list | None = None,
        drop: bool = False,
        inplace: bool = False,
        col_level: int | str = 0,
        col_fill: str = ''
    ) -> 'PandasMoveDataFrame' | None:
        """
        Resets the DataFrame's index, and use the default one.

        One or more levels can be removed, if the DataFrame has a MultiIndex.

        Parameters
        ----------
        level: int or str or tuple or list, optional
            Only the levels specify will be removed from the index
            If set to None, all levels are removed, by default None
        drop: bool, optional
            Do not try to insert index into dataframe columns
            This resets the index to the default integer index,
            by default False
        inplace: bool, optional
            Modify the DataFrame in place (do not create a new object),
            by default False
        col_level: int or str, optional
            If the columns have multiple levels, determines which level
            the labels are inserted into, by default 0
        col_fill: str, optional
            If the columns have multiple levels, determines how the other
            levels are named
            If None then the index name is repeated, by default ''

        Returns
        -------
        PandasMoveDataFrame
            The dataframe with the index reset, or None if inplace=True

        References
        ----------
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html
        """
        # Always delegate with inplace=False; the in-place case is handled
        # below by swapping the internal manager.
        _reset_index = super().reset_index(
            level=level,
            drop=drop,
            inplace=False,
            col_level=col_level,
            col_fill=col_fill
        )
        if inplace:
            self._mgr = _reset_index._mgr
            # clear cached column views so they are rebuilt from the new manager
            self._item_cache = dict()
            return None

        return PandasMoveDataFrame(data=_reset_index)
    def set_index(
        self,
        keys: str | list[str],
        drop: bool = True,
        append: bool = False,
        inplace: bool = False,
        verify_integrity: bool = False,
    ) -> 'PandasMoveDataFrame' | DataFrame | None:
        """
        Set the DataFrame index (row labels) using one or more existing columns or arrays.

        Parameters
        ----------
        keys: str, list
            label or array-like or list of labels/arrays
            This parameter can be either a single column key, a single
            array of the same length as the calling DataFrame, or a list
            containing an arbitrary combination of column keys and arrays
        drop: bool, optional
            Delete columns to be used as the new index, by default True
        append: bool, optional
            Whether to append columns to existing index, by default False
        inplace: bool, optional
            Modify the DataFrame in place (do not create a new object),
            by default False
        verify_integrity: bool, optional
            Check the new index for duplicates
            Otherwise defer the check until necessary
            Setting to False will improve the performance of this method,
            by default False

        Returns
        -------
        PandasMoveDataFrame, DataFrame
            Object with a new index or None

        References
        ----------
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.set_index.html

        Raises
        ------
        AttributeError
            If trying to change required columns types
        """
        # Moving lat/lon/datetime into the index with drop=True would remove
        # required columns from the frame, so reject that combination.
        if inplace and drop:
            if isinstance(keys, str):
                aux = {keys}
            else:
                aux = set(keys)
            columns = {LATITUDE, LONGITUDE, DATETIME}
            if aux & columns:
                raise AttributeError(
                    'Could not change lat, lon, and datetime type.'
                )

        _set_index = super().set_index(
            keys=keys, drop=drop, append=append,
            inplace=False, verify_integrity=verify_integrity
        )
        if inplace:
            self._mgr = _set_index._mgr
            # clear cached column views so they are rebuilt from the new manager
            self._item_cache = dict()
            _set_index = None

        # Only wrap the result back into a PandasMoveDataFrame when the
        # required trajectory columns survived the index change.
        if _set_index is not None and MoveDataFrame.has_columns(_set_index):
            _set_index = PandasMoveDataFrame(data=_set_index)

        return _set_index
[docs] def drop( self, labels: str | list[str] | None = None, axis: int | str = 0, index: str | list[str] | None = None, columns: str | list[str] | None = None, level: int | str | None = None, inplace: bool = False, errors: str = 'raise', ) -> 'PandasMoveDataFrame' | DataFrame | None: """ Removes rows or columns. By specifying label names and corresponding axis, or by specifying directly index or column names. When using a multiindex, labels on different levels can be removed by specifying the level. Parameters ---------- labels: str or list, optional Index or column labels to drop, by default None axis: int or str, optional Whether to drop labels from the index (0 or 'index') or columns (1 or 'columns'), by default 0 index: str or list, optional Alternative to specifying axis (labels, axis=0 is equivalent to index=labels), by default None columns: str or list, optional Alternative to specifying axis (labels, axis=1 is equivalent to columns=labels), by default None level: str or int, optional For MultiIndex, level from which the labels will be removed, by default None inplace: bool, optional If True, do operation inplace and return None Otherwise, make a copy, do operations and return, by default False errors: bool, optional 'ignore', 'raise', by default 'raise' If 'ignore', suppress error and only existing labels are dropped. Returns ------- PandasMoveDataFrame, DataFrame Object without the removed index or column labels or None Raises ------ AttributeError If trying to drop a required column inplace KeyError If any of the labels is not found in the selected axis. 
References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html """ if inplace: _labels1 = set() _labels2 = set() if labels is not None: if isinstance(labels, str): _labels1 = {labels} else: _labels1 = set(labels) elif columns is not None: if isinstance(columns, str): _labels2 = {columns} else: _labels2 = set(columns) _columns = {LATITUDE, LONGITUDE, DATETIME} if ( (axis == 1 or axis == 'columns' or columns) and (_labels1.union(_labels2) & _columns) ): raise AttributeError( 'Could not drop columns lat, lon, and datetime.' ) _drop = super().drop( labels=labels, axis=axis, index=index, columns=columns, level=level, inplace=False, errors=errors ) if inplace: self._mgr = _drop._mgr self._item_cache = dict() _drop = None if _drop is not None and MoveDataFrame.has_columns(_drop): _drop = PandasMoveDataFrame(data=_drop) return _drop
[docs] def drop_duplicates( self, subset: int | str | None = None, keep: str | bool = 'first', inplace: bool = False ) -> 'PandasMoveDataFrame' | None: """ Uses the pandas's function drop_duplicates, to remove duplicated rows from data. Parameters ---------- subset: int or str, optional Only consider certain columns for identifying duplicates, by default use all of the columns, by default None keep: str, optional - first : Drop duplicates except for the first occurrence. - last : Drop duplicates except for the last occurrence. - False : Drop all duplicates. by default 'first' inplace: bool, optional Whether to drop duplicates in place or to return a copy, by default False Returns ------- PandasMoveDataFrame Object with duplicated rows or None References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html """ _drop_duplicates = super().drop_duplicates( subset=subset, keep=keep, inplace=False ) if inplace: self._mgr = _drop_duplicates._mgr self._item_cache = dict() return None return PandasMoveDataFrame(data=_drop_duplicates)
[docs] def shift( self, periods: int = 1, freq: DateOffset | Timedelta | str | None = None, axis: int | str = 0, fill_value: Any | None = None ) -> 'PandasMoveDataFrame': """ Shift index by desired number of periods with an optional time freq. Parameters ---------- periods: int, optional, default 1 Number of periods to shift. Can be positive or negative. freq: DateOffset or Timedelta or str, optional, default None Offset to use from the series module or time rule (e.g. 'EOM'). If freq is specified then the index values are shifted but the data is not realigned. That is, use freq if you would like to extend the index when shifting and preserve the original data. When freq is not passed, shift the index without realigning the data. If freq is passed (in this case, the index must be date or datetime, or it will raise a NotImplementedError), the index will be increased using the periods and the freq. axis: 0 or 'index', 1 or 'columns', None, optional, default 0 Shift direction. fill_value: object, optional, default None The scalar value to use for newly introduced missing values. The default depends on the dtype of self. For numeric data, np.nan is used. For datetime, timedelta, or period data, etc. NaT is used. For extension dtypes, self.dtype.na_value is used. Returns ------- PandasMoveDataFrame A copy of the original object, shifted. References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.shift.html """ _shift = super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) return PandasMoveDataFrame(data=_shift)
[docs] def fillna( self, value: Any | None = None, method: str | None = None, axis: int | str | None = None, inplace: bool = False, limit: int | None = None, downcast: dict | None = None, ): """ Fill NA/NaN values using the specified method. Parameters ---------- value : scalar, dict, Series, or DataFrame Value to use to fill holes (e.g. 0), alternately a dict/Series/DataFrame of values specifying which value to use for each index (for a Series) or column (for a DataFrame). Values not in the dict/Series/DataFrame will not be filled. This value cannot be a list. method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None Method to use for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid backfill / bfill: use next valid observation to fill gap. axis : {0 or 'index', 1 or 'columns'} Axis along which to fill missing values. inplace : bool, default False If True, fill in-place. Note: this will modify any other views on this object (e.g., a no-copy slice for a column in a DataFrame). limit : int, default None If method is specified, this is the maximum number of consecutive NaN values to forward/backward fill. In other words, if there is a gap with more than this number of consecutive NaNs, it will only be partially filled. If method is not specified, this is the maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. downcast : dict, default is None A dict of item->dtype of what to downcast if possible, or the str 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). 
Returns ------- PandasMoveDataFrame Object with missing values filled or None References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html """ _fillna = super().fillna( value=value, method=method, axis=axis, inplace=False, limit=limit, downcast=downcast ) if inplace: self._mgr = _fillna._mgr self._item_cache = dict() return None return PandasMoveDataFrame(data=_fillna)
[docs] def dropna( self, axis: int | str = 0, how: str = 'any', thresh: float | None = None, subset: list | None = None, inplace: bool = False ): """ Removes missing data. Parameters ---------- axis: 0 or 'index', 1 or 'columns', None, optional Determine if rows or columns are removed, by default 0 - 0, or 'index' : Drop rows which contain missing values. - 1, or 'columns' : Drop columns which contain missing value. how: str, optional Determine if row or column is removed from DataFrame, by default 'any when we have at least one NA or all NA. - 'any' : If any NA values are present, drop that row or column. - 'all' : If all values are NA, drop that row or column. thresh: float, optional Require that many non-NA values, by default None subset: array-like, optional Labels along other axis to consider, by default None e.g. if you are dropping rows these would be a list of columns to include. inplace: bool, optional If True, do operation inplace and return None, by default False Returns ------- PandasMoveDataFrame Object with NA entries dropped or None References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html Raises ------ AttributeError If trying to drop required columns inplace """ if inplace: if axis == 1 or axis == 'columns': columns = [LATITUDE, LONGITUDE, DATETIME] data = self[columns] if data.isnull().values.any(): raise AttributeError( 'Could not drop columns lat, lon, and datetime.' ) _dropna = super().dropna( axis=axis, how=how, thresh=thresh, subset=subset, inplace=False) if inplace: self._mgr = _dropna._mgr self._item_cache = dict() _dropna = None if _dropna is not None and MoveDataFrame.has_columns(_dropna): _dropna = PandasMoveDataFrame(data=_dropna) return _dropna
[docs] def sample( self, n: int | None = None, frac: float | None = None, replace: bool = False, weights: str | list | None = None, random_state: int | None = None, axis: int | str | None = None ) -> 'PandasMoveDataFrame': """ Return a random sample of items from an axis of object. You can use `random_state` for reproducibility. Parameters ---------- n : int, optional Number of items from axis to return. Cannot be used with `frac`, by default None frac : float, optional Fraction of axis items to return. Cannot be used with `n`, by deault None replace : bool, optional Allow or disallow sampling of the same row more than once, by default False weights : str or ndarray-like, optional If 'None' results in equal probability weighting. If passed a Series, will align with target object on index. Index values in weights not found in sampled object will be ignored and index values in sampled object not in weights will be assigned weights of zero. If called on a DataFrame, will accept the name of a column when axis = 0. Unless weights are a Series, weights must be same length as axis being sampled. If weights do not sum to 1, they will be normalized to sum to 1. Missing values in the weights column will be treated as zero. Infinite values not allowed. by default None random_state : int or numpy.random.RandomState, optional Seed for the random number generator (if int), or numpy RandomState object,by default None axis : {0 or 'index', 1 or 'columns', None}, optional Axis to sample. Accepts axis number or name. Default is stat axis for given data type (0 for Series and DataFrames), by default None Returns ------- PandasMoveDataFrame A new object of same type as caller containing `n` items randomly sampled from the caller object. See Also -------- numpy.random.choice: Generates a random sample from a given 1-D numpy array. Notes ----- If `frac` > 1, `replacement` should be set to `True`. 
References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html """ _sample = super().sample( n=n, frac=frac, replace=replace, weights=weights, random_state=random_state, axis=axis ) return PandasMoveDataFrame(data=_sample)
[docs] def isin(self, values: list | Series | DataFrame | dict) -> DataFrame: """ Determines whether each element in the DataFrame is contained in values. values : iterable, Series, DataFrame or dict The result will only be true at a location if all the labels match. If values is a Series, the index. If values is a dict, the keys must be the column names, which must match. If values is a DataFrame, then both the index and column labels must match. Returns ------- DataFrame: DataFrame of booleans showing whether each element in the DataFrame is contained in values References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.isin.html """ return DataFrame(self).isin(values)
[docs] def append( self, other: 'PandasMoveDataFrame' | DataFrame, ignore_index: bool = False, verify_integrity: bool = False, sort: bool = False ) -> 'PandasMoveDataFrame': """ Append rows of other to the end of caller, returning a new object. Columns in other that are not in the caller are added as new columns. Parameters ---------- other : DataFrame or Series/dict-like object, or list of these The data to append. ignore_index : bool, optional If True, do not use the index labels, by default False verify_integrity : bool, optional If True, raise ValueError on creating index with duplicates, by default False sort : bool, optional Sort columns if the columns of self and other are not aligned The default sorting is deprecated and will change to not-sorting in a future version of pandas. by default False Returns ------- PandasMoveDataFrame A dataframe containing rows from both the caller and `other`. References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html """ if isinstance(other, PandasMoveDataFrame): other = DataFrame(other) _append = super().append( other=other, ignore_index=ignore_index, verify_integrity=verify_integrity, sort=sort ) return PandasMoveDataFrame(data=_append)
[docs] def join( self, other: 'PandasMoveDataFrame' | DataFrame, on: str | list | None = None, how: str = 'left', lsuffix: str = '', rsuffix: str = '', sort: bool = False ) -> 'PandasMoveDataFrame': """ Join columns of other, returning a new object. Join columns with `other` PandasMoveDataFrame either on index or on a key column. Efficiently join multiple DataFrame objects by index at once by passing a list. Parameters ---------- other : DataFrame, Series, or list of DataFrame Index should be similar to one of the columns in this one. If a Series is passed, its name attribute must be set, and that will be used as the column name in the resulting joined DataFrame. on : str or list of str or array-like, optional Column or index level name(srs) in the caller to join on the index in `other`, otherwise joins index-on-index. If multiple values given, the `other` DataFrame must have a MultiIndex. Can pass an array as the join key if it is not already contained in the calling DataFrame. Like an Excel VLOOKUP operation. how : {'left', 'right', 'outer', 'inner'}, optional How to handle the operation of the two objects, by default 'left' * left: use calling frame index (or column if on is specified) * right: use `other` index. * outer: form union of calling frame index (or column if on is specified) with `other` index, and sort it. lexicographically. * inner: form intersection of calling frame index (or column if on is specified) with `other` index, preserving the order of the calling one. lsuffix : str, optional Suffix to use from left frame overlapping columns, by default '' rsuffix : str, optional Suffix to use from right frame overlapping columns, by default '' sort : bool, optional Order result DataFrame lexicographically by the join key. If False, the order of the join key depends on the join type (how keyword) Returns ------- PandasMoveDataFrame A dataframe containing columns from both the caller and `other`. 
Notes ----- Parameters `on`, `lsuffix`, and `rsuffix` are not supported when passing a list of `DataFrame` objects. References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.join.html """ if isinstance(other, PandasMoveDataFrame): other = other._data _join = super().join( other=other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort) return PandasMoveDataFrame(data=_join)
[docs] def merge( self, right: 'PandasMoveDataFrame' | DataFrame | Series, how: str = 'inner', on: str | list | None = None, left_on: str | list | None = None, right_on: str | list | None = None, left_index: bool = False, right_index: bool = False, sort: bool = False, suffixes: tuple[str, str] = ('_x', '_y'), copy: bool = True, indicator: bool | str = False, validate: str | None = None ) -> 'PandasMoveDataFrame': """ Merge DataFrame or named Series objects with a database-style join. The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes will be ignored. Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. Parameters ---------- right: DataFrame or named Series Object to merge with. how: {‘left’, ‘right’, ‘outer’, ‘inner’}, optional Type of merge to be performed, by default ‘inner’ left: use only keys from left frame, similar to a SQL left outer join; preserve key order. right: use only keys from right frame, similar to a SQL right outer join; preserve key order. outer: use union of keys from both frames, similar to a SQL full outer join; sort keys lexicographically. inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. on: label or list, optional Column or index level names to join on. These must be found in both DataFrames. If on is None and not merging on indexes then this defaults to the intersection of the columns in both DataFrames, by default None left_on: str or list or array-like, optional Column or index level names to join on in the left DataFrame. Can also be an array or list of arrays of the length of the left DataFrame. These arrays are treated as if they are columns, by default None right_on: str or list or array-like, optional Column or index level names to join on in the right DataFrame. Can also be an array or list of arrays of the length of the right DataFrame. 
These arrays are treated as if they are columns, by default None left_index: bool, optional Use the index from the left DataFrame as the join key(s), by default False If it is a MultiIndex, the number of keys in the other DataFrame (either the index or a number of columns) must match the number of levels. right_index: bool, optional Use the index from the right DataFrame as the join key, by default False Same caveats as left_index. sort: bool, optional Sort the join keys lexicographically in the result DataFrame, by default False If False, the order of the join keys depends on the join type (how keyword). suffixes: tuple of (str, str), optional Suffix to apply to overlapping column names in the left and right side respectively. To raise an exception on overlapping columns use (False, False) by default (‘_x’, ‘_y’) copy: bool, optional If False, avoid copy if possible, by default True indicator: bool or str, optional If True, adds a column to output DataFrame called '_merge' with information on the source of each row. If string, column with information on source of each row will be added to output DataFrame, and column will be named value of string. Information column is Categorical-type and takes on a value of 'left_only' for observations whose merge key only appears in ‘left’ DataFrame, 'right_only' for observations whose merge key only appears in ‘right’ DataFrame, and 'both' if the observation’s merge key is found in both. by default False validate: str, optional If specified, checks if merge is of specified type, by default None 'one_to_one' or '1:1': check if merge keys are unique in both left and right datasets. 'one_to_many' or '1:m': check if merge keys are unique in left dataset. 'many_to_one' or 'm:1': check if merge keys are unique in right dataset. 'many_to_many' or 'm:m': allowed, but does not result in checks. Returns ------- PandasMoveDataFrame A DataFrame of the two merged objects. 
References ---------- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html?highlight=merge#pandas.DataFrame.merge """ if isinstance(right, PandasMoveDataFrame): right = right._data _merge = super().merge( right=right, how=how, on=on, left_on=left_on, right_on=right_on, left_index=left_index, right_index=right_index, sort=sort, suffixes=suffixes, copy=copy, indicator=indicator, validate=validate ) return PandasMoveDataFrame(data=_merge)
[docs] def write_file(self, file_name: str, separator: str = ','): """ Write trajectory data to a new file. Parameters ---------- file_name : str Represents the filename. separator : str, optional Represents the information separator in a new file, by default ',' """ self.to_csv( file_name, sep=separator, encoding='utf-8', index=False )
[docs] def convert_to( self, new_type: str ) -> MoveDataFrame | 'PandasMoveDataFrame' | 'DaskMoveDataFrame': """ Convert an object from one type to another specified by the user. Parameters ---------- new_type: 'pandas' or 'dask' The type for which the object will be converted. Returns ------- A subclass of MoveDataFrameAbstractModel The converted object. """ operation = begin_operation('convet_to') if new_type == TYPE_DASK: _dask = MoveDataFrame( self, latitude=LATITUDE, longitude=LONGITUDE, datetime=DATETIME, traj_id=TRAJ_ID, type_=TYPE_DASK, n_partitions=1, ) self.last_operation = end_operation(operation) return _dask else: self.last_operation = end_operation(operation) return self
[docs] def get_type(self) -> str: """ Returns the type of the object. Returns ------- str A string representing the type of the object. """ type_ = self._type return type_