Source code for pymove.core.pandas_discrete

"""PandasDiscreteMoveDataFrame class."""
from __future__ import annotations

import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame

from pymove.core.grid import Grid
from pymove.core.pandas import PandasMoveDataFrame
from pymove.preprocessing.filters import clean_trajectories_with_few_points
from pymove.preprocessing.segmentation import (
    _drop_single_point,
    _prepare_segmentation,
    _update_curr_tid_count,
)
from pymove.utils.constants import (
    DATETIME,
    LATITUDE,
    LOCAL_LABEL,
    LONGITUDE,
    PREV_LOCAL,
    THRESHOLD,
    TID,
    TID_STAT,
    TIME_TO_PREV,
    TRAJ_ID,
)
from pymove.utils.datetime import generate_time_statistics, threshold_time_statistics
from pymove.utils.log import logger, progress_bar
from pymove.utils.mem import begin_operation, end_operation
from pymove.utils.trajectories import shift


class PandasDiscreteMoveDataFrame(PandasMoveDataFrame):
    """PyMove discrete dataframe extending PandasMoveDataFrame."""

    def __init__(
        self,
        data: DataFrame | list | dict,
        latitude: str = LATITUDE,
        longitude: str = LONGITUDE,
        datetime: str = DATETIME,
        traj_id: str = TRAJ_ID,
        local_label: str = LOCAL_LABEL
    ):
        """
        Creates a dataframe using local_label as a discrete feature for localization.

        Parameters
        ----------
        data : Union[DataFrame, List, Dict]
            Input trajectory data
        latitude : str, optional
            Represents column name latitude, by default LATITUDE
        longitude : str, optional
            Represents column name longitude, by default LONGITUDE
        datetime : str, optional
            Represents column name datetime, by default DATETIME
        traj_id : str, optional
            Represents column name trajectory id, by default TRAJ_ID
        local_label : str, optional
            Represents column name local label, by default LOCAL_LABEL

        Raises
        ------
        ValueError
            If the local_label column is not in the dataframe.
        KeyError, ValueError, ParserError
            Presumably raised by the parent constructor when one of the
            lat, lon, datetime columns is missing or the data types can't
            be converted (parent implementation not visible here).

        """
        super().__init__(
            data=data,
            latitude=latitude,
            longitude=longitude,
            datetime=datetime,
            traj_id=traj_id
        )

        if local_label not in self:
            raise ValueError(
                f'{local_label} column not in dataframe'
            )

    def discretize_based_grid(self, region_size: int = 1000):
        """
        Discrete space in cells of the same size, assigning a unique id to each cell.

        The dataframe is modified in place: a Grid is built from it, the grid
        index feature is created/updated on it and its index is reset
        (the previous index is dropped).

        Parameters
        ----------
        region_size: int, optional
            Size of grid cell, by default 1000

        """
        operation = begin_operation('discretize based on grid')
        logger.debug('\nDiscretizing dataframe...')
        grid = Grid(self, cell_size=region_size)
        grid.create_update_index_grid_feature(self)
        self.reset_index(drop=True, inplace=True)
        self.last_operation = end_operation(operation)

    def generate_prev_local_features(
        self,
        label_id: str = TRAJ_ID,
        local_label: str = LOCAL_LABEL,
        sort: bool = True,
        inplace: bool = True
    ) -> PandasDiscreteMoveDataFrame | None:
        """
        Create a feature prev_local with the label of previous local to current point.

        Parameters
        ----------
        label_id : str, optional
            Represents name of column of trajectory id, by default TRAJ_ID
        local_label : str, optional
            Indicates name of column of place labels on symbolic trajectory,
            by default LOCAL_LABEL
        sort : bool, optional
            Whether the dataframe will be sorted, by default True
        inplace : bool, optional
            Represents whether the operation will be performed on
            the data provided or in a copy, by default True

        Returns
        -------
        PandasDiscreteMoveDataFrame
            Object with new features or None

        """
        operation = begin_operation('generate_prev_equ_feature')

        data_ = self if inplace else self.copy()

        # Prepare the frame that is actually being modified; preparing `self`
        # while writing to a copy would leave the copy unprepared and mutate
        # the caller's data.
        ids, _, _ = self._prepare_generate_data(data_, sort, label_id)

        message = '\nCreating generate_prev_equ_feature in previous equ\n'
        logger.debug(message)

        # Integer labels are widened to float so the shifted column can hold
        # NaN. float64 is used (as in generate_tid_based_statistics) because
        # float16 cannot represent integer labels above 2048 exactly.
        if pd.api.types.is_integer_dtype(data_[local_label]):
            data_[local_label] = data_[local_label].astype(np.float64)

        for idx in progress_bar(
            ids, desc=f'Generating previous {local_label}'
        ):
            current_local = np.array(data_.at[idx, local_label])

            if current_local.size <= 1:
                # A trajectory with a single point has no previous local.
                data_.at[idx, PREV_LOCAL] = np.nan
            else:
                # previous to current point
                data_.at[idx, PREV_LOCAL] = shift(current_local, 1)

        data_.reset_index(inplace=True)
        data_.last_operation = end_operation(operation)

        if not inplace:
            return data_
        return None

    def generate_tid_based_statistics(
        self,
        label_id: str = TRAJ_ID,
        local_label: str = LOCAL_LABEL,
        mean_coef: float = 1.0,
        std_coef: float = 1.0,
        statistics: DataFrame | None = None,
        label_tid_stat: str = TID_STAT,
        drop_single_points: bool = False,
        inplace: bool = True,
    ) -> PandasDiscreteMoveDataFrame | None:
        """
        Splits the trajectories into segments based on time statistics for segments.

        Parameters
        ----------
        label_id : str, optional
            Represents name of column of trajectory id, by default TRAJ_ID
        local_label : str, optional
            Indicates name of column of place labels on symbolic trajectory,
            by default LOCAL_LABEL
        mean_coef : float, optional
            Multiplication coefficient of the mean time for the segment,
            by default 1.0
        std_coef : float, optional
            Multiplication coefficient of std time for the segment,
            by default 1.0
        statistics : DataFrame, optional
            Time Statistics of the pairwise local labels, by default None.
            When None, the statistics and their thresholds are computed
            from the data.
        label_tid_stat : str, optional
            The label of the column containing the ids of the formed segments.
            Is the new splitted id, by default TID_STAT
        drop_single_points : bool, optional
            Whether to drop the trajectories with only one point,
            by default False
        inplace : bool, optional
            Represents whether the operation will be performed on
            the data provided or in a copy, by default True

        Returns
        -------
        PandasDiscreteMoveDataFrame
            Object with new features or None

        Raises
        ------
        KeyError
            If missing local_label column
        ValueError
            If the data contains only null values

        """
        data_ = self if inplace else self.copy()

        # Validate before any feature generation mutates the data.
        if local_label not in data_:
            raise KeyError(f'{local_label} not in data frame.')

        # Every derived feature is generated on `data_` so that the
        # `inplace=False` path neither mutates `self` nor returns a copy
        # that lacks the required columns.
        if TIME_TO_PREV not in data_:
            data_.generate_dist_time_speed_features(TRAJ_ID)

        if PREV_LOCAL not in data_:
            data_[local_label] = data_[local_label].astype(np.float64)
            # NOTE(review): called through the class because `copy()` may
            # return the parent type, which lacks this method - confirm.
            PandasDiscreteMoveDataFrame.generate_prev_local_features(
                data_, label_id=label_id, local_label=local_label
            )

        if statistics is None:
            if data_[PREV_LOCAL].isna().all():
                raise ValueError(
                    f'all values in the {PREV_LOCAL} column are null.'
                )
            statistics = generate_time_statistics(data_, local_label=local_label)
            # NOTE(review): thresholds are only computed for statistics built
            # here; caller-supplied statistics are assumed to already carry
            # the THRESHOLD column - confirm.
            threshold_time_statistics(statistics, mean_coef, std_coef, inplace=True)

        clean_trajectories_with_few_points(
            data_, label_tid=label_id, min_points_per_trajectory=2, inplace=True
        )

        current_tid, ids, count = _prepare_segmentation(
            data_, label_id, label_tid_stat
        )

        for idx in progress_bar(ids, desc=f'Generating {label_tid_stat}'):
            md = pd.DataFrame(
                data_.loc[idx, [TIME_TO_PREV, local_label, PREV_LOCAL]]
            )

            filter_ = []
            for _, row in md.iterrows():
                threshold = statistics[
                    (statistics[local_label] == row[local_label])
                    & (statistics[PREV_LOCAL] == row[PREV_LOCAL])
                ][THRESHOLD].values
                # `threshold` is empty when the (local, prev_local) pair has
                # no statistics (e.g. a NaN previous local). Reducing to one
                # bool per row keeps the filter a flat boolean array instead
                # of a ragged array of arrays.
                filter_.append(bool((row[TIME_TO_PREV] > threshold).any()))

            filter_arr = np.array(filter_, dtype=bool)
            current_tid, count = _update_curr_tid_count(
                filter_arr, data_, idx, label_tid_stat, current_tid, count
            )

        if label_id == label_tid_stat:
            # The index shares its name with the segment id column, so it is
            # dropped instead of being restored as a column.
            data_.reset_index(drop=True, inplace=True)
            logger.debug(
                f'... {TID} = {TID_STAT}, then reseting and drop index!')
        else:
            data_.reset_index(inplace=True)
            logger.debug('... reseting index\n')

        if drop_single_points:
            _drop_single_point(data_, label_tid_stat, label_id)
            data_.generate_dist_time_speed_features()

        if not inplace:
            return data_
        return None