Source code for pymove.preprocessing.compression

"""
Compression operations.

compress_segment_stop_to_point

"""
from __future__ import annotations

import numpy as np
from pandas import DataFrame

from pymove.preprocessing.stay_point_detection import (
    create_or_update_move_stop_by_dist_time,
)
from pymove.utils.constants import (
    LAT_MEAN,
    LATITUDE,
    LON_MEAN,
    LONGITUDE,
    SEGMENT_STOP,
    STOP,
    TRAJ_ID,
)
from pymove.utils.log import logger, progress_bar, timer_decorator


@timer_decorator
def compress_segment_stop_to_point(
    move_data: DataFrame,
    label_segment: str = SEGMENT_STOP,
    label_stop: str = STOP,
    point_mean: str = 'default',
    drop_moves: bool = False,
    label_id: str = TRAJ_ID,
    dist_radius: float = 30,
    time_radius: float = 900,
    inplace: bool = False,
) -> DataFrame | None:
    """
    Compress the trajectories using the stop points in the dataframe.

    Every stop segment with more than one point is reduced to its first and
    last rows, and both rows receive the same representative coordinate in
    the ``LAT_MEAN`` and ``LON_MEAN`` columns. All other rows of a stop
    segment are removed.

    Parameters
    ----------
    move_data : DataFrame
        The input trajectory data.
    label_segment : str, optional
        Name of the column holding the ids of the segments,
        by default SEGMENT_STOP
    label_stop : str, optional
        Name of the boolean column that flags whether a point is a stop,
        by default STOP
    point_mean : str, optional
        How the representative coordinate of a segment is computed:
        ``'default'`` uses the (lat, lon) pair that repeats the most inside
        the segment, ``'centroid'`` uses the mean latitude and the mean
        longitude of the segment, by default 'default'
    drop_moves : bool, optional
        If true, the moving points are dropped from the result; otherwise
        they are kept with ``NaN`` in ``LAT_MEAN``/``LON_MEAN``,
        by default False
    label_id : str, optional
        Name of the id column, forwarded to the stop detection step. Only
        used when the stop/segment columns have to be created,
        by default TRAJ_ID
    dist_radius : float, optional
        Distance radius forwarded to the stop detection step. Only used when
        the stop/segment columns have to be created, by default 30
    time_radius : float, optional
        Time radius forwarded to the stop detection step. Only used when the
        stop/segment columns have to be created, by default 900
    inplace : bool, optional
        If true the original dataframe is altered, otherwise a copy is
        processed and returned, by default False

    Returns
    -------
    DataFrame or None
        The compressed data with the additional ``LAT_MEAN`` and ``LON_MEAN``
        columns when ``inplace`` is false, otherwise None.

    Raises
    ------
    ValueError
        If ``point_mean`` is neither ``'default'`` nor ``'centroid'``.

    Notes
    -----
    Stop segments that contain a single point are not compressed and their
    point is removed from the result.

    """
    if point_mean not in ('default', 'centroid'):
        # An unknown option used to be silently ignored, which left every
        # stop segment unfilled and therefore dropped all stop points.
        raise ValueError(
            "point_mean must be 'default' or 'centroid', "
            f'got {point_mean!r}'
        )

    if not inplace:
        move_data = move_data.copy()

    # Both columns are read below, so stop detection has to run as soon as
    # either one is missing (not only when both are).
    if label_segment not in move_data or label_stop not in move_data:
        create_or_update_move_stop_by_dist_time(
            move_data, dist_radius, time_radius, label_id, inplace=True
        )

    logger.debug('...setting mean to lat and lon...')
    n_rows = move_data.shape[0]
    lat_mean = np.full(n_rows, np.nan, dtype=np.float64)
    lon_mean = np.full(n_rows, np.nan, dtype=np.float64)

    # Rows are tracked by position with an explicit mask. This works for any
    # dataframe index and avoids a sentinel coordinate that could collide
    # with real data.
    is_stop = move_data[label_stop].to_numpy(dtype=bool)
    keep = np.zeros(n_rows, dtype=bool)

    if not drop_moves:
        # moving points stay in the result with NaN as mean coordinates
        keep[~is_stop] = True
    else:
        logger.debug('...move segments will be dropped...')

    logger.debug('...get only segments stop...')
    segments = move_data.loc[is_stop, label_segment].unique()

    for idx in progress_bar(
        segments, desc=f'Generating {label_segment} and {label_stop}'
    ):
        in_segment = (move_data[label_segment] == idx).to_numpy()
        positions = np.flatnonzero(in_segment)

        if positions.size <= 1:
            logger.debug('There are segments with only one point: %s', idx)
            continue

        segment = move_data.iloc[positions]

        if point_mean == 'default':
            # Most repeated (lat, lon) pair. Counting rows with size() does
            # not depend on the name of any other column.
            most_frequent = (
                segment.groupby([LATITUDE, LONGITUDE])
                .size()
                .reset_index(name='count')
                .sort_values('count')
                .tail(1)
            )
            lat = most_frequent.iloc[0, 0]
            lon = most_frequent.iloc[0, 1]
        else:
            lat = segment[LATITUDE].mean()
            lon = segment[LONGITUDE].mean()

        # only the first and the last point of the segment are kept
        endpoints = positions[[0, -1]]
        lat_mean[endpoints] = lat
        lon_mean[endpoints] = lon
        keep[endpoints] = True

    move_data[LAT_MEAN] = lat_mean
    move_data[LON_MEAN] = lon_mean

    shape_before = move_data.shape[0]
    shape_drop = int(n_rows - keep.sum())

    if shape_drop > 0:
        logger.debug('...Dropping %s points...', shape_drop)
        # NOTE(review): drop is label based so it can run in place; this
        # assumes the index labels are unique.
        move_data.drop(index=move_data.index[~keep], inplace=True)

    logger.debug(
        '...Shape_before: %s\n...Current shape: %s',
        shape_before,
        move_data.shape[0],
    )

    if not inplace:
        return move_data
    return None