# Source code for pymove.preprocessing.compression

"""
Compression operations.

compress_segment_stop_to_point

"""
from __future__ import annotations

import numpy as np
from pandas import DataFrame

from pymove.preprocessing.stay_point_detection import (
    create_or_update_move_stop_by_dist_time,
)
from pymove.utils.constants import (
    LAT_MEAN,
    LATITUDE,
    LON_MEAN,
    LONGITUDE,
    SEGMENT_STOP,
    STOP,
    TRAJ_ID,
)
from pymove.utils.log import logger, progress_bar, timer_decorator


@timer_decorator
def compress_segment_stop_to_point(
    move_data: DataFrame,
    label_segment: str = SEGMENT_STOP,
    label_stop: str = STOP,
    point_mean: str = 'default',
    drop_moves: bool = False,
    label_id: str = TRAJ_ID,
    dist_radius: float = 30,
    time_radius: float = 900,
    inplace: bool = False,
) -> DataFrame | None:
    """
    Compress the trajectories using the stop points in the dataframe.

    Every stop segment with more than one point is reduced to its first and
    last rows, and both rows receive the same representative coordinate in
    the lat_mean and lon_mean columns.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data. Index labels are assumed to be unique,
        since the discarded rows are removed by label.
    label_segment : String, optional
        The label of the column containing the ids of the formed segments.
        Is the new splitted id, by default SEGMENT_STOP
    label_stop : String, optional
        Is the name of the column that indicates if a point is a stop,
        by default STOP
    point_mean : String, optional
        How the representative point of a segment is computed:
        'default' uses the (lat, lon) pair that repeats the most within the
        segment, 'centroid' uses the mean of the latitudes and longitudes,
        by default 'default'
    drop_moves : Boolean, optional
        If set to true, the moving points will be dropped from the dataframe,
        by default False
    label_id : String, optional
        Used to create the stay points used in the compression.
        If the dataset already has the stop move, this
        parameter is ignored.
        Indicates the label of the id column in the user dataframe,
        by default TRAJ_ID
    dist_radius : Double, optional
        Used to create the stay points used in the compression, by default 30
        If the dataset already has the stop move, this
        parameter is ignored.
        The dist_radius defines the distance used in the segmentation.
    time_radius : Double, optional
        Used to create the stay points used in the compression, by default 900
        If the dataset already has the stop move, this
        parameter is ignored.
        If the user stayed in the segment for a time
        greater than time_radius, then the segment is a stop.
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame or None
        None when ``inplace`` is true. Otherwise the compressed data with the
        additional features lat_mean and lon_mean (plus the stop/segment
        columns when they had to be created):
            rows kept as moves have NaN in lat_mean and lon_mean;
            the first and last row of every stop segment hold the
            representative point of that segment;
            all the other stop rows (including stop segments made of a
            single point) are removed.

    Raises
    ------
    ValueError
        If ``point_mean`` is neither 'default' nor 'centroid'.

    """
    if point_mean not in ('default', 'centroid'):
        # Any other value used to discard every stop point silently.
        raise ValueError(
            f"point_mean must be 'default' or 'centroid', got {point_mean!r}"
        )

    if not inplace:
        move_data = move_data.copy()

    # The stop columns are only generated when neither of them is present.
    if label_segment not in move_data and label_stop not in move_data:
        create_or_update_move_stop_by_dist_time(
            move_data, dist_radius, time_radius, label_id, inplace=True
        )

    logger.debug('...setting mean to lat and lon...')
    n_rows = move_data.shape[0]
    lat_mean = np.full(n_rows, np.nan, dtype=np.float64)
    lon_mean = np.full(n_rows, np.nan, dtype=np.float64)

    # All bookkeeping below is positional, so the result does not depend on
    # the dataframe having a default 0..n-1 index.
    is_stop = move_data[label_stop].to_numpy(dtype=bool)

    # Explicit mask of surviving rows instead of a magic coordinate value,
    # so that a genuine mean of (-1.0, -1.0) is never mistaken for "drop".
    if drop_moves:
        logger.debug('...move segments will be dropped...')
        keep = np.zeros(n_rows, dtype=bool)
    else:
        keep = ~is_stop

    logger.debug('...get only segments stop...')
    segment_values = move_data[label_segment].to_numpy()
    segments = move_data.loc[is_stop, label_segment].unique()

    for idx in progress_bar(
        segments, desc=f'Generating {label_segment} and {label_stop}'
    ):
        positions = np.flatnonzero(segment_values == idx)
        if positions.size <= 1:
            logger.debug(f'There are segments with only one point: {idx}')
            continue

        segment = move_data.iloc[positions]
        if point_mean == 'default':
            # Most repeated coordinate pair; counting rows avoids relying on
            # a column literally named 'id'. The stable sort makes ties
            # resolve deterministically to the last pair in key order.
            counts = segment.groupby([LATITUDE, LONGITUDE]).size()
            lat, lon = counts.sort_values(kind='stable').index[-1]
        else:
            lat = segment[LATITUDE].mean()
            lon = segment[LONGITUDE].mean()

        # Only the first and the last point of the segment are preserved.
        ends = positions[[0, -1]]
        lat_mean[ends] = lat
        lon_mean[ends] = lon
        keep[ends] = True

    move_data[LAT_MEAN] = lat_mean
    move_data[LON_MEAN] = lon_mean
    del lat_mean
    del lon_mean

    labels_to_drop = move_data.index[~keep]
    if len(labels_to_drop) > 0:
        logger.debug('...Dropping %s points...', len(labels_to_drop))
        move_data.drop(labels_to_drop, inplace=True)

    logger.debug(
        '...Shape_before: %s\n...Current shape: %s',
        n_rows, move_data.shape[0]
    )

    if not inplace:
        return move_data
    return None