Source code for pymove.preprocessing.filters

Filtering operations.


from __future__ import annotations

from typing import TYPE_CHECKING, Any, Callable

import numpy as np
from pandas import DataFrame

from pymove.semantic.semantic import outliers
from pymove.utils.constants import (
from pymove.utils.log import logger

    from pymove.core.dask import DaskMoveDataFrame
    from pymove.core.pandas import PandasMoveDataFrame

def get_bbox_by_radius(
    coordinates: tuple[float, float], radius: float = 1000
) -> tuple[float, float, float, float]:
    """
    Defines minimum and maximum coordinates, given a distance radius from a point.

    Parameters
    ----------
    coordinates : tuple (lat, lon)
        The coordinates of the point, in degrees
    radius : float, optional
        Distance around the point, expressed in the same unit as the earth
        radius constant used here (6371000, i.e. meters), by default 1000

    Returns
    -------
    tuple
        (lat_min, lon_min, lat_max, lon_max) of the bbox, in degrees

    Notes
    -----
    When the radius reaches over a pole the longitude span is undefined
    (``arcsin`` would receive a value outside [-1, 1] and yield NaN). In that
    case the latitudes are clamped to [-90, 90] and the bbox covers every
    longitude, i.e. [-180, 180].

    """
    earth_radius = 6371000
    r = radius / earth_radius

    lat, lon = np.radians(coordinates)

    latmin = lat - r
    latmax = lat + r

    half_pi = np.pi / 2
    if latmin > -half_pi and latmax < half_pi:
        # regular case: sin(r) / cos(lat) is guaranteed to be inside [-1, 1]
        delta_lon = np.arcsin(np.sin(r) / np.cos(lat))
        lonmin = lon - delta_lon
        lonmax = lon + delta_lon
    else:
        # a pole lies within the radius: every longitude is reachable
        latmin = max(latmin, -half_pi)
        latmax = min(latmax, half_pi)
        lonmin, lonmax = -np.pi, np.pi

    return tuple(np.rad2deg([latmin, lonmin, latmax, lonmax]))  # type: ignore
def by_bbox(
    move_data: DataFrame,
    bbox: tuple[float, float, float, float],
    filter_out: bool = False,
    inplace: bool = False
) -> DataFrame | None:
    """
    Filters points of the trajectories according to a bounding box.

    Parameters
    ----------
    move_data : dataframe
        The input trajectories data
    bbox : tuple
        Tuple of 4 elements, in the order
        (lat_min, lon_min, lat_max, lon_max). Limits are inclusive.
    filter_out : boolean, optional
        If False the points inside the bounding box are kept,
        if True the points outside of it are kept, by default False
    inplace : boolean, optional
        If True the rows are dropped from the original dataframe,
        otherwise a filtered copy is returned, by default False

    Returns
    -------
    DataFrame
        Dataframe with the points filtered by bounding box,
        or None when ``inplace`` is True

    """
    latitudes = move_data[LATITUDE]
    longitudes = move_data[LONGITUDE]

    lat_in_range = (latitudes >= bbox[0]) & (latitudes <= bbox[2])
    lon_in_range = (longitudes >= bbox[1]) & (longitudes <= bbox[3])
    inside_bbox = lat_in_range & lon_in_range

    # rows to discard are the complement of the rows the caller asked to keep
    rows_to_discard = inside_bbox if filter_out else ~inside_bbox
    return move_data.drop(
        index=move_data[rows_to_discard].index, inplace=inplace
    )
def by_datetime(
    move_data: DataFrame,
    start_datetime: str | None = None,
    end_datetime: str | None = None,
    filter_out: bool = False,
    inplace: bool = False,
) -> DataFrame | None:
    """
    Filters trajectories points according to specified time range.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    start_datetime : str, optional
        Inclusive lower limit of the time range; no lower limit when None,
        by default None
    end_datetime : str, optional
        Inclusive upper limit of the time range; no upper limit when None,
        by default None
    filter_out : bool, optional
        If False the points within the time range are kept,
        if True the points outside of it are kept, by default False
    inplace : bool, optional
        If True the rows are dropped from the original dataframe,
        otherwise a filtered copy is returned, by default False

    Returns
    -------
    DataFrame
        Dataframe with the points filtered by time range,
        or None when ``inplace`` is True

    Notes
    -----
    When both limits are None the range is unbounded, so every point is
    considered to be within it.

    """
    timestamps = move_data[DATETIME]

    if start_datetime is not None and end_datetime is not None:
        filter_ = (
            (timestamps >= start_datetime)
            & (timestamps <= end_datetime)
        )
    elif end_datetime is not None:
        filter_ = timestamps <= end_datetime
    elif start_datetime is not None:
        filter_ = timestamps >= start_datetime
    else:
        # no limits given: avoid comparing the column against None
        filter_ = np.ones(len(move_data), dtype=bool)

    if filter_out:
        filter_ = ~filter_

    return move_data.drop(index=move_data[~filter_].index, inplace=inplace)
def by_label(
    move_data: DataFrame,
    value: Any,
    label_name: str,
    filter_out: bool = False,
    inplace: bool = False
) -> DataFrame | None:
    """
    Filters trajectories points according to specified value and column label.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    value : Any
        The value the column is compared against (equality)
    label_name : str
        The label of the column used in the filtering
    filter_out : bool, optional
        If False the points whose column equals ``value`` are kept,
        if True the points whose column differs are kept, by default False
    inplace : bool, optional
        If True the rows are dropped from the original dataframe,
        otherwise a filtered copy is returned, by default False

    Returns
    -------
    DataFrame
        Dataframe with the points filtered by label,
        or None when ``inplace`` is True

    """
    matches = move_data[label_name] == value

    # rows to discard are the complement of the rows the caller asked to keep
    rows_to_discard = matches if filter_out else ~matches
    return move_data.drop(
        index=move_data[rows_to_discard].index, inplace=inplace
    )
def by_id(
    move_data: DataFrame,
    id_: int | None = None,
    label_id: str = TRAJ_ID,
    filter_out: bool = False,
    inplace: bool = False
) -> DataFrame | None:
    """
    Filters trajectories points according to specified trajectory id.

    Thin wrapper around ``by_label`` using the id column.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    id_ : int
        The id used to filter the trajectories points
    label_id : str, optional
        The label of the column which contains the id of the trajectories,
        by default TRAJ_ID
    filter_out : bool, optional
        If False the points whose id equals ``id_`` are kept,
        if True the points with any other id are kept, by default False
    inplace : bool, optional
        If True the rows are dropped from the original dataframe,
        otherwise a filtered copy is returned, by default False

    Returns
    -------
    DataFrame
        Dataframe with the points filtered by id,
        or None when ``inplace`` is True

    """
    return by_label(
        move_data,
        value=id_,
        label_name=label_id,
        filter_out=filter_out,
        inplace=inplace,
    )
def by_tid(
    move_data: DataFrame,
    tid_: str | None = None,
    filter_out: bool = False,
    inplace: bool = False
) -> DataFrame | None:
    """
    Filters trajectories points according to a specified trajectory tid.

    Thin wrapper around ``by_label`` using the TID column.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    tid_ : str
        The tid used to filter the trajectories points
    filter_out : bool, optional
        If False the points whose tid equals ``tid_`` are kept,
        if True the points with any other tid are kept, by default False
    inplace : bool, optional
        If True the rows are dropped from the original dataframe,
        otherwise a filtered copy is returned, by default False

    Returns
    -------
    DataFrame
        Dataframe with the points filtered by tid,
        or None when ``inplace`` is True

    """
    return by_label(
        move_data,
        value=tid_,
        label_name=TID,
        filter_out=filter_out,
        inplace=inplace,
    )
def clean_consecutive_duplicates(
    move_data: DataFrame,
    subset: int | str | None = None,
    keep: str | bool = 'first',
    inplace: bool = False
) -> DataFrame | None:
    """
    Removes consecutive duplicate rows of the Dataframe.

    Optionally only certain columns can be considered.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    subset : label or sequence of labels, optional
        Column label or sequence of labels considered for identifying
        duplicates; all columns are used when None, by default None
    keep : 'first', 'last' or False, optional
        - 'first' : within each run of consecutive duplicates only the
          first row is kept.
        - 'last' : within each run only the last row is kept.
        - False : every row that belongs to a run of duplicates is dropped.
        by default 'first'
    inplace : boolean, optional
        If True the duplicates are dropped from the original dataframe,
        otherwise a filtered copy is returned, by default False

    Returns
    -------
    DataFrame
        The trajectories points without consecutive duplicates,
        or None when ``inplace`` is True

    Notes
    -----
    Rows are compared with ``!=``, so a row containing NaN in a compared
    column is never considered equal to its neighbour.

    """
    if subset is None:
        data = move_data
    else:
        # always select a 2-d frame: a single label would yield a Series,
        # for which `.any(axis=1)` is not defined
        columns = [subset] if isinstance(subset, (str, int)) else list(subset)
        data = move_data[columns]

    def _differs_from_neighbour(offset: int):
        # offset 1 compares with the previous row, -1 with the next one
        return (data.shift(offset) != data).any(axis=1)

    if keep is False:
        # a row survives only if it is not part of any run of duplicates
        filter_ = _differs_from_neighbour(1) & _differs_from_neighbour(-1)
    elif keep == 'first':
        filter_ = _differs_from_neighbour(1)
    else:
        filter_ = _differs_from_neighbour(-1)

    return move_data.drop(index=move_data[~filter_].index, inplace=inplace)
def _filter_single_by_max(move_data: DataFrame, **kwargs):
    """
    Selects the rows whose feature is lower than or equal to a value.

    Parameters
    ----------
    move_data : dataframe
        Dataframe to be filtered.
    **kwargs : arguments
        - arg1 : feature (column label)
        - arg2 : value
        Any other keyword is ignored.

    Returns
    -------
    DataFrame
        Rows with ``move_data[arg1] <= arg2``.

    """
    return move_data[move_data[kwargs['arg1']] <= kwargs['arg2']]


def _filter_speed_max_radius(move_data: DataFrame, **kwargs):
    """
    Selects the rows with current or previous row features exceeding value.

    Parameters
    ----------
    move_data : dataframe
        Dataframe to be filtered.
    **kwargs : arguments
        - arg1 : feature (column label)
        - arg2 : value
        Any other keyword is ignored.

    Returns
    -------
    DataFrame
        Rows whose feature, or whose previous row feature, is greater
        than the value. Missing values are treated as 0.

    """
    filter_ = (
        (np.nan_to_num(move_data[kwargs['arg1']].shift(1)) > kwargs['arg2'])
        | (np.nan_to_num(move_data[kwargs['arg1']]) > kwargs['arg2'])
    )
    return move_data[filter_]


def _filter_data(move_data: DataFrame, f: Callable, kwargs: dict):
    """
    Filter the dataframe using condition from given function.

    Parameters
    ----------
    move_data : dataframe
        Dataframe to be filtered.
    f : function
        Filtering function
    kwargs : dict
        - arg1 : feature
        - arg2 : value
        - outliers : special behavior if cleaning by outliers; ``f`` is then
          called with ``jump_coefficient=arg1`` and ``threshold=arg2`` and
          only the rows flagged in the OUTLIER column are selected

    Returns
    -------
    dataframe
        Rows selected by the filtering function, i.e. the rows to be dropped.
    int
        Number of rows to be dropped

    """
    if kwargs['outliers']:
        filter_data_points = f(
            move_data,
            jump_coefficient=kwargs['arg1'],
            threshold=kwargs['arg2'],
            inplace=False
        )
        filter_data_points = filter_data_points[filter_data_points[OUTLIER]]
    else:
        filter_data_points = f(
            move_data,
            arg1=kwargs['arg1'],
            arg2=kwargs['arg2'],
            inplace=False
        )
    rows_to_drop = filter_data_points.shape[0]
    return filter_data_points, rows_to_drop


def _clean_gps(move_data: DataFrame, f: Callable, **kwargs):
    """
    Cleans gps points from a dataframe using condition from given function.

    The dataframe is modified in place: rows selected by the filtering
    function are dropped repeatedly until the function selects no row.

    Parameters
    ----------
    move_data : dataframe
        Dataframe to be filtered.
    f : function
        Filtering function
    **kwargs : arguments
        - arg1 : feature
        - arg2 : value
        - outliers : special behavior if cleaning by outliers

    Returns
    -------
    dataframe
        Filtered dataframe (the same object received as input).

    """
    # a named index is moved back to a column so rows are dropped by position
    if move_data.index.name is not None:
        logger.debug('...Reset index for filtering\n')
        move_data.reset_index(inplace=True)

    filter_data_points, rows_to_drop = _filter_data(move_data, f, kwargs)

    sum_drop = 0
    while rows_to_drop > 0:
        logger.debug('...Dropping %s rows of gps points\n' % rows_to_drop)
        shape_before = move_data.shape[0]
        move_data.drop(index=filter_data_points.index, inplace=True)
        sum_drop = sum_drop + rows_to_drop
        logger.debug(
            '...Rows before: %s, Rows after:%s, Sum drop:%s\n'
            % (shape_before, move_data.shape[0], sum_drop)
        )
        # dropping rows may expose new candidates, so filter again
        filter_data_points, rows_to_drop = _filter_data(move_data, f, kwargs)

    logger.debug('%s GPS points were dropped' % sum_drop)

    return move_data
def clean_gps_jumps_by_distance(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TRAJ_ID,
    jump_coefficient: float = 3.0,
    threshold: float = 1,
    label_dtype: Callable = np.float64,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Removes the trajectories points that are outliers from the dataframe.

    The points flagged in the OUTLIER column by ``outliers`` are dropped,
    repeatedly, until no point is flagged anymore.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_id : str, optional
        Indicates the label of the id column in the user dataframe,
        by default TRAJ_ID
    jump_coefficient : float, optional
        Forwarded to ``outliers`` as ``jump_coefficient``, by default 3
    threshold : float, optional
        Minimum value that the distance features must have
        in order to be considered outliers, by default 1
    label_dtype : type, optional
        Represents column id type, by default np.float64.
    inplace : boolean, optional
        If True the operation is done in place, the original dataframe
        is altered and None is returned, by default False

    Returns
    -------
    DataFrame
        The filtered trajectories without the gps jumps,
        or None when ``inplace`` is True

    """
    target = move_data if inplace else move_data.copy()

    # distance features are required by the outlier detection
    if DIST_TO_PREV not in target:
        target.generate_dist_features(
            label_id=label_id, label_dtype=label_dtype
        )

    logger.debug(
        '\nCleaning gps jumps by distance to jump_coefficient %s...\n'
        % jump_coefficient
    )

    target = _clean_gps(
        target,
        outliers,
        arg1=jump_coefficient,
        arg2=threshold,
        outliers=True
    )

    return None if inplace else target
def clean_gps_nearby_points_by_distances(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TRAJ_ID,
    radius_area: float = 10.0,
    label_dtype: Callable = np.float64,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Removes points from the trajectories with smaller distance from the point before.

    A point is dropped when its DIST_TO_PREV is lower than or equal to
    ``radius_area``.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_id : str, optional
        Indicates the label of the id column in the user dataframe,
        by default TRAJ_ID
    radius_area : float, optional
        Distance a point must exceed, relative to its previous point,
        in order not to be dropped, by default 10
    label_dtype : type, optional
        Represents column id type, by default np.float64.
    inplace : boolean, optional
        If True the operation is done in place, the original dataframe
        is altered and None is returned, by default False

    Returns
    -------
    DataFrame
        The filtered trajectories without the gps nearby points by distance,
        or None when ``inplace`` is True

    """
    target = move_data if inplace else move_data.copy()

    # the filter is based on the distance to the previous point
    if DIST_TO_PREV not in target:
        target.generate_dist_features(
            label_id=label_id, label_dtype=label_dtype
        )

    logger.debug(
        '\nCleaning gps points from radius of %s meters\n'
        % radius_area
    )

    target = _clean_gps(
        target,
        _filter_single_by_max,
        arg1=DIST_TO_PREV,
        arg2=radius_area,
        outliers=False
    )

    return None if inplace else target
def clean_gps_nearby_points_by_speed(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TRAJ_ID,
    speed_radius: float = 0.0,
    label_dtype: Callable = np.float64,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Removes points from the trajectories with smaller speed of travel.

    A point is dropped when its SPEED_TO_PREV is lower than or equal to
    ``speed_radius``.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_id : str, optional
        Indicates the label of the id column in the user dataframe,
        by default TRAJ_ID
    speed_radius : float, optional
        Speed a point must exceed, relative to its previous point,
        in order not to be dropped, by default 0
    label_dtype : type, optional
        Represents column id type, by default np.float64.
    inplace : boolean, optional
        If True the operation is done in place, the original dataframe
        is altered and None is returned, by default False

    Returns
    -------
    DataFrame
        The filtered trajectories without the gps nearby points by speed,
        or None when ``inplace`` is True

    """
    target = move_data if inplace else move_data.copy()

    # the filter is based on the speed from the previous point
    if SPEED_TO_PREV not in target:
        target.generate_dist_time_speed_features(
            label_id=label_id, label_dtype=label_dtype
        )

    logger.debug(
        '\nCleaning gps points using %s speed radius\n'
        % speed_radius
    )

    target = _clean_gps(
        target,
        _filter_single_by_max,
        arg1=SPEED_TO_PREV,
        arg2=speed_radius,
        outliers=False
    )

    return None if inplace else target
def clean_gps_speed_max_radius(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TRAJ_ID,
    speed_max: float = 50.0,
    label_dtype: Callable = np.float64,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Removes trajectories points with higher speed.

    A point is selected for removal by ``_filter_speed_max_radius``: when
    its own SPEED_TO_PREV, or the SPEED_TO_PREV of the row right before it,
    is greater than ``speed_max``. The selection and removal are repeated
    until no point is selected anymore.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_id : str, optional
        Indicates the label of the id column in the user dataframe,
        by default TRAJ_ID
    speed_max : float, optional
        Indicates the maximum value of speed a point may have
        in order not to be dropped, by default 50
    label_dtype : type, optional
        Represents column id type, by default np.float64.
    inplace : boolean, optional
        If True the operation is done in place, the original dataframe
        is altered and None is returned, by default False

    Returns
    -------
    DataFrame
        The filtered trajectories without the points above the speed limit,
        or None when ``inplace`` is True

    """
    target = move_data if inplace else move_data.copy()

    # the filter is based on the speed from the previous point
    if SPEED_TO_PREV not in target:
        target.generate_dist_time_speed_features(
            label_id=label_id, label_dtype=label_dtype
        )

    logger.debug(
        '\nClean gps points with speed max > %s meters by seconds' % speed_max
    )

    target = _clean_gps(
        target,
        _filter_speed_max_radius,
        arg1=SPEED_TO_PREV,
        arg2=speed_max,
        outliers=False
    )

    return None if inplace else target
def clean_trajectories_with_few_points(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_tid: str = TID,
    min_points_per_trajectory: int = 2,
    inplace: bool = False
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Removes from the given dataframe, trajectories with fewer points.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_tid : str, optional
        The label of the column which contains the tid of the trajectories,
        by default TID
    min_points_per_trajectory: int, optional
        Specifies the minimum number of points a trajectory must have
        in order not to be dropped, by default 2
    inplace : boolean, optional
        If True the operation is done in place, the original dataframe
        is altered and None is returned, by default False

    Returns
    -------
    DataFrame
        The filtered trajectories with at least the minimum number of
        gps points, or None when ``inplace`` is True

    Raises
    ------
    KeyError
        If the label_tid column is not in the dataframe

    """
    if not inplace:
        move_data = move_data.copy()

    if label_tid not in move_data:
        raise KeyError('%s not in dataframe' % label_tid)

    logger.debug(
        '\nCleaning gps points from trajectories of fewer than %s points\n'
        % min_points_per_trajectory
    )

    # a named index is moved back to a column before rows are dropped
    if move_data.index.name is not None:
        logger.debug('\n...Reset index for filtering\n')
        move_data.reset_index(inplace=True)

    # number of points of each trajectory
    move_datacount_tid = move_data.groupby(by=label_tid).size()
    filter_ = move_datacount_tid < min_points_per_trajectory
    tids_with_few_points = move_datacount_tid[filter_].index
    shape_before_drop = move_data.shape
    idx = move_data[move_data[label_tid].isin(tids_with_few_points)].index

    if idx.shape[0] > 0:
        logger.debug(
            '\n...There are %s ids with few points'
            % tids_with_few_points.shape[0]
        )
        logger.debug(
            '\n...Tids before drop: %s'
            % move_data[label_tid].unique().shape[0]
        )
        move_data.drop(index=idx, inplace=True)
        logger.debug(
            '\n...Tids after drop: %s'
            % move_data[label_tid].unique().shape[0]
        )
        logger.debug(
            '\n...Shape - before drop: %s - after drop: %s'
            % (shape_before_drop, move_data.shape)
        )

    if not inplace:
        return move_data
def clean_trajectories_short_and_few_points(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TID,
    min_trajectory_distance: float = 100,
    min_points_per_trajectory: int = 2,
    label_dtype: Callable = np.float64,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Eliminates from the given dataframe trajectories with fewer points and shorter length.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    label_id : str, optional
        The label of the column which contains the tid of the trajectories,
        by default TID
    min_trajectory_distance: float, optional
        Specifies the minimum length (sum of DIST_TO_PREV) a trajectory
        must have in order not to be dropped, by default 100
    min_points_per_trajectory: int, optional
        Specifies the minimum number of points a trajectory must have
        in order not to be dropped, by default 2
    label_dtype : type, optional
        Represents column id type, by default np.float64.
    inplace: boolean, optional
        If True the operation is done in place, the original dataframe
        is altered and None is returned, by default False

    Returns
    -------
    DataFrame
        The filtered trajectories with the minimum gps points and distance,
        or None when ``inplace`` is True

    Notes
    -----
    remove_tids_with_few_points must be performed before updating features.

    """
    if not inplace:
        move_data = move_data.copy()

    logger.debug('\nRemove short trajectories...')
    # trajectories with too few points are removed first, in place
    clean_trajectories_with_few_points(
        move_data, label_id, min_points_per_trajectory, inplace=True
    )

    if DIST_TO_PREV not in move_data:
        move_data.generate_dist_features(
            label_id=label_id, label_dtype=label_dtype
        )

    logger.debug('\n...Dropping unnecessary trajectories...')

    # a named index is moved back to a column before rows are dropped
    if move_data.index.name is not None:
        logger.debug('reseting index')
        move_data.reset_index(inplace=True)

    # total travelled distance of each trajectory
    move_dataagg_tid = move_data.groupby(by=label_id).agg(
        {DIST_TO_PREV: 'sum'}
    )

    filter_ = move_dataagg_tid[DIST_TO_PREV] < min_trajectory_distance
    tid_selection = move_dataagg_tid[filter_].index

    logger.debug(
        '\n...short trajectories and trajectories with a minimum distance (%s): %s'
        % (move_dataagg_tid.shape[0], min_trajectory_distance)
    )
    logger.debug('\n...There are %s tid do drop' % tid_selection.shape[0])
    shape_before_drop = move_data.shape
    idx = move_data[move_data[label_id].isin(tid_selection)].index

    if idx.shape[0] > 0:
        tids_before_drop = move_data[label_id].unique().shape[0]
        move_data.drop(index=idx, inplace=True)
        # logged after the drop so the "after" count reflects the removal
        logger.debug(
            '\n...Tids - before drop: %s - after drop: %s'
            % (tids_before_drop, move_data[label_id].unique().shape[0])
        )
        logger.debug(
            '\n...Shape - before drop: %s - after drop: %s'
            % (shape_before_drop, move_data.shape)
        )

    if not inplace:
        return move_data
def clean_id_by_time_max(
    move_data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_id: str = TRAJ_ID,
    time_max: float = 3600,
    label_dtype: Callable = np.float64,
    inplace: bool = False,
) -> 'PandasMoveDataFrame' | 'DaskMoveDataFrame' | None:
    """
    Clears GPS points of the ids whose total time is below a user-defined limit.

    The TIME_TO_PREV values are summed for each id; every point of the ids
    whose sum is lower than ``time_max`` is dropped.

    Parameters
    ----------
    move_data: dataframe.
        The input data.
    label_id: str, optional
        The label of the column which contains the id of the trajectories,
        by default TRAJ_ID
    time_max: float, optional
        Total time the points of an id must reach
        in order not to be dropped, by default 3600
    label_dtype : type, optional
        Represents column id type, by default np.float64.
    inplace : boolean, optional
        If True the operation is done in place, the original dataframe
        is altered and None is returned, by default False

    Returns
    -------
    dataframe or None
        The filtered trajectories, or None when ``inplace`` is True

    """
    target = move_data if inplace else move_data.copy()

    if TIME_TO_PREV not in target:
        target.generate_dist_time_speed_features(
            label_id=label_id, label_dtype=label_dtype
        )

    logger.debug(
        '\nClean gps points with time max by id < %s seconds' % time_max
    )

    # summed time of each id, then only the ids below the limit
    time_per_id = target.groupby([label_id], as_index=False).agg(
        {TIME_TO_PREV: 'sum'}
    )
    ids_to_drop = time_per_id.query(f'{TIME_TO_PREV} < {time_max}')

    logger.debug(
        '...Ids total: %s\nIds to drop:%s'
        % (
            target[label_id].nunique(),
            ids_to_drop[label_id].nunique()
        )
    )

    if ids_to_drop.shape[0] > 0:
        rows_before = target.shape[0]
        belongs_to_dropped_id = target[label_id].isin(ids_to_drop[label_id])
        target.drop(target[belongs_to_dropped_id].index, inplace=True)
        logger.debug(
            '...Rows before drop: %s\n Rows after drop: %s'
            % (rows_before, target.shape[0])
        )

    return None if inplace else target