Source code for pymove.utils.integration

"""
Integration operations.

union_poi_bank,
union_poi_bus_station,
union_poi_bar_restaurant,
union_poi_parks,
union_poi_police,
join_collective_areas,
join_with_pois,
join_with_pois_by_category,
join_with_events,
join_with_event_by_dist_and_time,
join_with_home_by_id,
merge_home_with_poi

"""
from __future__ import annotations

from collections import namedtuple

import numpy as np
from numpy import ndarray
from pandas import DataFrame, Timedelta
from pandas.core.series import Series

from pymove.preprocessing import filters
from pymove.utils.constants import (
    ADDRESS,
    CITY,
    DATETIME,
    DIST_EVENT,
    DIST_HOME,
    DIST_POI,
    EVENT_ID,
    EVENT_TYPE,
    GEOMETRY,
    HOME,
    ID_POI,
    LATITUDE,
    LONGITUDE,
    NAME_POI,
    TRAJ_ID,
    TYPE_POI,
    VIOLATING,
)
from pymove.utils.distances import haversine
from pymove.utils.log import logger, progress_bar
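
# ---------------------------------------------------------------------------
# Illustrative end-to-end sketch (not part of the module): a minimal flow
# chaining the public functions defined below. The DataFrames ``pois`` and
# ``home`` and the trajectory frame ``move_df`` are hypothetical; column
# names follow the defaults from ``pymove.utils.constants``.
#
# >>> from pymove.utils.integration import (
# ...     union_poi_bank, join_with_pois, join_with_home_by_id,
# ...     merge_home_with_poi,
# ... )
# >>> union_poi_bank(pois, inplace=True)          # unify bank categories
# >>> enriched = join_with_pois(move_df, pois)    # nearest POI per point
# >>> enriched = join_with_home_by_id(enriched, home)
# >>> merge_home_with_poi(enriched, inplace=True)  # treat home as a POI
# ---------------------------------------------------------------------------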


def union_poi_bank(
    data: DataFrame,
    label_poi: str = TYPE_POI,
    banks: list[str] | None = None,
    inplace: bool = False
) -> DataFrame | None:
    """
    Performs the union between the different bank categories.

    For Points of Interest in a single category named 'banks'.

    Parameters
    ----------
    data : DataFrame
        Input points of interest data
    label_poi : str, optional
        Label referring to the Point of Interest category, by default TYPE_POI
    banks : list of str, optional
        Names of poi referring to banks, by default
        banks = [
            'bancos_filiais',
            'bancos_agencias',
            'bancos_postos',
            'bancos_PAE',
            'bank',
        ]
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        data with poi or None

    Examples
    --------
    >>> from pymove.utils.integration import union_poi_bank
    >>> pois_df
             lat          lon  id         type_poi
    0  39.984094   116.319236   1             bank
    1  39.984198   116.319322   2      randomvalue
    2  39.984224   116.319402   3    bancos_postos
    3  39.984211   116.319389   4      randomvalue
    4  39.984217   116.319422   5       bancos_PAE
    5  39.984710   116.319865   6    bancos_postos
    6  39.984674   116.319810   7  bancos_agencias
    7  39.984623   116.319773   8   bancos_filiais
    8  39.984606   116.319732   9            banks
    9  39.984555   116.319728  10            banks
    >>> union_poi_bank(pois_df)
             lat          lon  id     type_poi
    0  39.984094   116.319236   1        banks
    1  39.984198   116.319322   2  randomvalue
    2  39.984224   116.319402   3        banks
    3  39.984211   116.319389   4  randomvalue
    4  39.984217   116.319422   5        banks
    5  39.984710   116.319865   6        banks
    6  39.984674   116.319810   7        banks
    7  39.984623   116.319773   8        banks
    8  39.984606   116.319732   9        banks
    9  39.984555   116.319728  10        banks
    """
    if not inplace:
        data = data.copy()
    logger.debug('union bank categories to one category')
    logger.debug(f'... There are {data[label_poi].nunique()} -- {label_poi}')
    if banks is None:
        banks = [
            'bancos_filiais',
            'bancos_agencias',
            'bancos_postos',
            'bancos_PAE',
            'bank',
        ]
    filter_bank = data[label_poi].isin(banks)
    data.at[data[filter_bank].index, label_poi] = 'banks'
    if not inplace:
        return data


def union_poi_bus_station(
    data: DataFrame,
    label_poi: str = TYPE_POI,
    bus_stations: list[str] | None = None,
    inplace: bool = False
) -> DataFrame | None:
    """
    Performs the union between the different bus station categories.

    For Points of Interest in a single category named 'bus_station'.

    Parameters
    ----------
    data : DataFrame
        Input points of interest data
    label_poi : str, optional
        Label referring to the Point of Interest category, by default TYPE_POI
    bus_stations : list of str, optional
        Names of poi referring to bus stations, by default
        bus_stations = [
            'transit_station',
            'pontos_de_onibus'
        ]
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        data with poi or None

    Examples
    --------
    >>> from pymove.utils.integration import union_poi_bus_station
    >>> pois_df
             lat          lon  id          type_poi
    0  39.984094   116.319236   1   transit_station
    1  39.984198   116.319322   2       randomvalue
    2  39.984224   116.319402   3   transit_station
    3  39.984211   116.319389   4  pontos_de_onibus
    4  39.984217   116.319422   5   transit_station
    5  39.984710   116.319865   6       randomvalue
    6  39.984674   116.319810   7       bus_station
    7  39.984623   116.319773   8       bus_station
    >>> union_poi_bus_station(pois_df)
             lat          lon  id     type_poi
    0  39.984094   116.319236   1  bus_station
    1  39.984198   116.319322   2  randomvalue
    2  39.984224   116.319402   3  bus_station
    3  39.984211   116.319389   4  bus_station
    4  39.984217   116.319422   5  bus_station
    5  39.984710   116.319865   6  randomvalue
    6  39.984674   116.319810   7  bus_station
    7  39.984623   116.319773   8  bus_station
    """
    if not inplace:
        data = data.copy()
    logger.debug('union bus station categories to one category')
    if bus_stations is None:
        bus_stations = [
            'transit_station',
            'pontos_de_onibus'
        ]
    filter_bus_station = data[label_poi].isin(bus_stations)
    data.at[data[filter_bus_station].index, label_poi] = 'bus_station'
    if not inplace:
        return data


def union_poi_bar_restaurant(
    data: DataFrame,
    label_poi: str = TYPE_POI,
    bar_restaurant: list[str] | None = None,
    inplace: bool = False
) -> DataFrame | None:
    """
    Performs the union between bar and restaurant categories.

    For Points of Interest in a single category named 'bar-restaurant'.

    Parameters
    ----------
    data : DataFrame
        Input points of interest data
    label_poi : str, optional
        Label referring to the Point of Interest category, by default TYPE_POI
    bar_restaurant : list of str, optional
        Names of poi referring to bars or restaurants, by default
        bar_restaurant = [
            'restaurant',
            'bar'
        ]
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        data with poi or None

    Examples
    --------
    >>> from pymove.utils.integration import union_poi_bar_restaurant
    >>> pois_df
             lat          lon  id        type_poi
    0  39.984094   116.319236   1      restaurant
    1  39.984198   116.319322   2      restaurant
    2  39.984224   116.319402   3     randomvalue
    3  39.984211   116.319389   4             bar
    4  39.984217   116.319422   5             bar
    5  39.984710   116.319865   6  bar-restaurant
    6  39.984674   116.319810   7       random123
    7  39.984623   116.319773   8             123
    >>> union_poi_bar_restaurant(pois_df)
             lat          lon  id        type_poi
    0  39.984094   116.319236   1  bar-restaurant
    1  39.984198   116.319322   2  bar-restaurant
    2  39.984224   116.319402   3     randomvalue
    3  39.984211   116.319389   4  bar-restaurant
    4  39.984217   116.319422   5  bar-restaurant
    5  39.984710   116.319865   6  bar-restaurant
    6  39.984674   116.319810   7       random123
    7  39.984623   116.319773   8             123
    """
    if not inplace:
        data = data.copy()
    logger.debug('union restaurant and bar categories to one category')
    if bar_restaurant is None:
        bar_restaurant = ['restaurant', 'bar']
    filter_bar_restaurant = data[label_poi].isin(bar_restaurant)
    data.at[data[filter_bar_restaurant].index, label_poi] = 'bar-restaurant'
    if not inplace:
        return data


def union_poi_parks(
    data: DataFrame,
    label_poi: str = TYPE_POI,
    parks: list[str] | None = None,
    inplace: bool = False
) -> DataFrame | None:
    """
    Performs the union between park categories.

    For Points of Interest in a single category named 'parks'.

    Parameters
    ----------
    data : DataFrame
        Input points of interest data
    label_poi : str, optional
        Label referring to the Point of Interest category, by default TYPE_POI
    parks : list of str, optional
        Names of poi referring to parks, by default
        parks = [
            'pracas_e_parques',
            'park'
        ]
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        data with poi or None

    Examples
    --------
    >>> from pymove.utils.integration import union_poi_parks
    >>> pois_df
             lat          lon  id          type_poi
    0  39.984094   116.319236   1  pracas_e_parques
    1  39.984198   116.319322   2              park
    2  39.984224   116.319402   3             parks
    3  39.984211   116.319389   4            random
    4  39.984217   116.319422   5               123
    5  39.984710   116.319865   6              park
    6  39.984674   116.319810   7             parks
    7  39.984623   116.319773   8  pracas_e_parques
    >>> union_poi_parks(pois_df)
             lat          lon  id  type_poi
    0  39.984094   116.319236   1     parks
    1  39.984198   116.319322   2     parks
    2  39.984224   116.319402   3     parks
    3  39.984211   116.319389   4    random
    4  39.984217   116.319422   5       123
    5  39.984710   116.319865   6     parks
    6  39.984674   116.319810   7     parks
    7  39.984623   116.319773   8     parks
    """
    if not inplace:
        data = data.copy()
    logger.debug('union parks categories to one category')
    if parks is None:
        parks = ['pracas_e_parques', 'park']
    filter_parks = data[label_poi].isin(parks)
    data.at[data[filter_parks].index, label_poi] = 'parks'
    if not inplace:
        return data


def union_poi_police(
    data: DataFrame,
    label_poi: str = TYPE_POI,
    police: list[str] | None = None,
    inplace: bool = False
) -> DataFrame | None:
    """
    Performs the union between police categories.

    For Points of Interest in a single category named 'police'.

    Parameters
    ----------
    data : DataFrame
        Input points of interest data
    label_poi : str, optional
        Label referring to the Point of Interest category, by default TYPE_POI
    police : list of str, optional
        Names of poi referring to police stations, by default
        police = [
            'distritos_policiais',
            'delegacia'
        ]
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        data with poi or None

    Examples
    --------
    >>> from pymove.utils.integration import union_poi_police
    >>> pois_df
             lat          lon  id             type_poi
    0  39.984094   116.319236   1  distritos_policiais
    1  39.984198   116.319322   2               police
    2  39.984224   116.319402   3               police
    3  39.984211   116.319389   4  distritos_policiais
    4  39.984217   116.319422   5               random
    5  39.984710   116.319865   6          randomvalue
    6  39.984674   116.319810   7                  123
    7  39.984623   116.319773   8          bus_station
    >>> union_poi_police(pois_df)
             lat          lon  id     type_poi
    0  39.984094   116.319236   1       police
    1  39.984198   116.319322   2       police
    2  39.984224   116.319402   3       police
    3  39.984211   116.319389   4       police
    4  39.984217   116.319422   5       random
    5  39.984710   116.319865   6  randomvalue
    6  39.984674   116.319810   7          123
    7  39.984623   116.319773   8  bus_station
    """
    if not inplace:
        data = data.copy()
    logger.debug('union distritos policies and police categories')
    if police is None:
        police = ['distritos_policiais', 'delegacia']
    filter_police = data[label_poi].isin(police)
    data.at[data[filter_police].index, label_poi] = 'police'
    if not inplace:
        return data


def join_collective_areas(
    data: DataFrame,
    areas: DataFrame,
    label_geometry: str = GEOMETRY,
    inplace: bool = False
) -> DataFrame | None:
    """
    Performs the integration between trajectories and collective areas.

    Generates a new column that informs whether each point of the
    trajectory falls inside a collective area.

    Parameters
    ----------
    data : geopandas.GeoDataFrame
        The input trajectory data
    areas : geopandas.GeoDataFrame
        The input collective areas data
    label_geometry : str, optional
        Label of both dataframes referring to the geometry column,
        by default GEOMETRY
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        data with joined geometries or None

    Examples
    --------
    >>> from pymove.utils.integration import join_collective_areas
    >>> data
             lat          lon             datetime  id                     geometry
    0  39.984094   116.319236  2008-10-23 05:53:05   1   POINT (116.31924 39.98409)
    1  39.984198   116.319322  2008-10-23 05:53:06   1   POINT (116.31932 39.98420)
    2  39.984224   116.319402  2008-10-23 05:53:11   1   POINT (116.31940 39.98422)
    3  39.984211   116.319389  2008-10-23 05:53:16   1   POINT (116.31939 39.98421)
    4  39.984217   116.319422  2008-10-23 05:53:21   1   POINT (116.31942 39.98422)
    >>> area_c
             lat          lon             datetime  id                      geometry
    0  39.984094   116.319236  2008-10-23 05:53:05   1  POINT (116.319236 39.984094)
    1  40.006436   116.317701  2008-10-23 10:53:31   1  POINT (116.317701 40.006436)
    2  40.014125   116.306159  2008-10-23 23:43:56   1  POINT (116.306159 40.014125)
    3  39.984211   116.319389  2008-10-23 05:53:16   1  POINT (116.319389 39.984211)
    >>> join_collective_areas(gdf, area_c)
    >>> gdf.head()
             lat          lon             datetime  id \
                            geometry  violating
    0  39.984094   116.319236  2008-10-23 05:53:05   1 \
        POINT (116.319236 39.984094)       True
    1  39.984198   116.319322  2008-10-23 05:53:06   1 \
        POINT (116.319322 39.984198)      False
    2  39.984224   116.319402  2008-10-23 05:53:11   1 \
        POINT (116.319402 39.984224)      False
    3  39.984211   116.319389  2008-10-23 05:53:16   1 \
        POINT (116.319389 39.984211)       True
    4  39.984217   116.319422  2008-10-23 05:53:21   1 \
        POINT (116.319422 39.984217)      False
    """
    if not inplace:
        data = data.copy()
    logger.debug('Integration between trajectories and collectives areas')

    Geometry = namedtuple('Geometry', 'geom coordinates')
    polygons = areas[label_geometry].apply(
        lambda g: Geometry(g.__class__, g.__geo_interface__.get('coordinates'))
    ).unique()
    polygons = [p.geom(p.coordinates) for p in polygons]
    data[VIOLATING] = False
    for p in progress_bar(polygons, desc='Joining trajectories and areas'):
        intersects = data[label_geometry].apply(lambda x: x.intersects(p))
        index = data[intersects].index
        data.at[index, VIOLATING] = True
    if not inplace:
        return data


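# Hedged sketch of preparing the inputs for ``join_collective_areas``: both
# frames must carry shapely geometries in the column named by
# ``label_geometry``. The frames ``df`` and ``areas_df`` are hypothetical and
# geopandas is an assumed optional dependency (not imported by this module).
#
# >>> import geopandas as gpd
# >>> gdf = gpd.GeoDataFrame(
# ...     df, geometry=gpd.points_from_xy(df['lon'], df['lat'])
# ... )
# >>> areas = gpd.GeoDataFrame(
# ...     areas_df, geometry=gpd.points_from_xy(areas_df['lon'], areas_df['lat'])
# ... )
# >>> join_collective_areas(gdf, areas, inplace=True)
# >>> gdf['violating'].sum()  # number of points intersecting an area geometry

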
def _reset_and_creates_id_and_lat_lon(
    data: DataFrame,
    df_pois: DataFrame,
    lat_lon_poi: bool = True,
    reset_index: bool = True
) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray]:
    """
    Resets the indexes of the dataframes.

    Returns the minimum distance between the two dataframes, and their
    respective variables (id, tags, latitude and longitude).

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_pois : DataFrame
        The input point of interest data.
    lat_lon_poi : bool, optional
        Flag to determine whether the ids and tags are of a size equivalent
        to df_pois, by default True
    reset_index : bool, optional
        Flag for reset index of the df_pois and data dataframes before the
        join, by default True

    Returns
    -------
    distances, ids, tags, lat, lon
        arrays with default values for join operation

    Examples
    --------
    >>> from pymove.utils.integration import _reset_and_creates_id_and_lat_lon
    >>> move_df.head()
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    >>> pois.head()
             lat          lon  id  type_poi        name_poi
    0  39.984094   116.319236   1   policia  distrito_pol_1
    >>> _reset_and_creates_id_and_lat_lon(move_df, pois)
    (
        array([inf]),
        array([''], dtype=object),
        array([''], dtype=object),
        array([inf]),
        array([inf])
    )
    """
    if reset_index:
        logger.debug('... Resetting index to operation...')
        data.reset_index(drop=True, inplace=True)
        df_pois.reset_index(drop=True, inplace=True)

    # create numpy array to store new column to DataFrame of movement objects
    distances = np.full(
        data.shape[0], np.inf, dtype=np.float64
    )
    ids = np.full(data.shape[0], '', dtype='object_')
    tags = np.full(data.shape[0], '', dtype='object_')

    # creating lat and lon array to operation
    if lat_lon_poi:
        lat = np.full(df_pois.shape[0], np.inf, dtype=np.float64)
        lon = np.full(df_pois.shape[0], np.inf, dtype=np.float64)
    else:
        lat = np.full(data.shape[0], np.inf, dtype=np.float64)
        lon = np.full(data.shape[0], np.inf, dtype=np.float64)

    return distances, ids, tags, lat, lon


def _reset_set_window__and_creates_event_id_type(
    data: DataFrame,
    df_events: DataFrame,
    time_window: float,
    label_date: str = DATETIME
) -> tuple[Series, Series, ndarray, ndarray, ndarray]:
    """
    Resets the indexes of the dataframes.

    Sets the time window, and returns the current distances between the two
    dataframes and their respective variables (event_id, event_type).

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_events : DataFrame
        The input event point of interest data.
    time_window : float
        Number of seconds of the time window.
    label_date : str, optional
        Label of data referring to the datetime, by default DATETIME

    Returns
    -------
    window_starts, window_ends, current_distances, event_id, event_type
        arrays with default values for join operation

    Examples
    --------
    >>> from pymove.utils.integration import _reset_set_window__and_creates_event_id_type
    >>> move_df.head()
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    >>> pois_df
             lat          lon  event_id             datetime          event_type
    0  39.984094   116.319236         1  2008-10-24 01:57:57  show do tropykalia
    >>> _reset_set_window__and_creates_event_id_type(move_df, pois, 600)
    (
        0   2008-10-23 05:43:05
        Name: datetime, dtype: datetime64[ns],
        0   2008-10-23 06:03:05
        Name: datetime, dtype: datetime64[ns],
        array([inf]),
        array([''], dtype=object),
        array([''], dtype=object)
    )
    """
    # reset indexes of both dataframes before the join
    data.reset_index(drop=True, inplace=True)
    df_events.reset_index(drop=True, inplace=True)

    # compute the time window of each point
    window_starts = data[label_date] - Timedelta(seconds=time_window)
    window_ends = data[label_date] + Timedelta(seconds=time_window)

    # create vector to store distances
    current_distances = np.full(
        data.shape[0], np.inf, dtype=np.float64
    )
    event_type = np.full(data.shape[0], '', dtype='object_')
    event_id = np.full(data.shape[0], '', dtype='object_')

    return window_starts, window_ends, current_distances, event_id, event_type


def _reset_set_window_and_creates_event_id_type_all(
    data: DataFrame,
    df_events: DataFrame,
    time_window: float,
    label_date: str = DATETIME
) -> tuple[Series, Series, ndarray, ndarray, ndarray]:
    """
    Resets the indexes of the dataframes.

    Sets the time window, and returns the current distances between the two
    dataframes and their respective variables (event_id, event_type).

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_events : DataFrame
        The input event point of interest data.
    time_window : float
        Number of seconds of the time window.
    label_date : str
        Label of data referring to the datetime.

    Returns
    -------
    window_starts, window_ends, current_distances, event_id, event_type
        arrays with default values for join operation

    Examples
    --------
    >>> from pymove.utils.integration import _reset_set_window_and_creates_event_id_type_all  # noqa
    >>> move_df.head()
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    >>> pois_df
             lat          lon  event_id             datetime          event_type
    0  39.984094   116.319236         1  2008-10-24 01:57:57  show do tropykalia
    >>> _reset_set_window_and_creates_event_id_type_all(move_df, pois, 600)
    (
        0   2008-10-23 05:43:05
        Name: datetime, dtype: datetime64[ns],
        0   2008-10-23 06:03:05
        Name: datetime, dtype: datetime64[ns],
        array([None], dtype=object),
        array([None], dtype=object),
        array([None], dtype=object)
    )
    """
    # reset indexes of both dataframes before the join
    data.reset_index(drop=True, inplace=True)
    df_events.reset_index(drop=True, inplace=True)

    # compute the time window of each point
    window_starts = data[label_date] - Timedelta(seconds=time_window)
    window_ends = data[label_date] + Timedelta(seconds=time_window)

    # create vector to store distances
    current_distances = np.full(
        data.shape[0], None, dtype=np.ndarray
    )
    event_type = np.full(data.shape[0], None, dtype=np.ndarray)
    event_id = np.full(data.shape[0], None, dtype=np.ndarray)

    return window_starts, window_ends, current_distances, event_id, event_type


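# Minimal sketch of the time-window arithmetic performed by the two
# ``_reset_set_window*`` helpers above, using plain pandas (the values match
# the docstring examples for a 600-second window):
#
# >>> import pandas as pd
# >>> datetimes = pd.Series(pd.to_datetime(['2008-10-23 05:53:05']))
# >>> window_starts = datetimes - pd.Timedelta(seconds=600)
# >>> window_ends = datetimes + pd.Timedelta(seconds=600)
# >>> window_starts[0], window_ends[0]
# (Timestamp('2008-10-23 05:43:05'), Timestamp('2008-10-23 06:03:05'))

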
def join_with_pois(
    data: DataFrame,
    df_pois: DataFrame,
    label_id: str = TRAJ_ID,
    label_poi_name: str = NAME_POI,
    reset_index: bool = True,
    inplace: bool = False
):
    """
    Performs the integration between trajectories and the closest point of interest.

    Generates two new columns referring to the name of, and the distance to,
    the point of interest closest to each point of the trajectory.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_pois : DataFrame
        The input point of interest data.
    label_id : str, optional
        Label of df_pois referring to the Point of Interest id, by default TRAJ_ID
    label_poi_name : str, optional
        Label of df_pois referring to the Point of Interest name, by default NAME_POI
    reset_index : bool, optional
        Flag for reset index of the df_pois and data dataframes before the
        join, by default True
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Examples
    --------
    >>> from pymove.utils.integration import join_with_pois
    >>> move_df
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    1  39.984559   116.326696  2008-10-23 10:37:26   1
    2  40.002899   116.321520  2008-10-23 10:50:16   1
    3  40.016238   116.307691  2008-10-23 11:03:06   1
    4  40.013814   116.306525  2008-10-23 11:58:33   2
    5  40.009735   116.315069  2008-10-23 23:50:45   2
    >>> pois
             lat          lon  id  type_poi             name_poi
    0  39.984094   116.319236   1   policia       distrito_pol_1
    1  39.991013   116.326384   2   policia      policia_federal
    2  40.010000   116.312615   3  comercio  supermercado_aroldo
    >>> join_with_pois(move_df, pois)
             lat          lon             datetime  id  id_poi \
           dist_poi             name_poi
    0  39.984094   116.319236  2008-10-23 05:53:05   1       1 \
           0.000000       distrito_pol_1
    1  39.984559   116.326696  2008-10-23 10:37:26   1       1 \
         637.690216       distrito_pol_1
    2  40.002899   116.321520  2008-10-23 10:50:16   1       3 \
        1094.860663  supermercado_aroldo
    3  40.016238   116.307691  2008-10-23 11:03:06   1       3 \
         810.542998  supermercado_aroldo
    4  40.013814   116.306525  2008-10-23 11:58:33   2       3 \
         669.973155  supermercado_aroldo
    5  40.009735   116.315069  2008-10-23 23:50:45   2       3 \
         211.069129  supermercado_aroldo
    """
    if not inplace:
        data = data.copy()
        df_pois = df_pois.copy()

    values = _reset_and_creates_id_and_lat_lon(data, df_pois, False, reset_index)
    minimum_distances, ids_pois, tag_pois, lat_poi, lon_poi = values

    for idx, row in progress_bar(
        df_pois.iterrows(), total=len(df_pois), desc='Optimized integration with POIs'
    ):
        # update lat and lon of current index
        lat_poi.fill(row[LATITUDE])
        lon_poi.fill(row[LONGITUDE])

        # First iteration is minimum distances
        if idx == 0:
            minimum_distances = np.array(
                haversine(
                    lat_poi, lon_poi, data[LATITUDE].values, data[LONGITUDE].values
                )
            )
            ids_pois.fill(row[label_id])
            tag_pois.fill(row[label_poi_name])
        else:
            # compute dist between the current POI and ALL trajectory points
            current_distances = np.float64(
                haversine(
                    lat_poi, lon_poi, data[LATITUDE].values, data[LONGITUDE].values
                )
            )
            compare = current_distances < minimum_distances
            minimum_distances = np.minimum(
                current_distances, minimum_distances, dtype=np.float64
            )
            ids_pois[compare] = row[label_id]
            tag_pois[compare] = row[label_poi_name]

    data[ID_POI] = ids_pois
    data[DIST_POI] = minimum_distances
    data[NAME_POI] = tag_pois
    logger.debug('Integration with POI was finalized')

    if not inplace:
        return data


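# The loop above keeps a running minimum: for each POI it computes the
# haversine distance to every trajectory point at once, then overwrites only
# the entries that became closer. A standalone sketch of that pattern with
# hypothetical arrays (``haversine`` is the same function imported above):
#
# >>> import numpy as np
# >>> from pymove.utils.distances import haversine
# >>> traj_lat = np.array([39.984094, 40.002899])
# >>> traj_lon = np.array([116.319236, 116.321520])
# >>> best = np.full(traj_lat.shape[0], np.inf)
# >>> best_id = np.full(traj_lat.shape[0], '', dtype=object)
# >>> for poi_id, plat, plon in [(1, 39.984094, 116.319236), (3, 40.010000, 116.312615)]:
# ...     d = haversine(
# ...         np.full_like(traj_lat, plat), np.full_like(traj_lon, plon),
# ...         traj_lat, traj_lon
# ...     )
# ...     closer = d < best
# ...     best = np.minimum(d, best)
# ...     best_id[closer] = poi_id

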
def join_with_pois_by_category(
    data: DataFrame,
    df_pois: DataFrame,
    label_category: str = TYPE_POI,
    label_id: str = TRAJ_ID,
    inplace: bool = False
):
    """
    Performs the integration between trajectories and each type of points of interest.

    Generates new columns referring to the id of, and the distance to, the
    nearest point of interest of each category at each point of the
    trajectory.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_pois : DataFrame
        The input point of interest data.
    label_category : str, optional
        Label of df_pois referring to the point of interest category,
        by default TYPE_POI
    label_id : str, optional
        Label of df_pois referring to the point of interest id, by default TRAJ_ID
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Examples
    --------
    >>> from pymove.utils.integration import join_with_pois_by_category
    >>> move_df
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    1  39.984559   116.326696  2008-10-23 10:37:26   1
    2  40.002899   116.321520  2008-10-23 10:50:16   1
    3  40.016238   116.307691  2008-10-23 11:03:06   1
    4  40.013814   116.306525  2008-10-23 11:58:33   2
    5  40.009735   116.315069  2008-10-23 23:50:45   2
    >>> pois
             lat          lon  id  type_poi             name_poi
    0  39.984094   116.319236   1   policia       distrito_pol_1
    1  39.991013   116.326384   2   policia      policia_federal
    2  40.010000   116.312615   3  comercio  supermercado_aroldo
    >>> join_with_pois_by_category(move_df, pois)
             lat          lon             datetime  id \
      id_policia  dist_policia  id_comercio  dist_comercio
    0  39.984094   116.319236  2008-10-23 05:53:05   1 \
               1      0.000000            3    2935.310277
    1  39.984559   116.326696  2008-10-23 10:37:26   1 \
               1    637.690216            3    3072.696379
    2  40.002899   116.321520  2008-10-23 10:50:16   1 \
               2   1385.087181            3    1094.860663
    3  40.016238   116.307691  2008-10-23 11:03:06   1 \
               2   3225.288831            3     810.542998
    4  40.013814   116.306525  2008-10-23 11:58:33   2 \
               2   3047.838222            3     669.973155
    5  40.009735   116.315069  2008-10-23 23:50:45   2 \
               2   2294.075820            3     211.069129
    """
    if not inplace:
        data = data.copy()
        df_pois = df_pois.copy()

    logger.debug('Integration with POIs...')

    # reset indexes of both dataframes before the join
    data.reset_index(drop=True, inplace=True)
    df_pois.reset_index(drop=True, inplace=True)

    # create numpy array to store new column to DataFrame of movement objects
    current_distances = np.full(
        data.shape[0], np.inf, dtype=np.float64
    )
    ids_pois = np.full(data.shape[0], np.nan, dtype='object_')

    unique_categories = df_pois[label_category].unique()
    size_categories = len(unique_categories)
    logger.debug('There are %s categories' % size_categories)

    for i, c in enumerate(unique_categories, start=1):
        # creating lat and lon array to operation
        df_category = df_pois[df_pois[label_category] == c]
        df_category.reset_index(drop=True, inplace=True)
        desc = f'computing dist to {c} category ({i}/{size_categories})'
        for idx, row in progress_bar(data.iterrows(), total=len(data), desc=desc):
            lat_user = np.full(
                df_category.shape[0], row[LATITUDE], dtype=np.float64
            )
            lon_user = np.full(
                df_category.shape[0], row[LONGITUDE], dtype=np.float64
            )

            # computing distances to the POIs of the current category
            distances = haversine(
                lat_user,
                lon_user,
                df_category[LATITUDE].values,
                df_category[LONGITUDE].values,
            )

            # get index to arg_min and min distance
            index_min = np.argmin(distances)

            # setting data for a single object movement
            current_distances[idx] = np.min(distances)
            ids_pois[idx] = df_category.at[index_min, label_id]

        data['id_%s' % c] = ids_pois
        data['dist_%s' % c] = current_distances
    logger.debug('Integration with POI was finalized')

    if not inplace:
        return data


def join_with_events(
    data: DataFrame,
    df_events: DataFrame,
    label_date: str = DATETIME,
    time_window: int = 900,
    label_event_id: str = EVENT_ID,
    label_event_type: str = EVENT_TYPE,
    inplace: bool = False
):
    """
    Performs the integration between trajectories and the closest event in a time window.

    Generates new columns referring to the type of the event and the distance
    to the nearest event, based on the time of each point of the trajectories.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_events : DataFrame
        The input events points of interest data.
    label_date : str, optional
        Label of data referring to the datetime of the input trajectory data,
        by default DATETIME
    time_window : float, optional
        tolerable length of time range in `seconds` for assigning the event's
        point of interest to the trajectory point, by default 900
    label_event_id : str, optional
        Label of df_events referring to the id of the event, by default EVENT_ID
    label_event_type : str, optional
        Label of df_events referring to the type of the event, by default EVENT_TYPE
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Examples
    --------
    >>> from pymove.utils.integration import join_with_events
    >>> move_df
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    1  39.984559   116.326696  2008-10-23 10:37:26   1
    2  39.993527   116.326483  2008-10-24 00:02:14   2
    3  39.978575   116.326975  2008-10-24 00:22:01   3
    4  39.981668   116.310769  2008-10-24 01:57:57   3
    >>> events
             lat          lon  id             datetime  event_type           event_id
    0  39.984094   116.319236   1  2008-10-23 05:53:05        show   forro_tropykalia
    1  39.991013   116.326384   2  2008-10-23 10:37:26        show   dia_do_municipio
    2  40.010000   116.312615   3  2008-10-24 01:57:57       feira  adocao_de_animais
    >>> join_with_events(move_df, events)
             lat          lon             datetime  id \
      event_type   dist_event           event_id
    0  39.984094   116.319236  2008-10-23 05:53:05   1 \
            show     0.000000   forro_tropykalia
    1  39.984559   116.326696  2008-10-23 10:37:26   1 \
            show   718.144152   dia_do_municipio
    2  39.993527   116.326483  2008-10-24 00:02:14   2 \
                          inf
    3  39.978575   116.326975  2008-10-24 00:22:01   3 \
                          inf
    4  39.981668   116.310769  2008-10-24 01:57:57   3 \
           feira  3154.296880  adocao_de_animais

    Raises
    ------
    ValueError
        If feature generation fails
    """
    if not inplace:
        data = data.copy()
        df_events = df_events.copy()

    values = _reset_set_window__and_creates_event_id_type(
        data, df_events, time_window, label_date
    )
    *_, current_distances, event_id, event_type = values
    window_starts, window_ends, *_ = _reset_set_window__and_creates_event_id_type(
        df_events, data, time_window, label_date
    )
    minimum_distances = np.full(
        data.shape[0], np.inf, dtype=np.float64
    )
    for idx, row in progress_bar(
        df_events.iterrows(), total=len(df_events), desc='Integration with Events'
    ):
        df_filtered = filters.by_datetime(
            data, window_starts[idx], window_ends[idx]
        )
        if df_filtered is None:
            raise ValueError('Filtering datetime failed!')

        size_filter = df_filtered.shape[0]

        if size_filter > 0:
            indexes = df_filtered.index
            lat_event = np.full(
                df_filtered.shape[0], row[LATITUDE], dtype=np.float64
            )
            lon_event = np.full(
                df_filtered.shape[0], row[LONGITUDE], dtype=np.float64
            )

            # First iteration is minimum distances
            if idx == 0:
                minimum_distances[indexes] = haversine(
                    lat_event,
                    lon_event,
                    df_filtered[LATITUDE].values,
                    df_filtered[LONGITUDE].values,
                )
                event_id[indexes] = row[label_event_id]
                event_type[indexes] = row[label_event_type]
            else:
                current_distances[indexes] = haversine(
                    lat_event,
                    lon_event,
                    df_filtered[LATITUDE].values,
                    df_filtered[LONGITUDE].values,
                )
                compare = current_distances < minimum_distances
                minimum_distances = np.minimum(
                    current_distances, minimum_distances
                )
                event_id[compare] = row[label_event_id]
                event_type[compare] = row[label_event_type]

    data[label_event_id] = event_id
    data[DIST_EVENT] = minimum_distances
    data[label_event_type] = event_type
    logger.debug('Integration with events was completed')

    if not inplace:
        return data


def join_with_event_by_dist_and_time(
    data: DataFrame,
    df_events: DataFrame,
    label_date: str = DATETIME,
    label_event_id: str = EVENT_ID,
    label_event_type: str = EVENT_TYPE,
    time_window: float = 3600,
    radius: float = 1000,
    inplace: bool = False
):
    """
    Performs the integration between trajectories and events on windows.

    Generates new columns referring to the types of, the distances to, and the
    ids of all events near each trajectory point, based on a distance radius
    and on a time window around each point of the trajectories.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_events : DataFrame
        The input events points of interest data.
    label_date : str, optional
        Label of data referring to the datetime of the input trajectory data,
        by default DATETIME
    label_event_id : str, optional
        Label of df_events referring to the id of the event, by default EVENT_ID
    label_event_type : str, optional
        Label of df_events referring to the type of the event, by default EVENT_TYPE
    time_window : float, optional
        tolerable length of time range in `seconds` for assigning the event's
        point of interest to the trajectory point, by default 3600
    radius : float, optional
        maximum radius of pois in `meters`, by default 1000
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Examples
    --------
    >>> from pymove.utils.integration import join_with_event_by_dist_and_time
    >>> move_df
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    1  39.984559   116.326696  2008-10-23 10:37:26   1
    2  39.993527   116.326483  2008-10-24 00:02:14   2
    3  39.978575   116.326975  2008-10-24 00:22:01   3
    4  39.981668   116.310769  2008-10-24 01:57:57   3
    >>> events
             lat          lon  id             datetime  type_poi           name_poi
    0  39.984094   116.319236   1  2008-10-23 05:53:05      show   forro_tropykalia
    1  39.991013   116.326384   2  2008-10-23 10:27:26   corrida   racha_de_jumento
    2  39.990013   116.316384   2  2008-10-23 10:37:26      show   dia_do_municipio
    3  40.010000   116.312615   3  2008-10-24 01:57:57     feira  adocao_de_animais
    >>> join_with_event_by_dist_and_time(move_df, events)
    >>> move_df
             lat          lon             datetime  id \
            type_poi           dist_event                              name_poi
    0  39.984094   116.319236  2008-10-23 05:53:05   1 \
              [show]                [0.0]                    [forro_tropykalia]
    1  39.984559   116.326696  2008-10-23 10:37:26   1 \
     [corrida, show]   [718.144, 1067.53]  [racha_de_jumento, dia_do_municipio]
    2  39.993527   116.326483  2008-10-24 00:02:14   2 \
                None                 None                                  None
    3  39.978575   116.326975  2008-10-24 00:22:01   3 \
                None                 None                                  None
    4  39.981668   116.310769  2008-10-24 01:57:57   3 \
                None                 None                                  None

    Raises
    ------
    KeyError
        If df_events does not contain the label_date column
    ValueError
        If feature generation fails
    """
    if label_date not in df_events:
        raise KeyError("POI's DataFrame must contain a %s column" % label_date)

    if not inplace:
        data = data.copy()
        df_events = df_events.copy()

    values = _reset_set_window_and_creates_event_id_type_all(
        data, df_events, time_window, label_date
    )
    window_start, window_end, current_distances, event_id, event_type = values

    for idx, row in progress_bar(
        data.iterrows(), total=len(data), desc='Integration with Events'
    ):
        # set min and max of coordinates by radius
        bbox = filters.get_bbox_by_radius(
            (row[LATITUDE], row[LONGITUDE]), radius
        )

        # filter event by radius
        df_filtered = filters.by_bbox(
            df_events, bbox, inplace=False
        )
        if df_filtered is None:
            raise ValueError('Filtering bbox failed')

        # filter event by datetime
        filters.by_datetime(
            df_filtered,
            start_datetime=window_start[idx],
            end_datetime=window_end[idx],
            inplace=True
        )

        # get df_filtered size
        size_filter = df_filtered.shape[0]

        if size_filter > 0:
            # resetting index of data frame
            df_filtered.reset_index(drop=True, inplace=True)

            # create lat and lon array to operation
            lat_user = np.full(
                size_filter, row[LATITUDE], dtype=np.float64
            )
            lon_user = np.full(
                size_filter, row[LONGITUDE], dtype=np.float64
            )

            # calculate distances between points
            distances = haversine(
                lat_user,
                lon_user,
                df_filtered[LATITUDE].to_numpy(),
                df_filtered[LONGITUDE].to_numpy()
            )

            current_distances[idx] = distances
            event_type[idx] = df_filtered[label_event_type].to_numpy(dtype=np.ndarray)
            event_id[idx] = df_filtered[label_event_id].to_numpy(dtype=np.ndarray)

    data[label_event_id] = event_id
    data[DIST_EVENT] = current_distances
    data[label_event_type] = event_type
    logger.debug('Integration with event was completed')

    if not inplace:
        return data


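# Hedged sketch of the per-point spatial pre-filter used above:
# ``filters.get_bbox_by_radius`` builds a bounding box around a coordinate and
# ``filters.by_bbox`` keeps only the events inside it (both calls mirror the
# usage in the loop above). ``events`` is a hypothetical DataFrame with
# 'lat'/'lon' columns.
#
# >>> from pymove.preprocessing import filters
# >>> bbox = filters.get_bbox_by_radius((39.984094, 116.319236), 1000)
# >>> nearby = filters.by_bbox(events, bbox, inplace=False)

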
def join_with_home_by_id(
    data: DataFrame,
    df_home: DataFrame,
    label_id: str = TRAJ_ID,
    label_address: str = ADDRESS,
    label_city: str = CITY,
    drop_id_without_home: bool = False,
    inplace: bool = False
):
    """
    Performs the integration between trajectories and home points.

    Generating new columns referring to the distance of the nearest home point,
    address and city of each trajectory point.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_home : DataFrame
        The input home points data.
    label_id : str, optional
        Label of df_home referring to the home point id, by default TRAJ_ID
    label_address : str, optional
        Label of df_home referring to the home point address, by default ADDRESS
    label_city : str, optional
        Label of df_home referring to the point city, by default CITY
    drop_id_without_home : bool, optional
        flag as an option to drop id's that don't have houses, by default False
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Examples
    --------
    >>> from pymove.utils.integration import join_with_home_by_id
    >>> move_df
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    1  39.984559   116.326696  2008-10-23 10:37:26   1
    2  40.002899   116.321520  2008-10-23 10:50:16   1
    3  40.016238   116.307691  2008-10-23 11:03:06   1
    4  40.013814   116.306525  2008-10-23 11:58:33   2
    5  40.009735   116.315069  2008-10-23 23:50:45   2
    >>> home_df
             lat          lon  id  formatted_address           city
    0  39.984094   116.319236   1         rua da mae      quixiling
    1  40.013821   116.306531   2     rua da familia  quixeramoling
    >>> join_with_home_by_id(move_df, home_df)
    >>> move_df
       id        lat          lon             datetime    dist_home \
                home           city
    0   1  39.984094   116.319236  2008-10-23 05:53:05     0.000000 \
          rua da mae      quixiling
    1   1  39.984559   116.326696  2008-10-23 10:37:26   637.690216 \
          rua da mae      quixiling
    2   1  40.002899   116.321520  2008-10-23 10:50:16  2100.053501 \
          rua da mae      quixiling
    3   1  40.016238   116.307691  2008-10-23 11:03:06  3707.066732 \
          rua da mae      quixiling
    4   2  40.013814   116.306525  2008-10-23 11:58:33     0.931101 \
      rua da familia  quixeramoling
    5   2  40.009735   116.315069  2008-10-23 23:50:45   857.417540 \
      rua da familia  quixeramoling
    """
    if not inplace:
        data = data.copy()
        df_home = df_home.copy()

    ids_without_home = []

    if data.index.name is None:
        logger.debug(f'...setting {label_id} as index')
        data.set_index(label_id, inplace=True)

    for idx in progress_bar(
        data.index.unique(),
        total=len(data.index.unique()),
        desc='Integration with Home'
    ):
        filter_home = df_home[label_id] == idx

        if df_home[filter_home].shape[0] == 0:
            logger.debug(f'...id: {idx} has not HOME')
            ids_without_home.append(idx)
        else:
            home = df_home[filter_home].iloc[0]
            lat_user = data.at[idx, LATITUDE].values
            lon_user = data.at[idx, LONGITUDE].values

            # if user has a single tuple
            if not isinstance(lat_user, np.ndarray):
                lat_home = home[LATITUDE].values
                lon_home = home[LONGITUDE].values
                data.at[idx, DIST_HOME] = haversine(
                    lat_user, lon_user, lat_home, lon_home
                )
                data.at[idx, HOME] = home[label_address]
                data.at[idx, label_city] = home[label_city]
            else:
                lat_home = np.full(
                    data.loc[idx].shape[0], home[LATITUDE], dtype=np.float64
                )
                lon_home = np.full(
                    data.loc[idx].shape[0], home[LONGITUDE], dtype=np.float64
                )
                data.at[idx, DIST_HOME] = haversine(
                    lat_user, lon_user, lat_home, lon_home
                )
                data.at[idx, HOME] = np.array(home[label_address])
                data.at[idx, label_city] = np.array(home[label_city])

    data.reset_index(inplace=True)
    logger.debug('... Resetting index')

    if drop_id_without_home:
        data.drop(data.loc[data[TRAJ_ID].isin(ids_without_home)].index, inplace=True)

    if not inplace:
        return data


def merge_home_with_poi(
    data: DataFrame,
    label_dist_poi: str = DIST_POI,
    label_name_poi: str = NAME_POI,
    label_id_poi: str = ID_POI,
    label_home: str = HOME,
    label_dist_home: str = DIST_HOME,
    drop_columns: bool = True,
    inplace: bool = False
):
    """
    Merges the home points and the points of interest of the trajectories.

    Considers the home points as other points of interest, generating a new
    DataFrame.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data, with join_with_pois and join_with_home_by_id
        applied.
    label_dist_poi : str, optional
        Label of data referring to the distance from the nearest point of
        interest, by default DIST_POI
    label_name_poi : str, optional
        Label of data referring to the name from the nearest point of
        interest, by default NAME_POI
    label_id_poi : str, optional
        Label of data referring to the id from the nearest point of
        interest, by default ID_POI
    label_home : str, optional
        Label of df_home referring to the home point, by default HOME
    label_dist_home : str, optional
        Label of df_home referring to the distance to the home point,
        by default DIST_HOME
    drop_columns : bool, optional
        Flag that controls the deletion of the columns referring to the
        id and the distance from the home point, by default True
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Examples
    --------
    >>> from pymove.utils.integration import (
    ...    merge_home_with_poi,
    ...    join_with_home_by_id
    ... )
    >>> move_df
             lat          lon             datetime  id \
      id_poi     dist_poi         name_poi
    0  39.984094   116.319236  2008-10-23 05:53:05   1 \
           1     0.000000   distrito_pol_1
    1  39.984559   116.326696  2008-10-23 10:37:26   1 \
           1   637.690216   distrito_pol_1
    2  40.002899   116.321520  2008-10-23 10:50:16   1 \
           2  1385.087181  policia_federal
    3  40.016238   116.307691  2008-10-23 11:03:06   1 \
           2  3225.288831  policia_federal
    4  40.013814   116.306525  2008-10-23 11:58:33   2 \
           2  3047.838222  policia_federal
    5  40.009735   116.315069  2008-10-23 23:50:45   2 \
           2  2294.075820  policia_federal
    >>> home_df
             lat          lon  id  formatted_address           city
    0  39.984094   116.319236   1         rua da mae      quixiling
    1  40.013821   116.306531   2     rua da familia  quixeramoling
    >>> join_with_home_by_id(move_df, home_df, inplace=True)
    >>> move_df
       id        lat          lon             datetime  id_poi     dist_poi \
             name_poi    dist_home            home           city
    0   1  39.984094   116.319236  2008-10-23 05:53:05       1     0.000000 \
       distrito_pol_1     0.000000      rua da mae      quixiling
    1   1  39.984559   116.326696  2008-10-23 10:37:26       1   637.690216 \
       distrito_pol_1   637.690216      rua da mae      quixiling
    2   1  40.002899   116.321520  2008-10-23 10:50:16       2  1385.087181 \
      policia_federal  2100.053501      rua da mae      quixiling
    3   1  40.016238   116.307691  2008-10-23 11:03:06       2  3225.288831 \
      policia_federal  3707.066732      rua da mae      quixiling
    4   2  40.013814   116.306525  2008-10-23 11:58:33       2  3047.838222 \
      policia_federal     0.931101  rua da familia  quixeramoling
    5   2  40.009735   116.315069  2008-10-23 23:50:45       2  2294.075820 \
      policia_federal   857.417540  rua da familia  quixeramoling
    >>> merge_home_with_poi(move_df)
       id        lat          lon             datetime          id_poi \
         dist_poi         name_poi           city
    0   1  39.984094   116.319236  2008-10-23 05:53:05      rua da mae \
         0.000000             home      quixiling
    1   1  39.984559   116.326696  2008-10-23 10:37:26      rua da mae \
       637.690216             home      quixiling
    2   1  40.002899   116.321520  2008-10-23 10:50:16               2 \
      1385.087181  policia_federal      quixiling
    3   1  40.016238   116.307691  2008-10-23 11:03:06               2 \
      3225.288831  policia_federal      quixiling
    4   2  40.013814   116.306525  2008-10-23 11:58:33  rua da familia \
         0.931101             home  quixeramoling
    5   2  40.009735   116.315069  2008-10-23 23:50:45  rua da familia \
       857.417540             home  quixeramoling
    """
    if not inplace:
        data = data.copy()

    logger.debug('merge home with POI using shortest distance')
    idx = data[data[label_dist_home] <= data[label_dist_poi]].index

    data.loc[idx, label_name_poi] = label_home
    data.loc[idx, label_dist_poi] = data.loc[idx, label_dist_home]
    data.loc[idx, label_id_poi] = data.loc[idx, label_home]

    if drop_columns:
        data.drop(columns=[label_dist_home, label_home], inplace=True)

    if not inplace:
        return data