Source code for pymove.utils.integration

"""
Integration operations.

union_poi_bank,
union_poi_bus_station,
union_poi_bar_restaurant,
union_poi_parks,
union_poi_police,
join_collective_areas,
join_with_pois,
join_with_pois_by_category,
join_with_events,
join_with_event_by_dist_and_time,
join_with_home_by_id,
merge_home_with_poi

"""
from __future__ import annotations

from collections import namedtuple

import numpy as np
from numpy import ndarray
from pandas import DataFrame, Timedelta
from pandas.core.series import Series

from pymove.preprocessing import filters
from pymove.utils.constants import (
    ADDRESS,
    CITY,
    DATETIME,
    DIST_EVENT,
    DIST_HOME,
    DIST_POI,
    EVENT_ID,
    EVENT_TYPE,
    GEOMETRY,
    HOME,
    ID_POI,
    LATITUDE,
    LONGITUDE,
    NAME_POI,
    TRAJ_ID,
    TYPE_POI,
    VIOLATING,
)
from pymove.utils.distances import haversine
from pymove.utils.log import logger, progress_bar
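
# ---------------------------------------------------------------------------
# Illustrative end-to-end sketch (not part of the module): a minimal flow
# chaining the public functions defined below. The DataFrames ``pois`` and
# ``home`` and the trajectory frame ``move_df`` are hypothetical; column
# names follow the defaults from ``pymove.utils.constants``.
#
# >>> from pymove.utils.integration import (
# ...     union_poi_bank, join_with_pois, join_with_home_by_id,
# ...     merge_home_with_poi,
# ... )
# >>> union_poi_bank(pois, inplace=True)          # unify bank categories
# >>> enriched = join_with_pois(move_df, pois)    # nearest POI per point
# >>> enriched = join_with_home_by_id(enriched, home)
# >>> merge_home_with_poi(enriched, inplace=True)  # treat home as a POI
# ---------------------------------------------------------------------------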


def union_poi_bank(
    data: DataFrame,
    label_poi: str = TYPE_POI,
    banks: list[str] | None = None,
    inplace: bool = False
) -> DataFrame | None:
    """
    Performs the union between the different bank categories.

    For Points of Interest in a single category named 'banks'.

    Parameters
    ----------
    data : DataFrame
        Input points of interest data
    label_poi : str, optional
        Label referring to the Point of Interest category, by default TYPE_POI
    banks : list of str, optional
        Names of poi referring to banks, by default
        banks = [
            'bancos_filiais',
            'bancos_agencias',
            'bancos_postos',
            'bancos_PAE',
            'bank',
        ]
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        data with poi or None

    Examples
    --------
    >>> from pymove.utils.integration import union_poi_bank
    >>> pois_df
             lat          lon  id         type_poi
    0  39.984094   116.319236   1             bank
    1  39.984198   116.319322   2      randomvalue
    2  39.984224   116.319402   3    bancos_postos
    3  39.984211   116.319389   4      randomvalue
    4  39.984217   116.319422   5       bancos_PAE
    5  39.984710   116.319865   6    bancos_postos
    6  39.984674   116.319810   7  bancos_agencias
    7  39.984623   116.319773   8   bancos_filiais
    8  39.984606   116.319732   9            banks
    9  39.984555   116.319728  10            banks
    >>> union_poi_bank(pois_df)
             lat          lon  id     type_poi
    0  39.984094   116.319236   1        banks
    1  39.984198   116.319322   2  randomvalue
    2  39.984224   116.319402   3        banks
    3  39.984211   116.319389   4  randomvalue
    4  39.984217   116.319422   5        banks
    5  39.984710   116.319865   6        banks
    6  39.984674   116.319810   7        banks
    7  39.984623   116.319773   8        banks
    8  39.984606   116.319732   9        banks
    9  39.984555   116.319728  10        banks
    """
    if not inplace:
        data = data.copy()
    logger.debug('union bank categories to one category')
    logger.debug(f'... There are {data[label_poi].nunique()} -- {label_poi}')
    if banks is None:
        banks = [
            'bancos_filiais',
            'bancos_agencias',
            'bancos_postos',
            'bancos_PAE',
            'bank',
        ]
    filter_bank = data[label_poi].isin(banks)
    data.at[data[filter_bank].index, label_poi] = 'banks'
    if not inplace:
        return data


def union_poi_bus_station(
    data: DataFrame,
    label_poi: str = TYPE_POI,
    bus_stations: list[str] | None = None,
    inplace: bool = False
) -> DataFrame | None:
    """
    Performs the union between the different bus station categories.

    For Points of Interest in a single category named 'bus_station'.

    Parameters
    ----------
    data : DataFrame
        Input points of interest data
    label_poi : str, optional
        Label referring to the Point of Interest category, by default TYPE_POI
    bus_stations : list of str, optional
        Names of poi referring to bus stations, by default
        bus_stations = [
            'transit_station',
            'pontos_de_onibus'
        ]
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        data with poi or None

    Examples
    --------
    >>> from pymove.utils.integration import union_poi_bus_station
    >>> pois_df
             lat          lon  id          type_poi
    0  39.984094   116.319236   1   transit_station
    1  39.984198   116.319322   2       randomvalue
    2  39.984224   116.319402   3   transit_station
    3  39.984211   116.319389   4  pontos_de_onibus
    4  39.984217   116.319422   5   transit_station
    5  39.984710   116.319865   6       randomvalue
    6  39.984674   116.319810   7       bus_station
    7  39.984623   116.319773   8       bus_station
    >>> union_poi_bus_station(pois_df)
             lat          lon  id     type_poi
    0  39.984094   116.319236   1  bus_station
    1  39.984198   116.319322   2  randomvalue
    2  39.984224   116.319402   3  bus_station
    3  39.984211   116.319389   4  bus_station
    4  39.984217   116.319422   5  bus_station
    5  39.984710   116.319865   6  randomvalue
    6  39.984674   116.319810   7  bus_station
    7  39.984623   116.319773   8  bus_station
    """
    if not inplace:
        data = data.copy()
    logger.debug('union bus station categories to one category')
    if bus_stations is None:
        bus_stations = [
            'transit_station',
            'pontos_de_onibus'
        ]
    filter_bus_station = data[label_poi].isin(bus_stations)
    data.at[data[filter_bus_station].index, label_poi] = 'bus_station'
    if not inplace:
        return data


def union_poi_bar_restaurant(
    data: DataFrame,
    label_poi: str = TYPE_POI,
    bar_restaurant: list[str] | None = None,
    inplace: bool = False
) -> DataFrame | None:
    """
    Performs the union between bar and restaurant categories.

    For Points of Interest in a single category named 'bar-restaurant'.

    Parameters
    ----------
    data : DataFrame
        Input points of interest data
    label_poi : str, optional
        Label referring to the Point of Interest category, by default TYPE_POI
    bar_restaurant : list of str, optional
        Names of poi referring to bars or restaurants, by default
        bar_restaurant = [
            'restaurant',
            'bar'
        ]
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        data with poi or None

    Examples
    --------
    >>> from pymove.utils.integration import union_poi_bar_restaurant
    >>> pois_df
             lat          lon  id        type_poi
    0  39.984094   116.319236   1      restaurant
    1  39.984198   116.319322   2      restaurant
    2  39.984224   116.319402   3     randomvalue
    3  39.984211   116.319389   4             bar
    4  39.984217   116.319422   5             bar
    5  39.984710   116.319865   6  bar-restaurant
    6  39.984674   116.319810   7       random123
    7  39.984623   116.319773   8             123
    >>> union_poi_bar_restaurant(pois_df)
             lat          lon  id        type_poi
    0  39.984094   116.319236   1  bar-restaurant
    1  39.984198   116.319322   2  bar-restaurant
    2  39.984224   116.319402   3     randomvalue
    3  39.984211   116.319389   4  bar-restaurant
    4  39.984217   116.319422   5  bar-restaurant
    5  39.984710   116.319865   6  bar-restaurant
    6  39.984674   116.319810   7       random123
    7  39.984623   116.319773   8             123
    """
    if not inplace:
        data = data.copy()
    logger.debug('union restaurant and bar categories to one category')
    if bar_restaurant is None:
        bar_restaurant = ['restaurant', 'bar']
    filter_bar_restaurant = data[label_poi].isin(bar_restaurant)
    data.at[data[filter_bar_restaurant].index, label_poi] = 'bar-restaurant'
    if not inplace:
        return data


def union_poi_parks(
    data: DataFrame,
    label_poi: str = TYPE_POI,
    parks: list[str] | None = None,
    inplace: bool = False
) -> DataFrame | None:
    """
    Performs the union between park categories.

    For Points of Interest in a single category named 'parks'.

    Parameters
    ----------
    data : DataFrame
        Input points of interest data
    label_poi : str, optional
        Label referring to the Point of Interest category, by default TYPE_POI
    parks : list of str, optional
        Names of poi referring to parks, by default
        parks = [
            'pracas_e_parques',
            'park'
        ]
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        data with poi or None

    Examples
    --------
    >>> from pymove.utils.integration import union_poi_parks
    >>> pois_df
             lat          lon  id          type_poi
    0  39.984094   116.319236   1  pracas_e_parques
    1  39.984198   116.319322   2              park
    2  39.984224   116.319402   3             parks
    3  39.984211   116.319389   4            random
    4  39.984217   116.319422   5               123
    5  39.984710   116.319865   6              park
    6  39.984674   116.319810   7             parks
    7  39.984623   116.319773   8  pracas_e_parques
    >>> union_poi_parks(pois_df)
             lat          lon  id  type_poi
    0  39.984094   116.319236   1     parks
    1  39.984198   116.319322   2     parks
    2  39.984224   116.319402   3     parks
    3  39.984211   116.319389   4    random
    4  39.984217   116.319422   5       123
    5  39.984710   116.319865   6     parks
    6  39.984674   116.319810   7     parks
    7  39.984623   116.319773   8     parks
    """
    if not inplace:
        data = data.copy()
    logger.debug('union parks categories to one category')
    if parks is None:
        parks = ['pracas_e_parques', 'park']
    filter_parks = data[label_poi].isin(parks)
    data.at[data[filter_parks].index, label_poi] = 'parks'
    if not inplace:
        return data


def union_poi_police(
    data: DataFrame,
    label_poi: str = TYPE_POI,
    police: list[str] | None = None,
    inplace: bool = False
) -> DataFrame | None:
    """
    Performs the union between police categories.

    For Points of Interest in a single category named 'police'.

    Parameters
    ----------
    data : DataFrame
        Input points of interest data
    label_poi : str, optional
        Label referring to the Point of Interest category, by default TYPE_POI
    police : list of str, optional
        Names of poi referring to police stations, by default
        police = [
            'distritos_policiais',
            'delegacia'
        ]
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        data with poi or None

    Examples
    --------
    >>> from pymove.utils.integration import union_poi_police
    >>> pois_df
             lat          lon  id             type_poi
    0  39.984094   116.319236   1  distritos_policiais
    1  39.984198   116.319322   2               police
    2  39.984224   116.319402   3               police
    3  39.984211   116.319389   4  distritos_policiais
    4  39.984217   116.319422   5               random
    5  39.984710   116.319865   6          randomvalue
    6  39.984674   116.319810   7                  123
    7  39.984623   116.319773   8          bus_station
    >>> union_poi_police(pois_df)
             lat          lon  id     type_poi
    0  39.984094   116.319236   1       police
    1  39.984198   116.319322   2       police
    2  39.984224   116.319402   3       police
    3  39.984211   116.319389   4       police
    4  39.984217   116.319422   5       random
    5  39.984710   116.319865   6  randomvalue
    6  39.984674   116.319810   7          123
    7  39.984623   116.319773   8  bus_station
    """
    if not inplace:
        data = data.copy()
    logger.debug('union distritos policies and police categories')
    if police is None:
        police = ['distritos_policiais', 'delegacia']
    filter_police = data[label_poi].isin(police)
    data.at[data[filter_police].index, label_poi] = 'police'
    if not inplace:
        return data


def join_collective_areas(
    data: DataFrame,
    areas: DataFrame,
    label_geometry: str = GEOMETRY,
    inplace: bool = False
) -> DataFrame | None:
    """
    Performs the integration between trajectories and collective areas.

    Generates a new column that informs whether each point of the
    trajectory falls inside a collective area.

    Parameters
    ----------
    data : geopandas.GeoDataFrame
        The input trajectory data
    areas : geopandas.GeoDataFrame
        The input collective areas data
    label_geometry : str, optional
        Label of both dataframes referring to the geometry column,
        by default GEOMETRY
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        data with joined geometries or None

    Examples
    --------
    >>> from pymove.utils.integration import join_collective_areas
    >>> data
             lat          lon             datetime  id                     geometry
    0  39.984094   116.319236  2008-10-23 05:53:05   1   POINT (116.31924 39.98409)
    1  39.984198   116.319322  2008-10-23 05:53:06   1   POINT (116.31932 39.98420)
    2  39.984224   116.319402  2008-10-23 05:53:11   1   POINT (116.31940 39.98422)
    3  39.984211   116.319389  2008-10-23 05:53:16   1   POINT (116.31939 39.98421)
    4  39.984217   116.319422  2008-10-23 05:53:21   1   POINT (116.31942 39.98422)
    >>> area_c
             lat          lon             datetime  id                      geometry
    0  39.984094   116.319236  2008-10-23 05:53:05   1  POINT (116.319236 39.984094)
    1  40.006436   116.317701  2008-10-23 10:53:31   1  POINT (116.317701 40.006436)
    2  40.014125   116.306159  2008-10-23 23:43:56   1  POINT (116.306159 40.014125)
    3  39.984211   116.319389  2008-10-23 05:53:16   1  POINT (116.319389 39.984211)
    >>> join_collective_areas(gdf, area_c)
    >>> gdf.head()
             lat          lon             datetime  id \
                            geometry  violating
    0  39.984094   116.319236  2008-10-23 05:53:05   1 \
        POINT (116.319236 39.984094)       True
    1  39.984198   116.319322  2008-10-23 05:53:06   1 \
        POINT (116.319322 39.984198)      False
    2  39.984224   116.319402  2008-10-23 05:53:11   1 \
        POINT (116.319402 39.984224)      False
    3  39.984211   116.319389  2008-10-23 05:53:16   1 \
        POINT (116.319389 39.984211)       True
    4  39.984217   116.319422  2008-10-23 05:53:21   1 \
        POINT (116.319422 39.984217)      False
    """
    if not inplace:
        data = data.copy()
    logger.debug('Integration between trajectories and collectives areas')

    Geometry = namedtuple('Geometry', 'geom coordinates')
    polygons = areas[label_geometry].apply(
        lambda g: Geometry(g.__class__, g.__geo_interface__.get('coordinates'))
    ).unique()
    polygons = [p.geom(p.coordinates) for p in polygons]
    data[VIOLATING] = False
    for p in progress_bar(polygons, desc='Joining trajectories and areas'):
        intersects = data[label_geometry].apply(lambda x: x.intersects(p))
        index = data[intersects].index
        data.at[index, VIOLATING] = True
    if not inplace:
        return data


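# Hedged sketch of preparing the inputs for ``join_collective_areas``: both
# frames must carry shapely geometries in the column named by
# ``label_geometry``. The frames ``df`` and ``areas_df`` are hypothetical and
# geopandas is an assumed optional dependency (not imported by this module).
#
# >>> import geopandas as gpd
# >>> gdf = gpd.GeoDataFrame(
# ...     df, geometry=gpd.points_from_xy(df['lon'], df['lat'])
# ... )
# >>> areas = gpd.GeoDataFrame(
# ...     areas_df, geometry=gpd.points_from_xy(areas_df['lon'], areas_df['lat'])
# ... )
# >>> join_collective_areas(gdf, areas, inplace=True)
# >>> gdf['violating'].sum()  # number of points intersecting an area geometry

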
def _reset_and_creates_id_and_lat_lon(
    data: DataFrame,
    df_pois: DataFrame,
    lat_lon_poi: bool = True,
    reset_index: bool = True
) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray]:
    """
    Resets the indexes of the dataframes.

    Returns the minimum distance between the two dataframes, and their
    respective variables (id, tags, latitude and longitude).

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_pois : DataFrame
        The input point of interest data.
    lat_lon_poi : bool, optional
        Flag to determine whether the ids and tags are of a size equivalent
        to df_pois, by default True
    reset_index : bool, optional
        Flag for reset index of the df_pois and data dataframes before the
        join, by default True

    Returns
    -------
    distances, ids, tags, lat, lon
        arrays with default values for join operation

    Examples
    --------
    >>> from pymove.utils.integration import _reset_and_creates_id_and_lat_lon
    >>> move_df.head()
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    >>> pois.head()
             lat          lon  id  type_poi        name_poi
    0  39.984094   116.319236   1   policia  distrito_pol_1
    >>> _reset_and_creates_id_and_lat_lon(move_df, pois)
    (
        array([inf]),
        array([''], dtype=object),
        array([''], dtype=object),
        array([inf]),
        array([inf])
    )
    """
    if reset_index:
        logger.debug('... Resetting index to operation...')
        data.reset_index(drop=True, inplace=True)
        df_pois.reset_index(drop=True, inplace=True)

    # create numpy array to store new column to DataFrame of movement objects
    distances = np.full(
        data.shape[0], np.inf, dtype=np.float64
    )
    ids = np.full(data.shape[0], '', dtype='object_')
    tags = np.full(data.shape[0], '', dtype='object_')

    # creating lat and lon array to operation
    if lat_lon_poi:
        lat = np.full(df_pois.shape[0], np.inf, dtype=np.float64)
        lon = np.full(df_pois.shape[0], np.inf, dtype=np.float64)
    else:
        lat = np.full(data.shape[0], np.inf, dtype=np.float64)
        lon = np.full(data.shape[0], np.inf, dtype=np.float64)

    return distances, ids, tags, lat, lon


def _reset_set_window__and_creates_event_id_type(
    data: DataFrame,
    df_events: DataFrame,
    time_window: float,
    label_date: str = DATETIME
) -> tuple[Series, Series, ndarray, ndarray, ndarray]:
    """
    Resets the indexes of the dataframes.

    Sets the time window, and returns the current distances between the two
    dataframes and their respective variables (event_id, event_type).

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_events : DataFrame
        The input event point of interest data.
    time_window : float
        Number of seconds of the time window.
    label_date : str, optional
        Label of data referring to the datetime, by default DATETIME

    Returns
    -------
    window_starts, window_ends, current_distances, event_id, event_type
        arrays with default values for join operation

    Examples
    --------
    >>> from pymove.utils.integration import _reset_set_window__and_creates_event_id_type
    >>> move_df.head()
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    >>> pois_df
             lat          lon  event_id             datetime          event_type
    0  39.984094   116.319236         1  2008-10-24 01:57:57  show do tropykalia
    >>> _reset_set_window__and_creates_event_id_type(move_df, pois, 600)
    (
        0   2008-10-23 05:43:05
        Name: datetime, dtype: datetime64[ns],
        0   2008-10-23 06:03:05
        Name: datetime, dtype: datetime64[ns],
        array([inf]),
        array([''], dtype=object),
        array([''], dtype=object)
    )
    """
    # reset indexes of both dataframes before the join
    data.reset_index(drop=True, inplace=True)
    df_events.reset_index(drop=True, inplace=True)

    # compute the time window of each point
    window_starts = data[label_date] - Timedelta(seconds=time_window)
    window_ends = data[label_date] + Timedelta(seconds=time_window)

    # create vector to store distances
    current_distances = np.full(
        data.shape[0], np.inf, dtype=np.float64
    )
    event_type = np.full(data.shape[0], '', dtype='object_')
    event_id = np.full(data.shape[0], '', dtype='object_')

    return window_starts, window_ends, current_distances, event_id, event_type


def _reset_set_window_and_creates_event_id_type_all(
    data: DataFrame,
    df_events: DataFrame,
    time_window: float,
    label_date: str = DATETIME
) -> tuple[Series, Series, ndarray, ndarray, ndarray]:
    """
    Resets the indexes of the dataframes.

    Sets the time window, and returns the current distances between the two
    dataframes and their respective variables (event_id, event_type).

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_events : DataFrame
        The input event point of interest data.
    time_window : float
        Number of seconds of the time window.
    label_date : str
        Label of data referring to the datetime.

    Returns
    -------
    window_starts, window_ends, current_distances, event_id, event_type
        arrays with default values for join operation

    Examples
    --------
    >>> from pymove.utils.integration import _reset_set_window_and_creates_event_id_type_all  # noqa
    >>> move_df.head()
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    >>> pois_df
             lat          lon  event_id             datetime          event_type
    0  39.984094   116.319236         1  2008-10-24 01:57:57  show do tropykalia
    >>> _reset_set_window_and_creates_event_id_type_all(move_df, pois, 600)
    (
        0   2008-10-23 05:43:05
        Name: datetime, dtype: datetime64[ns],
        0   2008-10-23 06:03:05
        Name: datetime, dtype: datetime64[ns],
        array([None], dtype=object),
        array([None], dtype=object),
        array([None], dtype=object)
    )
    """
    # reset indexes of both dataframes before the join
    data.reset_index(drop=True, inplace=True)
    df_events.reset_index(drop=True, inplace=True)

    # compute the time window of each point
    window_starts = data[label_date] - Timedelta(seconds=time_window)
    window_ends = data[label_date] + Timedelta(seconds=time_window)

    # create vector to store distances
    current_distances = np.full(
        data.shape[0], None, dtype=np.ndarray
    )
    event_type = np.full(data.shape[0], None, dtype=np.ndarray)
    event_id = np.full(data.shape[0], None, dtype=np.ndarray)

    return window_starts, window_ends, current_distances, event_id, event_type


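# Minimal sketch of the time-window arithmetic performed by the two
# ``_reset_set_window*`` helpers above, using plain pandas (the values match
# the docstring examples for a 600-second window):
#
# >>> import pandas as pd
# >>> datetimes = pd.Series(pd.to_datetime(['2008-10-23 05:53:05']))
# >>> window_starts = datetimes - pd.Timedelta(seconds=600)
# >>> window_ends = datetimes + pd.Timedelta(seconds=600)
# >>> window_starts[0], window_ends[0]
# (Timestamp('2008-10-23 05:43:05'), Timestamp('2008-10-23 06:03:05'))

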
def join_with_pois(
    data: DataFrame,
    df_pois: DataFrame,
    label_id: str = TRAJ_ID,
    label_poi_name: str = NAME_POI,
    reset_index: bool = True,
    inplace: bool = False
):
    """
    Performs the integration between trajectories and the closest point of interest.

    Generates two new columns referring to the name of, and the distance to,
    the point of interest closest to each point of the trajectory.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_pois : DataFrame
        The input point of interest data.
    label_id : str, optional
        Label of df_pois referring to the Point of Interest id, by default TRAJ_ID
    label_poi_name : str, optional
        Label of df_pois referring to the Point of Interest name, by default NAME_POI
    reset_index : bool, optional
        Flag for reset index of the df_pois and data dataframes before the
        join, by default True
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Examples
    --------
    >>> from pymove.utils.integration import join_with_pois
    >>> move_df
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    1  39.984559   116.326696  2008-10-23 10:37:26   1
    2  40.002899   116.321520  2008-10-23 10:50:16   1
    3  40.016238   116.307691  2008-10-23 11:03:06   1
    4  40.013814   116.306525  2008-10-23 11:58:33   2
    5  40.009735   116.315069  2008-10-23 23:50:45   2
    >>> pois
             lat          lon  id  type_poi             name_poi
    0  39.984094   116.319236   1   policia       distrito_pol_1
    1  39.991013   116.326384   2   policia      policia_federal
    2  40.010000   116.312615   3  comercio  supermercado_aroldo
    >>> join_with_pois(move_df, pois)
             lat          lon             datetime  id  id_poi \
           dist_poi             name_poi
    0  39.984094   116.319236  2008-10-23 05:53:05   1       1 \
           0.000000       distrito_pol_1
    1  39.984559   116.326696  2008-10-23 10:37:26   1       1 \
         637.690216       distrito_pol_1
    2  40.002899   116.321520  2008-10-23 10:50:16   1       3 \
        1094.860663  supermercado_aroldo
    3  40.016238   116.307691  2008-10-23 11:03:06   1       3 \
         810.542998  supermercado_aroldo
    4  40.013814   116.306525  2008-10-23 11:58:33   2       3 \
         669.973155  supermercado_aroldo
    5  40.009735   116.315069  2008-10-23 23:50:45   2       3 \
         211.069129  supermercado_aroldo
    """
    if not inplace:
        data = data.copy()
        df_pois = df_pois.copy()

    values = _reset_and_creates_id_and_lat_lon(data, df_pois, False, reset_index)
    minimum_distances, ids_pois, tag_pois, lat_poi, lon_poi = values

    for idx, row in progress_bar(
        df_pois.iterrows(), total=len(df_pois), desc='Optimized integration with POIs'
    ):
        # update lat and lon of current index
        lat_poi.fill(row[LATITUDE])
        lon_poi.fill(row[LONGITUDE])

        # First iteration is minimum distances
        if idx == 0:
            minimum_distances = np.array(
                haversine(
                    lat_poi, lon_poi, data[LATITUDE].values, data[LONGITUDE].values
                )
            )
            ids_pois.fill(row[label_id])
            tag_pois.fill(row[label_poi_name])
        else:
            # compute dist between the current POI and ALL trajectory points
            current_distances = np.float64(
                haversine(
                    lat_poi, lon_poi, data[LATITUDE].values, data[LONGITUDE].values
                )
            )
            compare = current_distances < minimum_distances
            minimum_distances = np.minimum(
                current_distances, minimum_distances, dtype=np.float64
            )
            ids_pois[compare] = row[label_id]
            tag_pois[compare] = row[label_poi_name]

    data[ID_POI] = ids_pois
    data[DIST_POI] = minimum_distances
    data[NAME_POI] = tag_pois
    logger.debug('Integration with POI was finalized')

    if not inplace:
        return data


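# The loop above keeps a running minimum: for each POI it computes the
# haversine distance to every trajectory point at once, then overwrites only
# the entries that became closer. A standalone sketch of that pattern with
# hypothetical arrays (``haversine`` is the same function imported above):
#
# >>> import numpy as np
# >>> from pymove.utils.distances import haversine
# >>> traj_lat = np.array([39.984094, 40.002899])
# >>> traj_lon = np.array([116.319236, 116.321520])
# >>> best = np.full(traj_lat.shape[0], np.inf)
# >>> best_id = np.full(traj_lat.shape[0], '', dtype=object)
# >>> for poi_id, plat, plon in [(1, 39.984094, 116.319236), (3, 40.010000, 116.312615)]:
# ...     d = haversine(
# ...         np.full_like(traj_lat, plat), np.full_like(traj_lon, plon),
# ...         traj_lat, traj_lon
# ...     )
# ...     closer = d < best
# ...     best = np.minimum(d, best)
# ...     best_id[closer] = poi_id

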
def join_with_pois_by_category(
    data: DataFrame,
    df_pois: DataFrame,
    label_category: str = TYPE_POI,
    label_id: str = TRAJ_ID,
    inplace: bool = False
):
    """
    Performs the integration between trajectories and each type of points of interest.

    Generates new columns referring to the id of, and the distance to, the
    nearest point of interest of each category at each point of the
    trajectory.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_pois : DataFrame
        The input point of interest data.
    label_category : str, optional
        Label of df_pois referring to the point of interest category,
        by default TYPE_POI
    label_id : str, optional
        Label of df_pois referring to the point of interest id, by default TRAJ_ID
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Examples
    --------
    >>> from pymove.utils.integration import join_with_pois_by_category
    >>> move_df
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    1  39.984559   116.326696  2008-10-23 10:37:26   1
    2  40.002899   116.321520  2008-10-23 10:50:16   1
    3  40.016238   116.307691  2008-10-23 11:03:06   1
    4  40.013814   116.306525  2008-10-23 11:58:33   2
    5  40.009735   116.315069  2008-10-23 23:50:45   2
    >>> pois
             lat          lon  id  type_poi             name_poi
    0  39.984094   116.319236   1   policia       distrito_pol_1
    1  39.991013   116.326384   2   policia      policia_federal
    2  40.010000   116.312615   3  comercio  supermercado_aroldo
    >>> join_with_pois_by_category(move_df, pois)
             lat          lon             datetime  id \
      id_policia  dist_policia  id_comercio  dist_comercio
    0  39.984094   116.319236  2008-10-23 05:53:05   1 \
               1      0.000000            3    2935.310277
    1  39.984559   116.326696  2008-10-23 10:37:26   1 \
               1    637.690216            3    3072.696379
    2  40.002899   116.321520  2008-10-23 10:50:16   1 \
               2   1385.087181            3    1094.860663
    3  40.016238   116.307691  2008-10-23 11:03:06   1 \
               2   3225.288831            3     810.542998
    4  40.013814   116.306525  2008-10-23 11:58:33   2 \
               2   3047.838222            3     669.973155
    5  40.009735   116.315069  2008-10-23 23:50:45   2 \
               2   2294.075820            3     211.069129
    """
    if not inplace:
        data = data.copy()
        df_pois = df_pois.copy()

    logger.debug('Integration with POIs...')

    # reset indexes of both dataframes before the join
    data.reset_index(drop=True, inplace=True)
    df_pois.reset_index(drop=True, inplace=True)

    # create numpy array to store new column to DataFrame of movement objects
    current_distances = np.full(
        data.shape[0], np.inf, dtype=np.float64
    )
    ids_pois = np.full(data.shape[0], np.nan, dtype='object_')

    unique_categories = df_pois[label_category].unique()
    size_categories = len(unique_categories)
    logger.debug('There are %s categories' % size_categories)

    for i, c in enumerate(unique_categories, start=1):
        # creating lat and lon array to operation
        df_category = df_pois[df_pois[label_category] == c]
        df_category.reset_index(drop=True, inplace=True)
        desc = f'computing dist to {c} category ({i}/{size_categories})'
        for idx, row in progress_bar(data.iterrows(), total=len(data), desc=desc):
            lat_user = np.full(
                df_category.shape[0], row[LATITUDE], dtype=np.float64
            )
            lon_user = np.full(
                df_category.shape[0], row[LONGITUDE], dtype=np.float64
            )

            # computing distances to the POIs of the current category
            distances = haversine(
                lat_user,
                lon_user,
                df_category[LATITUDE].values,
                df_category[LONGITUDE].values,
            )

            # get index to arg_min and min distance
            index_min = np.argmin(distances)

            # setting data for a single object movement
            current_distances[idx] = np.min(distances)
            ids_pois[idx] = df_category.at[index_min, label_id]

        data['id_%s' % c] = ids_pois
        data['dist_%s' % c] = current_distances
    logger.debug('Integration with POI was finalized')

    if not inplace:
        return data


def join_with_events(
    data: DataFrame,
    df_events: DataFrame,
    label_date: str = DATETIME,
    time_window: int = 900,
    label_event_id: str = EVENT_ID,
    label_event_type: str = EVENT_TYPE,
    inplace: bool = False
):
    """
    Performs the integration between trajectories and the closest event in a time window.

    Generates new columns referring to the type of the event and the distance
    to the nearest event, based on the time of each point of the trajectories.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_events : DataFrame
        The input events points of interest data.
    label_date : str, optional
        Label of data referring to the datetime of the input trajectory data,
        by default DATETIME
    time_window : float, optional
        tolerable length of time range in `seconds` for assigning the event's
        point of interest to the trajectory point, by default 900
    label_event_id : str, optional
        Label of df_events referring to the id of the event, by default EVENT_ID
    label_event_type : str, optional
        Label of df_events referring to the type of the event, by default EVENT_TYPE
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Examples
    --------
    >>> from pymove.utils.integration import join_with_events
    >>> move_df
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    1  39.984559   116.326696  2008-10-23 10:37:26   1
    2  39.993527   116.326483  2008-10-24 00:02:14   2
    3  39.978575   116.326975  2008-10-24 00:22:01   3
    4  39.981668   116.310769  2008-10-24 01:57:57   3
    >>> events
             lat          lon  id             datetime  event_type           event_id
    0  39.984094   116.319236   1  2008-10-23 05:53:05        show   forro_tropykalia
    1  39.991013   116.326384   2  2008-10-23 10:37:26        show   dia_do_municipio
    2  40.010000   116.312615   3  2008-10-24 01:57:57       feira  adocao_de_animais
    >>> join_with_events(move_df, events)
             lat          lon             datetime  id \
      event_type   dist_event           event_id
    0  39.984094   116.319236  2008-10-23 05:53:05   1 \
            show     0.000000   forro_tropykalia
    1  39.984559   116.326696  2008-10-23 10:37:26   1 \
            show   718.144152   dia_do_municipio
    2  39.993527   116.326483  2008-10-24 00:02:14   2 \
                          inf
    3  39.978575   116.326975  2008-10-24 00:22:01   3 \
                          inf
    4  39.981668   116.310769  2008-10-24 01:57:57   3 \
           feira  3154.296880  adocao_de_animais

    Raises
    ------
    ValueError
        If feature generation fails
    """
    if not inplace:
        data = data.copy()
        df_events = df_events.copy()

    values = _reset_set_window__and_creates_event_id_type(
        data, df_events, time_window, label_date
    )
    *_, current_distances, event_id, event_type = values
    window_starts, window_ends, *_ = _reset_set_window__and_creates_event_id_type(
        df_events, data, time_window, label_date
    )
    minimum_distances = np.full(
        data.shape[0], np.inf, dtype=np.float64
    )
    for idx, row in progress_bar(
        df_events.iterrows(), total=len(df_events), desc='Integration with Events'
    ):
        df_filtered = filters.by_datetime(
            data, window_starts[idx], window_ends[idx]
        )
        if df_filtered is None:
            raise ValueError('Filtering datetime failed!')

        size_filter = df_filtered.shape[0]

        if size_filter > 0:
            indexes = df_filtered.index
            lat_event = np.full(
                df_filtered.shape[0], row[LATITUDE], dtype=np.float64
            )
            lon_event = np.full(
                df_filtered.shape[0], row[LONGITUDE], dtype=np.float64
            )

            # First iteration is minimum distances
            if idx == 0:
                minimum_distances[indexes] = haversine(
                    lat_event,
                    lon_event,
                    df_filtered[LATITUDE].values,
                    df_filtered[LONGITUDE].values,
                )
                event_id[indexes] = row[label_event_id]
                event_type[indexes] = row[label_event_type]
            else:
                current_distances[indexes] = haversine(
                    lat_event,
                    lon_event,
                    df_filtered[LATITUDE].values,
                    df_filtered[LONGITUDE].values,
                )
                compare = current_distances < minimum_distances
                minimum_distances = np.minimum(
                    current_distances, minimum_distances
                )
                event_id[compare] = row[label_event_id]
                event_type[compare] = row[label_event_type]

    data[label_event_id] = event_id
    data[DIST_EVENT] = minimum_distances
    data[label_event_type] = event_type
    logger.debug('Integration with events was completed')

    if not inplace:
        return data


def join_with_event_by_dist_and_time(
    data: DataFrame,
    df_events: DataFrame,
    label_date: str = DATETIME,
    label_event_id: str = EVENT_ID,
    label_event_type: str = EVENT_TYPE,
    time_window: float = 3600,
    radius: float = 1000,
    inplace: bool = False
):
    """
    Performs the integration between trajectories and events on windows.

    Generates new columns referring to the types of, the distances to, and the
    ids of all events near each trajectory point, based on a distance radius
    and on a time window around each point of the trajectories.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_events : DataFrame
        The input events points of interest data.
    label_date : str, optional
        Label of data referring to the datetime of the input trajectory data,
        by default DATETIME
    label_event_id : str, optional
        Label of df_events referring to the id of the event, by default EVENT_ID
    label_event_type : str, optional
        Label of df_events referring to the type of the event, by default EVENT_TYPE
    time_window : float, optional
        tolerable length of time range in `seconds` for assigning the event's
        point of interest to the trajectory point, by default 3600
    radius : float, optional
        maximum radius of pois in `meters`, by default 1000
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Examples
    --------
    >>> from pymove.utils.integration import join_with_event_by_dist_and_time
    >>> move_df
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    1  39.984559   116.326696  2008-10-23 10:37:26   1
    2  39.993527   116.326483  2008-10-24 00:02:14   2
    3  39.978575   116.326975  2008-10-24 00:22:01   3
    4  39.981668   116.310769  2008-10-24 01:57:57   3
    >>> events
             lat          lon  id             datetime  type_poi           name_poi
    0  39.984094   116.319236   1  2008-10-23 05:53:05      show   forro_tropykalia
    1  39.991013   116.326384   2  2008-10-23 10:27:26   corrida   racha_de_jumento
    2  39.990013   116.316384   2  2008-10-23 10:37:26      show   dia_do_municipio
    3  40.010000   116.312615   3  2008-10-24 01:57:57     feira  adocao_de_animais
    >>> join_with_event_by_dist_and_time(move_df, events)
    >>> move_df
             lat          lon             datetime  id \
            type_poi           dist_event                              name_poi
    0  39.984094   116.319236  2008-10-23 05:53:05   1 \
              [show]                [0.0]                    [forro_tropykalia]
    1  39.984559   116.326696  2008-10-23 10:37:26   1 \
     [corrida, show]   [718.144, 1067.53]  [racha_de_jumento, dia_do_municipio]
    2  39.993527   116.326483  2008-10-24 00:02:14   2 \
                None                 None                                  None
    3  39.978575   116.326975  2008-10-24 00:22:01   3 \
                None                 None                                  None
    4  39.981668   116.310769  2008-10-24 01:57:57   3 \
                None                 None                                  None

    Raises
    ------
    KeyError
        If df_events does not contain the label_date column
    ValueError
        If feature generation fails
    """
    if label_date not in df_events:
        raise KeyError("POI's DataFrame must contain a %s column" % label_date)

    if not inplace:
        data = data.copy()
        df_events = df_events.copy()

    values = _reset_set_window_and_creates_event_id_type_all(
        data, df_events, time_window, label_date
    )
    window_start, window_end, current_distances, event_id, event_type = values

    for idx, row in progress_bar(
        data.iterrows(), total=len(data), desc='Integration with Events'
    ):
        # set min and max of coordinates by radius
        bbox = filters.get_bbox_by_radius(
            (row[LATITUDE], row[LONGITUDE]), radius
        )

        # filter event by radius
        df_filtered = filters.by_bbox(
            df_events, bbox, inplace=False
        )
        if df_filtered is None:
            raise ValueError('Filtering bbox failed')

        # filter event by datetime
        filters.by_datetime(
            df_filtered,
            start_datetime=window_start[idx],
            end_datetime=window_end[idx],
            inplace=True
        )

        # get df_filtered size
        size_filter = df_filtered.shape[0]

        if size_filter > 0:
            # resetting index of data frame
            df_filtered.reset_index(drop=True, inplace=True)

            # create lat and lon array to operation
            lat_user = np.full(
                size_filter, row[LATITUDE], dtype=np.float64
            )
            lon_user = np.full(
                size_filter, row[LONGITUDE], dtype=np.float64
            )

            # calculate distances between points
            distances = haversine(
                lat_user,
                lon_user,
                df_filtered[LATITUDE].to_numpy(),
                df_filtered[LONGITUDE].to_numpy()
            )

            current_distances[idx] = distances
            event_type[idx] = df_filtered[label_event_type].to_numpy(dtype=np.ndarray)
            event_id[idx] = df_filtered[label_event_id].to_numpy(dtype=np.ndarray)

    data[label_event_id] = event_id
    data[DIST_EVENT] = current_distances
    data[label_event_type] = event_type
    logger.debug('Integration with event was completed')

    if not inplace:
        return data


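# Hedged sketch of the per-point spatial pre-filter used above:
# ``filters.get_bbox_by_radius`` builds a bounding box around a coordinate and
# ``filters.by_bbox`` keeps only the events inside it (both calls mirror the
# usage in the loop above). ``events`` is a hypothetical DataFrame with
# 'lat'/'lon' columns.
#
# >>> from pymove.preprocessing import filters
# >>> bbox = filters.get_bbox_by_radius((39.984094, 116.319236), 1000)
# >>> nearby = filters.by_bbox(events, bbox, inplace=False)

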
def join_with_home_by_id(
    data: DataFrame,
    df_home: DataFrame,
    label_id: str = TRAJ_ID,
    label_address: str = ADDRESS,
    label_city: str = CITY,
    drop_id_without_home: bool = False,
    inplace: bool = False
):
    """
    Performs the integration between trajectories and home points.

    Generating new columns referring to the distance of the nearest home point,
    address and city of each trajectory point.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_home : DataFrame
        The input home points data.
    label_id : str, optional
        Label of df_home referring to the home point id, by default TRAJ_ID
    label_address : str, optional
        Label of df_home referring to the home point address, by default ADDRESS
    label_city : str, optional
        Label of df_home referring to the point city, by default CITY
    drop_id_without_home : bool, optional
        flag as an option to drop id's that don't have houses, by default False
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Examples
    --------
    >>> from pymove.utils.integration import join_with_home_by_id
    >>> move_df
             lat          lon             datetime  id
    0  39.984094   116.319236  2008-10-23 05:53:05   1
    1  39.984559   116.326696  2008-10-23 10:37:26   1
    2  40.002899   116.321520  2008-10-23 10:50:16   1
    3  40.016238   116.307691  2008-10-23 11:03:06   1
    4  40.013814   116.306525  2008-10-23 11:58:33   2
    5  40.009735   116.315069  2008-10-23 23:50:45   2
    >>> home_df
             lat          lon  id  formatted_address           city
    0  39.984094   116.319236   1         rua da mae      quixiling
    1  40.013821   116.306531   2     rua da familia  quixeramoling
    >>> join_with_home_by_id(move_df, home_df)
    >>> move_df
       id        lat          lon             datetime    dist_home \
                home           city
    0   1  39.984094   116.319236  2008-10-23 05:53:05     0.000000 \
          rua da mae      quixiling
    1   1  39.984559   116.326696  2008-10-23 10:37:26   637.690216 \
          rua da mae      quixiling
    2   1  40.002899   116.321520  2008-10-23 10:50:16  2100.053501 \
          rua da mae      quixiling
    3   1  40.016238   116.307691  2008-10-23 11:03:06  3707.066732 \
          rua da mae      quixiling
    4   2  40.013814   116.306525  2008-10-23 11:58:33     0.931101 \
      rua da familia  quixeramoling
    5   2  40.009735   116.315069  2008-10-23 23:50:45   857.417540 \
      rua da familia  quixeramoling
    """
    if not inplace:
        data = data.copy()
        df_home = df_home.copy()

    ids_without_home = []

    if data.index.name is None:
        logger.debug(f'...setting {label_id} as index')
        data.set_index(label_id, inplace=True)

    for idx in progress_bar(
        data.index.unique(),
        total=len(data.index.unique()),
        desc='Integration with Home'
    ):
        filter_home = df_home[label_id] == idx

        if df_home[filter_home].shape[0] == 0:
            logger.debug(f'...id: {idx} has not HOME')
            ids_without_home.append(idx)
        else:
            home = df_home[filter_home].iloc[0]
            lat_user = data.at[idx, LATITUDE].values
            lon_user = data.at[idx, LONGITUDE].values

            # if user has a single tuple
            if not isinstance(lat_user, np.ndarray):
                lat_home = home[LATITUDE].values
                lon_home = home[LONGITUDE].values
                data.at[idx, DIST_HOME] = haversine(
                    lat_user, lon_user, lat_home, lon_home
                )
                data.at[idx, HOME] = home[label_address]
                data.at[idx, label_city] = home[label_city]
            else:
                lat_home = np.full(
                    data.loc[idx].shape[0], home[LATITUDE], dtype=np.float64
                )
                lon_home = np.full(
                    data.loc[idx].shape[0], home[LONGITUDE], dtype=np.float64
                )
                data.at[idx, DIST_HOME] = haversine(
                    lat_user, lon_user, lat_home, lon_home
                )
                data.at[idx, HOME] = np.array(home[label_address])
                data.at[idx, label_city] = np.array(home[label_city])

    data.reset_index(inplace=True)
    logger.debug('... Resetting index')

    if drop_id_without_home:
        data.drop(data.loc[data[TRAJ_ID].isin(ids_without_home)].index, inplace=True)

    if not inplace:
        return data


def merge_home_with_poi(
    data: DataFrame,
    label_dist_poi: str = DIST_POI,
    label_name_poi: str = NAME_POI,
    label_id_poi: str = ID_POI,
    label_home: str = HOME,
    label_dist_home: str = DIST_HOME,
    drop_columns: bool = True,
    inplace: bool = False
):
    """
    Merges the home points and the points of interest of the trajectories.

    Considers the home points as other points of interest, generating a new
    DataFrame.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data, with join_with_pois and join_with_home_by_id
        applied.
    label_dist_poi : str, optional
        Label of data referring to the distance from the nearest point of
        interest, by default DIST_POI
    label_name_poi : str, optional
        Label of data referring to the name from the nearest point of
        interest, by default NAME_POI
    label_id_poi : str, optional
        Label of data referring to the id from the nearest point of
        interest, by default ID_POI
    label_home : str, optional
        Label of df_home referring to the home point, by default HOME
    label_dist_home : str, optional
        Label of df_home referring to the distance to the home point,
        by default DIST_HOME
    drop_columns : bool, optional
        Flag that controls the deletion of the columns referring to the
        id and the distance from the home point, by default True
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Examples
    --------
    >>> from pymove.utils.integration import (
    ...    merge_home_with_poi,
    ...    join_with_home_by_id
    ... )
    >>> move_df
             lat          lon             datetime  id \
      id_poi     dist_poi         name_poi
    0  39.984094   116.319236  2008-10-23 05:53:05   1 \
           1     0.000000   distrito_pol_1
    1  39.984559   116.326696  2008-10-23 10:37:26   1 \
           1   637.690216   distrito_pol_1
    2  40.002899   116.321520  2008-10-23 10:50:16   1 \
           2  1385.087181  policia_federal
    3  40.016238   116.307691  2008-10-23 11:03:06   1 \
           2  3225.288831  policia_federal
    4  40.013814   116.306525  2008-10-23 11:58:33   2 \
           2  3047.838222  policia_federal
    5  40.009735   116.315069  2008-10-23 23:50:45   2 \
           2  2294.075820  policia_federal
    >>> home_df
             lat          lon  id  formatted_address           city
    0  39.984094   116.319236   1         rua da mae      quixiling
    1  40.013821   116.306531   2     rua da familia  quixeramoling
    >>> join_with_home_by_id(move_df, home_df, inplace=True)
    >>> move_df
       id        lat          lon             datetime  id_poi     dist_poi \
             name_poi    dist_home            home           city
    0   1  39.984094   116.319236  2008-10-23 05:53:05       1     0.000000 \
       distrito_pol_1     0.000000      rua da mae      quixiling
    1   1  39.984559   116.326696  2008-10-23 10:37:26       1   637.690216 \
       distrito_pol_1   637.690216      rua da mae      quixiling
    2   1  40.002899   116.321520  2008-10-23 10:50:16       2  1385.087181 \
      policia_federal  2100.053501      rua da mae      quixiling
    3   1  40.016238   116.307691  2008-10-23 11:03:06       2  3225.288831 \
      policia_federal  3707.066732      rua da mae      quixiling
    4   2  40.013814   116.306525  2008-10-23 11:58:33       2  3047.838222 \
      policia_federal     0.931101  rua da familia  quixeramoling
    5   2  40.009735   116.315069  2008-10-23 23:50:45       2  2294.075820 \
      policia_federal   857.417540  rua da familia  quixeramoling
    >>> merge_home_with_poi(move_df)
       id        lat          lon             datetime          id_poi \
         dist_poi         name_poi           city
    0   1  39.984094   116.319236  2008-10-23 05:53:05      rua da mae \
         0.000000             home      quixiling
    1   1  39.984559   116.326696  2008-10-23 10:37:26      rua da mae \
       637.690216             home      quixiling
    2   1  40.002899   116.321520  2008-10-23 10:50:16               2 \
      1385.087181  policia_federal      quixiling
    3   1  40.016238   116.307691  2008-10-23 11:03:06               2 \
      3225.288831  policia_federal      quixiling
    4   2  40.013814   116.306525  2008-10-23 11:58:33  rua da familia \
         0.931101             home  quixeramoling
    5   2  40.009735   116.315069  2008-10-23 23:50:45  rua da familia \
       857.417540             home  quixeramoling
    """
    if not inplace:
        data = data.copy()

    logger.debug('merge home with POI using shortest distance')
    idx = data[data[label_dist_home] <= data[label_dist_poi]].index

    data.loc[idx, label_name_poi] = label_home
    data.loc[idx, label_dist_poi] = data.loc[idx, label_dist_home]
    data.loc[idx, label_id_poi] = data.loc[idx, label_home]

    if drop_columns:
        data.drop(columns=[label_dist_home, label_home], inplace=True)

    if not inplace:
        return data