Module gtfs_segments.mobility

Expand source code
import os
import re
from .utils import *
import pandas as pd
import numpy as np

MOBILITY_SOURCES_link = "https://bit.ly/catalogs-csv"
ABBREV_link = 'https://raw.githubusercontent.com/UTEL-UIUC/gtfs_segments/main/state_abbreviations.json'

def fetch_gtfs_source(place='ALL'):
    """
    Read the Mobility Database sources csv and return the US GTFS (static) sources.

    Each source is given a unique, filesystem-safe name of the form
    ``<municipality-or-state>-<provider>[-<name>]-<state code>`` (the feed ``name``
    is only included when the municipality/provider pair is ambiguous), stored in
    the ``provider`` column of the returned dataframe.

    Args:
      place: Free-text filter (e.g. a city, state, or provider substring) matched
        case-insensitively against every column. Defaults to 'ALL' (no filtering).

    Returns:
      A dataframe of matching sources, or the string
      "No sources found for the given place" when the filter matches nothing.
    """
    abb_df = pd.read_json(ABBREV_link)
    sources_df = pd.read_csv(MOBILITY_SOURCES_link)
    # Keep only US static-GTFS feeds and attach the two-letter state code.
    sources_df = sources_df[sources_df['location.country_code'] == 'US']
    sources_df = sources_df[sources_df['data_type'] == 'gtfs']
    sources_df = pd.merge(sources_df, abb_df, how='left',
                          left_on='location.subdivision_name', right_on='state')
    sources_df = sources_df[~sources_df.state_code.isna()]
    # astype("str") turns missing municipalities into the literal string 'nan',
    # which is what the name-building loop below checks for.
    sources_df['location.municipality'] = sources_df['location.municipality'].astype("str")
    sources_df.drop(['entity_type', 'mdb_source_id', 'data_type', 'location.country_code', 'note',
                     'static_reference', 'urls.direct_download', 'urls.authentication_type',
                     'urls.license', 'location.bounding_box.extracted_on',
                     'urls.authentication_info', 'urls.api_key_parameter_name', 'features'],
                    axis=1, inplace=True)
    file_names = []
    for _, row in sources_df.iterrows():
        # Prefer the municipality for the name; fall back to the state name.
        region_col = ('location.municipality' if row['location.municipality'] != 'nan'
                      else 'location.subdivision_name')
        # Only append the feed `name` when region+provider alone is ambiguous.
        duplicates = sources_df[(sources_df[region_col] == row[region_col]) &
                                (sources_df['provider'] == row['provider'])]
        parts = [str(row[region_col]), str(row['provider'])]
        if len(duplicates) > 1:
            parts.append(str(row['name']))
        parts.append(str(row['state_code']))
        # Strip '/' so the name is safe to use as a file name.
        file_names.append('-'.join(parts).replace('/', '').strip())
    sources_df.drop(['provider', 'location.municipality', 'location.subdivision_name',
                     'name', 'state_code', 'state'], axis=1, inplace=True)
    sources_df.insert(0, 'provider', file_names)
    # Literal prefix strip: with regex=True the dots would match any character.
    sources_df.columns = sources_df.columns.str.replace('location.bounding_box.', "", regex=False)
    if place == 'ALL':
        return sources_df
    # Substring (not regex) match so user input like "St. Louis" is taken literally;
    # case=False already makes the match case-insensitive.
    mask = sources_df.apply(
        lambda row: row.astype(str).str.contains(place, case=False, regex=False).any(), axis=1)
    sources_df = sources_df[mask]
    if len(sources_df) == 0:
        return "No sources found for the given place"
    return sources_df


def summary_stats_mobility(df, folder_path, filename, b_day, link, bounds, max_spacing=3000, export=False):
    """
    Compute summary statistics of stop spacings for a feed.

    Segments with ``distance > max_spacing`` are excluded from all statistics
    except the reported percentage of traversals they account for. Three
    weightings are reported: segment-weighted (each unique segment once),
    route-weighted (each route/segment pair once), and traversal-weighted
    (each segment repeated by its traversal count).

    Args:
      df: Dataframe with 'segment_id', 'route_id', 'distance' and 'traversals' columns.
      folder_path: Folder in which to save summary.csv (only used when export=True).
      filename: Feed name recorded in the summary.
      b_day: The busiest day of the feed, recorded in the summary.
      link: Source link of the feed, recorded in the summary.
      bounds: Bounding box as [[min_lon, min_lat], [max_lon, max_lat]].
      max_spacing: Spacing cutoff (same units as 'distance'). Defaults to 3000.
      export: If True, write summary.csv to folder_path and return a confirmation
        string; if False, return the summary as a transposed dataframe. Defaults to False.

    Returns:
      A one-column dataframe of summary statistics, or a confirmation string
      when export=True.
    """
    percent_spacing = round(
        df.loc[df["distance"] > max_spacing, 'traversals'].sum() / df['traversals'].sum() * 100, 3)
    df = df[df["distance"] <= max_spacing]
    # Dedupe on (segment, distance) so each physical segment counts once.
    stop_weighted_mean = df.groupby(['segment_id', 'distance']).first().reset_index()["distance"].mean()
    # Dedupe on (route, segment, distance) so each segment counts once per route.
    route_weighted_mean = df.groupby(['route_id', 'segment_id', 'distance']).first().reset_index()["distance"].mean()
    # Expand each distance by its traversal count for traversal-weighted stats.
    weighted_data = np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)])
    df_dict = {"Name": filename,
               'Busiest Day': b_day,
               'Link': link,
               'Min Latitude': bounds[0][1],
               'Min Longitude': bounds[0][0],
               'Max Latitude': bounds[1][1],
               'Max Longitude': bounds[1][0],
               'Segment Weighted Mean': stop_weighted_mean,
               'Route Weighted Mean': route_weighted_mean,
               'Traversal Weighted Mean': round(np.mean(weighted_data), 3),
               'Traversal Weighted Std': round(np.std(weighted_data), 3),
               'Traversal Weighted 25 % Quantile': round(np.quantile(weighted_data, 0.25), 3),
               'Traversal Weighted 50 % Quantile': round(np.quantile(weighted_data, 0.5), 3),
               'Traversal Weighted 75 % Quantile': round(np.quantile(weighted_data, 0.75), 3),
               'No of Segments': len(df),
               'No of Routes': len(df.route_id.unique()),
               'No of Traversals': int(df['traversals'].sum()),
               'Max Spacing': max_spacing,
               '% Segments w/ spacing > max_spacing': percent_spacing}
    summary_df = pd.DataFrame([df_dict])
    if export:
        # Only touch the filesystem when actually exporting.
        csv_path = os.path.join(folder_path, 'summary.csv')
        summary_df.to_csv(csv_path, index=False)
        return "Saved the summary.csv in " + folder_path
    return summary_df.T
   
def download_latest_data(out_folder_path, sources_df):
    """
    Download every source's latest GTFS file, best-effort.

    Iterates over the rows of `sources_df`, downloading the file from the URL in
    the `urls.latest` column into the folder named by the `provider` column.
    Individual failures are reported and skipped so one bad feed does not abort
    the whole batch.

    Args:
      out_folder_path: The folder where the downloaded data is saved.
      sources_df: Dataframe with 'urls.latest' and 'provider' columns.
    """
    for _, row in sources_df.iterrows():
        try:
            download_write_file(row['urls.latest'], os.path.join(out_folder_path, row['provider']))
        # Narrow except (not bare) so KeyboardInterrupt/SystemExit still propagate;
        # report the failure instead of silently swallowing it.
        except Exception as exc:
            print("Failed to download", row['provider'], ":", exc)
            continue
    print("Downloaded the latest data")
    

Functions

def download_latest_data(out_folder_path, sources_df)

It iterates over the rows of the dataframe, and for each row, it tries to download the file from the URL in the urls.latest column, and write it to the folder specified in the provider column

Args

out_folder_path
The path to the folder where you want to save the data
sources_df
This is the dataframe that contains the urls for the data.
Expand source code
def download_latest_data(out_folder_path, sources_df):
    """
    Download every source's latest GTFS file, best-effort.

    Iterates over the rows of `sources_df`, downloading the file from the URL in
    the `urls.latest` column into the folder named by the `provider` column.
    Individual failures are reported and skipped so one bad feed does not abort
    the whole batch.

    Args:
      out_folder_path: The folder where the downloaded data is saved.
      sources_df: Dataframe with 'urls.latest' and 'provider' columns.
    """
    for _, row in sources_df.iterrows():
        try:
            download_write_file(row['urls.latest'], os.path.join(out_folder_path, row['provider']))
        # Narrow except (not bare) so KeyboardInterrupt/SystemExit still propagate;
        # report the failure instead of silently swallowing it.
        except Exception as exc:
            print("Failed to download", row['provider'], ":", exc)
            continue
    print("Downloaded the latest data")
def fetch_gtfs_source(place='ALL')

It reads the mobility data sources csv file and generates a dataframe with the sources that are of type gtfs and are from the US

Args

place
The place you want to get the GTFS data for. This can be a city, state, or country.

Defaults to ALL

Returns

A dataframe with sources

Expand source code
def fetch_gtfs_source(place='ALL'):
    """
    Read the Mobility Database sources csv and return the US GTFS (static) sources.

    Each source is given a unique, filesystem-safe name of the form
    ``<municipality-or-state>-<provider>[-<name>]-<state code>`` (the feed ``name``
    is only included when the municipality/provider pair is ambiguous), stored in
    the ``provider`` column of the returned dataframe.

    Args:
      place: Free-text filter (e.g. a city, state, or provider substring) matched
        case-insensitively against every column. Defaults to 'ALL' (no filtering).

    Returns:
      A dataframe of matching sources, or the string
      "No sources found for the given place" when the filter matches nothing.
    """
    abb_df = pd.read_json(ABBREV_link)
    sources_df = pd.read_csv(MOBILITY_SOURCES_link)
    # Keep only US static-GTFS feeds and attach the two-letter state code.
    sources_df = sources_df[sources_df['location.country_code'] == 'US']
    sources_df = sources_df[sources_df['data_type'] == 'gtfs']
    sources_df = pd.merge(sources_df, abb_df, how='left',
                          left_on='location.subdivision_name', right_on='state')
    sources_df = sources_df[~sources_df.state_code.isna()]
    # astype("str") turns missing municipalities into the literal string 'nan',
    # which is what the name-building loop below checks for.
    sources_df['location.municipality'] = sources_df['location.municipality'].astype("str")
    sources_df.drop(['entity_type', 'mdb_source_id', 'data_type', 'location.country_code', 'note',
                     'static_reference', 'urls.direct_download', 'urls.authentication_type',
                     'urls.license', 'location.bounding_box.extracted_on',
                     'urls.authentication_info', 'urls.api_key_parameter_name', 'features'],
                    axis=1, inplace=True)
    file_names = []
    for _, row in sources_df.iterrows():
        # Prefer the municipality for the name; fall back to the state name.
        region_col = ('location.municipality' if row['location.municipality'] != 'nan'
                      else 'location.subdivision_name')
        # Only append the feed `name` when region+provider alone is ambiguous.
        duplicates = sources_df[(sources_df[region_col] == row[region_col]) &
                                (sources_df['provider'] == row['provider'])]
        parts = [str(row[region_col]), str(row['provider'])]
        if len(duplicates) > 1:
            parts.append(str(row['name']))
        parts.append(str(row['state_code']))
        # Strip '/' so the name is safe to use as a file name.
        file_names.append('-'.join(parts).replace('/', '').strip())
    sources_df.drop(['provider', 'location.municipality', 'location.subdivision_name',
                     'name', 'state_code', 'state'], axis=1, inplace=True)
    sources_df.insert(0, 'provider', file_names)
    # Literal prefix strip: with regex=True the dots would match any character.
    sources_df.columns = sources_df.columns.str.replace('location.bounding_box.', "", regex=False)
    if place == 'ALL':
        return sources_df
    # Substring (not regex) match so user input like "St. Louis" is taken literally;
    # case=False already makes the match case-insensitive.
    mask = sources_df.apply(
        lambda row: row.astype(str).str.contains(place, case=False, regex=False).any(), axis=1)
    sources_df = sources_df[mask]
    if len(sources_df) == 0:
        return "No sources found for the given place"
    return sources_df
def summary_stats_mobility(df, folder_path, filename, b_day, link, bounds, max_spacing=3000, export=False)

It takes in a dataframe, a folder path, a filename, a busiest day, a link, a bounding box, a max spacing, and a boolean for exporting the summary to a csv.

It then calculates the percentage of segments that have a spacing greater than the max spacing. It then filters the dataframe to only include segments with a spacing less than the max spacing. It then calculates the segment weighted mean, route weighted mean, traversal weighted mean, traversal weighted standard deviation, traversal weighted 25th percentile, traversal weighted 50th percentile, traversal weighted 75th percentile, number of segments, number of routes, number of traversals, and the max spacing. It then creates a dictionary with all of the above values and creates a dataframe from the dictionary. It then exports the dataframe to a csv if the export boolean is true. If the export boolean is false, it transposes the dataframe and returns it.

Args

df
the dataframe containing the mobility data
folder_path
The path to the folder where you want to save the summary.csv file.
filename
The name of the file you want to save the data as.
b_day
The busiest day of the week
link
The link of the map you want to use.
bounds
The bounding box of the area you want to analyze.
max_spacing
The maximum distance between two stops that you want to consider. Defaults to 3000
export
If True, the summary will be saved as a csv file in the folder_path. If False, the summary will be returned as a dataframe. Defaults to False

Returns

A dataframe with the summary statistics of the mobility data.

Expand source code
def summary_stats_mobility(df, folder_path, filename, b_day, link, bounds, max_spacing=3000, export=False):
    """
    Compute summary statistics of stop spacings for a feed.

    Segments with ``distance > max_spacing`` are excluded from all statistics
    except the reported percentage of traversals they account for. Three
    weightings are reported: segment-weighted (each unique segment once),
    route-weighted (each route/segment pair once), and traversal-weighted
    (each segment repeated by its traversal count).

    Args:
      df: Dataframe with 'segment_id', 'route_id', 'distance' and 'traversals' columns.
      folder_path: Folder in which to save summary.csv (only used when export=True).
      filename: Feed name recorded in the summary.
      b_day: The busiest day of the feed, recorded in the summary.
      link: Source link of the feed, recorded in the summary.
      bounds: Bounding box as [[min_lon, min_lat], [max_lon, max_lat]].
      max_spacing: Spacing cutoff (same units as 'distance'). Defaults to 3000.
      export: If True, write summary.csv to folder_path and return a confirmation
        string; if False, return the summary as a transposed dataframe. Defaults to False.

    Returns:
      A one-column dataframe of summary statistics, or a confirmation string
      when export=True.
    """
    percent_spacing = round(
        df.loc[df["distance"] > max_spacing, 'traversals'].sum() / df['traversals'].sum() * 100, 3)
    df = df[df["distance"] <= max_spacing]
    # Dedupe on (segment, distance) so each physical segment counts once.
    stop_weighted_mean = df.groupby(['segment_id', 'distance']).first().reset_index()["distance"].mean()
    # Dedupe on (route, segment, distance) so each segment counts once per route.
    route_weighted_mean = df.groupby(['route_id', 'segment_id', 'distance']).first().reset_index()["distance"].mean()
    # Expand each distance by its traversal count for traversal-weighted stats.
    weighted_data = np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)])
    df_dict = {"Name": filename,
               'Busiest Day': b_day,
               'Link': link,
               'Min Latitude': bounds[0][1],
               'Min Longitude': bounds[0][0],
               'Max Latitude': bounds[1][1],
               'Max Longitude': bounds[1][0],
               'Segment Weighted Mean': stop_weighted_mean,
               'Route Weighted Mean': route_weighted_mean,
               'Traversal Weighted Mean': round(np.mean(weighted_data), 3),
               'Traversal Weighted Std': round(np.std(weighted_data), 3),
               'Traversal Weighted 25 % Quantile': round(np.quantile(weighted_data, 0.25), 3),
               'Traversal Weighted 50 % Quantile': round(np.quantile(weighted_data, 0.5), 3),
               'Traversal Weighted 75 % Quantile': round(np.quantile(weighted_data, 0.75), 3),
               'No of Segments': len(df),
               'No of Routes': len(df.route_id.unique()),
               'No of Traversals': int(df['traversals'].sum()),
               'Max Spacing': max_spacing,
               '% Segments w/ spacing > max_spacing': percent_spacing}
    summary_df = pd.DataFrame([df_dict])
    if export:
        # Only touch the filesystem when actually exporting.
        csv_path = os.path.join(folder_path, 'summary.csv')
        summary_df.to_csv(csv_path, index=False)
        return "Saved the summary.csv in " + folder_path
    return summary_df.T