Module gtfs_segments.utils

Expand source code
import os
import shutil
import requests
import traceback
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from shapely.geometry import Point

## Plot style
plt.style.use('ggplot')

def plot_hist(df,save_fig = False,show_mean = False,**kwargs):
    """
    Plot a traversal-weighted histogram of stop spacings: each value in the `distance` column
    (spacing between consecutive stops, in meters) is weighted by the corresponding `traversals`
    column (number of trips traversing that segment).

    Args:
      df: The dataframe containing the `distance` and `traversals` columns
      save_fig: If True, the figure will be saved to `file_path`. Defaults to False
      show_mean: If True, a dashed line marks the mean of the distribution. Defaults to False
      **kwargs: Optional `max_spacing` (meters, default 3000), `ax` (existing matplotlib axes
    to draw on), `title` (plot title) and `file_path` (required when save_fig is True).
    
    Returns:
      A matplotlib axis
    """
    if "max_spacing" not in kwargs.keys():
        max_spacing = 3000
        print("Using max_spacing = 3000")
    else:
        max_spacing = kwargs['max_spacing']
    if "ax" in kwargs.keys():
        ax = kwargs['ax']
    else:
        fig, ax = plt.subplots(figsize=(8,6))
    df = df[df['distance'] < max_spacing]
    data = np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)])
    plt.hist(data,range=(0,max_spacing),density = True,bins = int(max_spacing/50),fc=(0, 105/255, 160/255, 0.4),ec = "white",lw =0.8)
    x = np.arange(0,max_spacing,5)
    plt.plot(x,gaussian_kde(data)(x),lw = 1.5,color=(0, 85/255, 120/255, 1))
    # sns.histplot(data,binwidth=50,stat = "density",kde=True,ax=ax)
    plt.xlim([0,max_spacing])
    plt.xlabel('Stop Spacing [m]')
    plt.ylabel('Density - Traversal Weighted')
    plt.title("Histogram of Spacing")
    if show_mean:
        plt.axvline(np.mean(data), color='k', linestyle='dashed', linewidth=2)
        min_ylim, max_ylim = plt.ylim()
        plt.text(np.mean(data)*1.1, max_ylim*0.9, 'Mean: {:.0f}'.format(np.mean(data)),fontsize=12)
    if "title" in kwargs.keys():
        plt.title(kwargs['title'])
    if save_fig == True:
        assert "file_path" in kwargs.keys(), "Please pass in the `file_path`"
        plt.savefig(kwargs['file_path'], dpi=300)
    plt.show()
    if "ax" not in kwargs.keys():
        plt.close(fig)  # only close figures created inside this function
    return ax

def summary_stats(df,export = False,**kwargs):
    """
    Compute summary statistics of stop spacings (segment-, route-, and traversal-weighted
    means, traversal-weighted quantiles, and counts) for a segments dataframe.

    Args:
      df: The segments dataframe to summarize.
      export: If True, the summary will be exported to a csv file. Defaults to False
      **kwargs: Optional `max_spacing` (meters, default 3000) and `file_path` (required when
    export is True).
    
    Returns:
      A dataframe with the summary statistics
    """
    if "max_spacing" not in kwargs.keys():
        max_spacing = 3000
        print("Using max_spacing = 3000")
    else:
        max_spacing = kwargs['max_spacing']
    # Share of total traversals occurring on segments longer than max_spacing
    percent_spacing = round(df[df["distance"] > max_spacing]['traversals'].sum()/df['traversals'].sum() *100,3)
    df = df[df["distance"] <= max_spacing]
    stop_weighted_mean = df.groupby(['segment_id','distance']).first().reset_index()["distance"].mean()
    route_weighted_mean = df.groupby(['route_id','segment_id','distance']).first().reset_index()["distance"].mean()
    weighted_data =  np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)])
    
    df_dict = {
            'Segment Weighted Mean' : stop_weighted_mean,
            'Route Weighted Mean' : route_weighted_mean,
            'Traversal Weighted Mean': round(np.mean(weighted_data),3),
            'Traversal Weighted Std': round(np.std(weighted_data),3),
            'Traversal Weighted 25 % Quantile': round(np.quantile(weighted_data,0.25),3),
            'Traversal Weighted 50 % Quantile': round(np.quantile(weighted_data,0.50),3),
            'Traversal Weighted 75 % Quantile': round(np.quantile(weighted_data,0.75),3),
            'No of Segments':int(len(df)),
            'No of Routes':int(len(df.route_id.unique())),
            'No of Traversals':int(sum(df.traversals)),  
            'Max Spacing':int(max_spacing),
            '% Segments w/ spacing > max_spacing':percent_spacing}
    summary_df = pd.DataFrame([df_dict])
    # df.set_index(summary_df.columns[0],inplace=True)
    if export:
        assert "file_path" in kwargs.keys(), "Please pass in the `file_path`"
        summary_df.to_csv(kwargs['file_path'],index = False)
        print("Saved the summary in "+kwargs['file_path'])
    summary_df = summary_df.T
    return summary_df 
        
def export_segments(df,file_path,output_format, geometry = True):
    """
    Export a GeoDataFrame of segments to a GeoJSON or CSV file.

    For GeoJSON output, the GeoDataFrame is written directly to `file_path + '.json'`. For CSV
    output, the segments are written to `file_path + '.csv'` with derived start- and end-point
    columns: the WKT start/end points and the LineString geometry when `geometry` is True, or
    the start/end longitude and latitude (without the geometry column) when `geometry` is False.
    Both variants keep the route, segment and stop identifiers, the segment distance, and the
    number of traversals.

    Args:
      df: the GeoDataFrame containing the segments
      file_path: The output path, without file extension (the extension is appended based on
    output_format).
      output_format: 'geojson' or 'csv'
      geometry: If True, the CSV output includes the segment geometry column. If False, only
    the start and end points (longitude/latitude) are included. Defaults to True
    """
    ## Output to GeoJSON
    if output_format == 'geojson':
        df.to_file(file_path+'.json', driver="GeoJSON")
    elif output_format == 'csv':
        s_df = df[['route_id','segment_id','stop_id1','stop_id2','distance','traversals','geometry']].copy()
        geom_list =  s_df.geometry.apply(lambda g: np.array(g.coords))
        s_df['start_point'] = [Point(g[0]).wkt for g in geom_list]
        s_df['end_point'] = [Point(g[-1]).wkt for g in geom_list]
        s_df['start_lon'] = [g[0][0] for g in geom_list]
        s_df['start_lat'] = [g[0][1] for g in geom_list]
        s_df['end_lon'] = [g[-1][0] for g in geom_list]
        s_df['end_lat'] = [g[-1][1] for g in geom_list]
        sg_df = s_df[['route_id','segment_id','stop_id1','stop_id2','distance','traversals','start_point','end_point','geometry']]
        if geometry == True:
            ## Output With LS
            sg_df.to_csv(file_path+'.csv',index = False)
        else:
            d_df = s_df[['route_id','segment_id','stop_id1','stop_id2','start_lat','start_lon','end_lat','end_lon','distance','traversals']]
            ## Output without LS
            d_df.to_csv(file_path+'.csv',index = False)


def process(pipeline_gtfs,row,max_spacing):
    """
    Run `pipeline_gtfs` on a single row of the sources dataframe, catching and reporting any
    exception raised by the pipeline.

    Args:
      pipeline_gtfs: The function used to process the GTFS data; called as
    pipeline_gtfs(filename, url, bounds, max_spacing).
      row: A row of the sources dataframe, containing the provider name, the URL of the GTFS
    file, and the bounding box of the area the feed covers.
      max_spacing: Maximum allowed spacing between two consecutive stops.

    Returns:
      The output of pipeline_gtfs, normally a tuple of the form (filename, folder_path, df);
    on failure, the failure message string from failed_pipeline.
    """
    filename = row['provider']
    url = row['urls.latest']
    bounds = [[row['minimum_longitude'],row['minimum_latitude']],[row['maximum_longitude'],row['maximum_latitude']]]
    print(filename)
    try:
        return pipeline_gtfs(filename,url,bounds,max_spacing)
    except Exception:
        traceback.print_exc()
        folder_path = os.path.join('output_files',filename)
        return failed_pipeline("Failed for ",filename,folder_path)

def failed_pipeline(message,filename,folder_path):
    """
    "If the folder path exists, delete it and return the failure message."
    
    Args:
      message: The message to be returned
      filename: The name of the file that is being processed
      folder_path: The path to the folder where the file is located
    
    Returns:
      a string that is the concatenation of the message and the filename, indicating failure
    """

    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    return message + filename

def download_write_file(url,folder_path):
    """
    Download the GTFS file at `url` and write it to `folder_path` as gtfs.zip, creating the
    folder if it does not exist.
    
    Args:
      url: The URL of the GTFS file you want to download
      folder_path: The path to the folder where you want to save the GTFS file.
    
    Returns:
      The path to the downloaded gtfs.zip file.
    """
    # Create a new directory if it does not exist
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    ## Download file from URL
    r = requests.get(url, allow_redirects=True)
    gtfs_file_loc = os.path.join(folder_path,"gtfs.zip")
    
    ## Write file locally
    with open(gtfs_file_loc, "wb") as file:
        file.write(r.content)
    return gtfs_file_loc

Functions

def download_write_file(url, folder_path)

Download the GTFS file at url and write it to folder_path as gtfs.zip, creating the folder if it does not exist.

Args

url
The URL of the GTFS file you want to download
folder_path
The path to the folder where you want to save the GTFS file.

Returns

The path to the downloaded gtfs.zip file.

Expand source code
def download_write_file(url,folder_path):
    """
    Download the GTFS file at `url` and write it to `folder_path` as gtfs.zip, creating the
    folder if it does not exist.
    
    Args:
      url: The URL of the GTFS file you want to download
      folder_path: The path to the folder where you want to save the GTFS file.
    
    Returns:
      The path to the downloaded gtfs.zip file.
    """
    # Create a new directory if it does not exist
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    ## Download file from URL
    r = requests.get(url, allow_redirects=True)
    gtfs_file_loc = os.path.join(folder_path,"gtfs.zip")
    
    ## Write file locally
    with open(gtfs_file_loc, "wb") as file:
        file.write(r.content)
    return gtfs_file_loc
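
A minimal usage sketch; the URL and folder below are placeholders, not part of the library:

from gtfs_segments.utils import download_write_file

# Downloads the feed and writes it as <folder_path>/gtfs.zip, creating the folder if needed.
gtfs_path = download_write_file("https://example.com/gtfs.zip", "output_files/example_provider")
print(gtfs_path)  # e.g. output_files/example_provider/gtfs.zip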
def export_segments(df, file_path, output_format, geometry=True)

Export a GeoDataFrame of segments to a GeoJSON or CSV file.

For GeoJSON output, the GeoDataFrame is written directly to file_path + '.json'. For CSV output, the segments are written to file_path + '.csv' with derived start- and end-point columns: the WKT start/end points and the LineString geometry when geometry is True, or the start/end longitude and latitude (without the geometry column) when geometry is False. Both variants keep the route, segment and stop identifiers, the segment distance, and the number of traversals.

Args

df
the GeoDataFrame containing the segments
file_path
The output path, without file extension (the extension is appended based on output_format).
output_format
'geojson' or 'csv'
geometry
If True, the CSV output includes the segment geometry column. If False, only the start and end points (longitude/latitude) are included. Defaults to True

Expand source code
def export_segments(df,file_path,output_format, geometry = True):
    """
    Export a GeoDataFrame of segments to a GeoJSON or CSV file.

    For GeoJSON output, the GeoDataFrame is written directly to `file_path + '.json'`. For CSV
    output, the segments are written to `file_path + '.csv'` with derived start- and end-point
    columns: the WKT start/end points and the LineString geometry when `geometry` is True, or
    the start/end longitude and latitude (without the geometry column) when `geometry` is False.
    Both variants keep the route, segment and stop identifiers, the segment distance, and the
    number of traversals.

    Args:
      df: the GeoDataFrame containing the segments
      file_path: The output path, without file extension (the extension is appended based on
    output_format).
      output_format: 'geojson' or 'csv'
      geometry: If True, the CSV output includes the segment geometry column. If False, only
    the start and end points (longitude/latitude) are included. Defaults to True
    """
    ## Output to GeoJSON
    if output_format == 'geojson':
        df.to_file(file_path+'.json', driver="GeoJSON")
    elif output_format == 'csv':
        s_df = df[['route_id','segment_id','stop_id1','stop_id2','distance','traversals','geometry']].copy()
        geom_list =  s_df.geometry.apply(lambda g: np.array(g.coords))
        s_df['start_point'] = [Point(g[0]).wkt for g in geom_list]
        s_df['end_point'] = [Point(g[-1]).wkt for g in geom_list]
        s_df['start_lon'] = [g[0][0] for g in geom_list]
        s_df['start_lat'] = [g[0][1] for g in geom_list]
        s_df['end_lon'] = [g[-1][0] for g in geom_list]
        s_df['end_lat'] = [g[-1][1] for g in geom_list]
        sg_df = s_df[['route_id','segment_id','stop_id1','stop_id2','distance','traversals','start_point','end_point','geometry']]
        if geometry == True:
            ## Output With LS
            sg_df.to_csv(file_path+'.csv',index = False)
        else:
            d_df = s_df[['route_id','segment_id','stop_id1','stop_id2','start_lat','start_lon','end_lat','end_lon','distance','traversals']]
            ## Output without LS
            d_df.to_csv(file_path+'.csv',index = False)
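
A minimal usage sketch, assuming a GeoDataFrame with the column layout described above (the identifiers, coordinates, and values are illustrative):

import geopandas as gpd
from shapely.geometry import LineString
from gtfs_segments.utils import export_segments

segments = gpd.GeoDataFrame({
    "route_id": ["A"],
    "segment_id": ["s1"],
    "stop_id1": ["100"],
    "stop_id2": ["101"],
    "distance": [420.0],
    "traversals": [96],
    "geometry": [LineString([(-122.41, 37.77), (-122.41, 37.78)])],
}, crs="EPSG:4326")

# Writes segments_out.csv with start/end longitude and latitude but no geometry column.
export_segments(segments, "segments_out", output_format="csv", geometry=False)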
def failed_pipeline(message, filename, folder_path)

"If the folder path exists, delete it and return the failure message."

Args

message
The failure message prefix
filename
The name of the feed that was being processed
folder_path
The path to the output folder to delete

Returns

A string concatenating the message and the filename, indicating failure

Expand source code
def failed_pipeline(message,filename,folder_path):
    """
    "If the folder path exists, delete it and return the failure message."
    
    Args:
      message: The message to be returned
      filename: The name of the file that is being processed
      folder_path: The path to the folder where the file is located
    
    Returns:
      a string that is the concatenation of the message and the filename, indicating failure
    """

    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    return message + filename
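
A minimal usage sketch; the provider name and folder are hypothetical:

import os
from gtfs_segments.utils import failed_pipeline

folder_path = os.path.join('output_files', 'example_provider')
msg = failed_pipeline("Failed for ", "example_provider", folder_path)
print(msg)  # "Failed for example_provider"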
def plot_hist(df, save_fig=False, show_mean=False, **kwargs)

Plot a traversal-weighted histogram of stop spacings: each value in the distance column (spacing between consecutive stops, in meters) is weighted by the corresponding traversals column (number of trips traversing that segment).

Args

df
The dataframe containing the distance and traversals columns
save_fig
If True, the figure will be saved to file_path. Defaults to False
show_mean
If True, a dashed line marks the mean of the distribution. Defaults to False
**kwargs
Optional max_spacing (meters, default 3000), ax (existing matplotlib axes to draw on), title (plot title) and file_path (required when save_fig is True).

Returns

A matplotlib axis

Expand source code
def plot_hist(df,save_fig = False,show_mean = False,**kwargs):
    """
    Plot a traversal-weighted histogram of stop spacings: each value in the `distance` column
    (spacing between consecutive stops, in meters) is weighted by the corresponding `traversals`
    column (number of trips traversing that segment).

    Args:
      df: The dataframe containing the `distance` and `traversals` columns
      save_fig: If True, the figure will be saved to `file_path`. Defaults to False
      show_mean: If True, a dashed line marks the mean of the distribution. Defaults to False
      **kwargs: Optional `max_spacing` (meters, default 3000), `ax` (existing matplotlib axes
    to draw on), `title` (plot title) and `file_path` (required when save_fig is True).
    
    Returns:
      A matplotlib axis
    """
    if "max_spacing" not in kwargs.keys():
        max_spacing = 3000
        print("Using max_spacing = 3000")
    else:
        max_spacing = kwargs['max_spacing']
    if "ax" in kwargs.keys():
        ax = kwargs['ax']
    else:
        fig, ax = plt.subplots(figsize=(8,6))
    df = df[df['distance'] < max_spacing]
    data = np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)])
    plt.hist(data,range=(0,max_spacing),density = True,bins = int(max_spacing/50),fc=(0, 105/255, 160/255, 0.4),ec = "white",lw =0.8)
    x = np.arange(0,max_spacing,5)
    plt.plot(x,gaussian_kde(data)(x),lw = 1.5,color=(0, 85/255, 120/255, 1))
    # sns.histplot(data,binwidth=50,stat = "density",kde=True,ax=ax)
    plt.xlim([0,max_spacing])
    plt.xlabel('Stop Spacing [m]')
    plt.ylabel('Density - Traversal Weighted')
    plt.title("Histogram of Spacing")
    if show_mean:
        plt.axvline(np.mean(data), color='k', linestyle='dashed', linewidth=2)
        min_ylim, max_ylim = plt.ylim()
        plt.text(np.mean(data)*1.1, max_ylim*0.9, 'Mean: {:.0f}'.format(np.mean(data)),fontsize=12)
    if "title" in kwargs.keys():
        plt.title(kwargs['title'])
    if save_fig == True:
        assert "file_path" in kwargs.keys(), "Please pass in the `file_path`"
        plt.savefig(kwargs['file_path'], dpi=300)
    plt.show()
    if "ax" not in kwargs.keys():
        plt.close(fig)  # only close figures created inside this function
    return ax
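
A minimal usage sketch; the spacing and traversal values are made up, and only the distance and traversals columns are required:

import pandas as pd
from gtfs_segments.utils import plot_hist

segments = pd.DataFrame({
    "distance": [250.0, 400.0, 800.0, 1200.0],   # stop spacing in meters
    "traversals": [120, 80, 40, 10],             # trips traversing each segment
})
ax = plot_hist(segments, show_mean=True, max_spacing=2000, title="Example stop spacings")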
def process(pipeline_gtfs, row, max_spacing)

Run pipeline_gtfs on a single row of the sources dataframe, catching and reporting any exception raised by the pipeline.

Args

pipeline_gtfs
The function used to process the GTFS data; called as pipeline_gtfs(filename, url, bounds, max_spacing).
row
A row of the sources dataframe, containing the provider name, the URL of the GTFS file, and the bounding box of the area the feed covers.
max_spacing
Maximum allowed spacing between two consecutive stops.

Returns

The output of pipeline_gtfs, normally a tuple of the form (filename, folder_path, df); on failure, the failure message string from failed_pipeline.

Expand source code
def process(pipeline_gtfs,row,max_spacing):
    """
    Run `pipeline_gtfs` on a single row of the sources dataframe, catching and reporting any
    exception raised by the pipeline.

    Args:
      pipeline_gtfs: The function used to process the GTFS data; called as
    pipeline_gtfs(filename, url, bounds, max_spacing).
      row: A row of the sources dataframe, containing the provider name, the URL of the GTFS
    file, and the bounding box of the area the feed covers.
      max_spacing: Maximum allowed spacing between two consecutive stops.

    Returns:
      The output of pipeline_gtfs, normally a tuple of the form (filename, folder_path, df);
    on failure, the failure message string from failed_pipeline.
    """
    filename = row['provider']
    url = row['urls.latest']
    bounds = [[row['minimum_longitude'],row['minimum_latitude']],[row['maximum_longitude'],row['maximum_latitude']]]
    print(filename)
    try:
        return pipeline_gtfs(filename,url,bounds,max_spacing)
    except Exception:
        traceback.print_exc()
        folder_path = os.path.join('output_files',filename)
        return failed_pipeline("Failed for ",filename,folder_path)
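
A minimal usage sketch; my_pipeline is a placeholder for any callable with the (filename, url, bounds, max_spacing) signature, and the row values are illustrative:

import pandas as pd
from gtfs_segments.utils import process

def my_pipeline(filename, url, bounds, max_spacing):
    # A real pipeline would download and process the feed here.
    return filename, "output_files/" + filename, None

row = pd.Series({
    "provider": "Example Transit",
    "urls.latest": "https://example.com/gtfs.zip",   # placeholder URL
    "minimum_longitude": -122.5, "minimum_latitude": 37.7,
    "maximum_longitude": -122.3, "maximum_latitude": 37.9,
})
result = process(my_pipeline, row, max_spacing=3000)
# -> ('Example Transit', 'output_files/Example Transit', None)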
def summary_stats(df, export=False, **kwargs)

Compute summary statistics of stop spacings (segment-, route-, and traversal-weighted means, traversal-weighted quantiles, and counts) for a segments dataframe.

Args

df
The segments dataframe to summarize.
export
If True, the summary will be exported to a csv file. Defaults to False
**kwargs
Optional max_spacing (meters, default 3000) and file_path (required when export is True).

Returns

A dataframe with the summary statistics

Expand source code
def summary_stats(df,export = False,**kwargs):
    """
    Compute summary statistics of stop spacings (segment-, route-, and traversal-weighted
    means, traversal-weighted quantiles, and counts) for a segments dataframe.

    Args:
      df: The segments dataframe to summarize.
      export: If True, the summary will be exported to a csv file. Defaults to False
      **kwargs: Optional `max_spacing` (meters, default 3000) and `file_path` (required when
    export is True).
    
    Returns:
      A dataframe with the summary statistics
    """
    if "max_spacing" not in kwargs.keys():
        max_spacing = 3000
        print("Using max_spacing = 3000")
    else:
        max_spacing = kwargs['max_spacing']
    # Share of total traversals occurring on segments longer than max_spacing
    percent_spacing = round(df[df["distance"] > max_spacing]['traversals'].sum()/df['traversals'].sum() *100,3)
    df = df[df["distance"] <= max_spacing]
    stop_weighted_mean = df.groupby(['segment_id','distance']).first().reset_index()["distance"].mean()
    route_weighted_mean = df.groupby(['route_id','segment_id','distance']).first().reset_index()["distance"].mean()
    weighted_data =  np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)])
    
    df_dict = {
            'Segment Weighted Mean' : stop_weighted_mean,
            'Route Weighted Mean' : route_weighted_mean,
            'Traversal Weighted Mean': round(np.mean(weighted_data),3),
            'Traversal Weighted Std': round(np.std(weighted_data),3),
            'Traversal Weighted 25 % Quantile': round(np.quantile(weighted_data,0.25),3),
            'Traversal Weighted 50 % Quantile': round(np.quantile(weighted_data,0.50),3),
            'Traversal Weighted 75 % Quantile': round(np.quantile(weighted_data,0.75),3),
            'No of Segments':int(len(df)),
            'No of Routes':int(len(df.route_id.unique())),
            'No of Traversals':int(sum(df.traversals)),  
            'Max Spacing':int(max_spacing),
            '% Segments w/ spacing > max_spacing':percent_spacing}
    summary_df = pd.DataFrame([df_dict])
    # df.set_index(summary_df.columns[0],inplace=True)
    if export:
        assert "file_path" in kwargs.keys(), "Please pass in the `file_path`"
        summary_df.to_csv(kwargs['file_path'],index = False)
        print("Saved the summary in "+kwargs['file_path'])
    summary_df = summary_df.T
    return summary_df
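
A minimal usage sketch; the values are illustrative, and the route_id, segment_id, distance and traversals columns are required:

import pandas as pd
from gtfs_segments.utils import summary_stats

segments = pd.DataFrame({
    "route_id": ["A", "A", "B"],
    "segment_id": ["s1", "s2", "s3"],
    "distance": [250.0, 400.0, 5000.0],
    "traversals": [120, 80, 10],
})
summary = summary_stats(segments, max_spacing=3000)  # the 5000 m segment is counted in the over-max share
print(summary)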