Module gtfs_segments.utils

Expand source code
import os
import shutil
import requests
import traceback
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from shapely.geometry import Point

## Plot style
plt.style.use('ggplot')

def plot_hist(df,save_fig = False,show_mean = False,**kwargs):
    """
    Plot a traversal-weighted histogram of stop spacings: each value in the `distance` column
    (spacing between consecutive stops, in meters) is weighted by the corresponding `traversals`
    column (number of trips traversing that segment).

    Args:
      df: The dataframe containing the `distance` and `traversals` columns
      save_fig: If True, the figure will be saved to `file_path`. Defaults to False
      show_mean: If True, a dashed line marks the mean of the distribution. Defaults to False
      **kwargs: Optional `max_spacing` (meters, default 3000), `ax` (existing matplotlib axes
    to draw on), `title` (plot title) and `file_path` (required when save_fig is True).
    
    Returns:
      A matplotlib axis
    """
    if "max_spacing" not in kwargs.keys():
        max_spacing = 3000
        print("Using max_spacing = 3000")
    else:
        max_spacing = kwargs['max_spacing']
    if "ax" in kwargs.keys():
        ax = kwargs['ax']
    else:
        fig, ax = plt.subplots(figsize=(8,6))
    df = df[df['distance'] < max_spacing]
    data = np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)])
    plt.hist(data,range=(0,max_spacing),density = True,bins = int(max_spacing/50),fc=(0, 105/255, 160/255, 0.4),ec = "white",lw =0.8)
    x = np.arange(0,max_spacing,5)
    plt.plot(x,gaussian_kde(data)(x),lw = 1.5,color=(0, 85/255, 120/255, 1))
    # sns.histplot(data,binwidth=50,stat = "density",kde=True,ax=ax)
    plt.xlim([0,max_spacing])
    plt.xlabel('Stop Spacing [m]')
    plt.ylabel('Density - Traversal Weighted')
    plt.title("Histogram of Spacing")
    if show_mean:
        plt.axvline(np.mean(data), color='k', linestyle='dashed', linewidth=2)
        min_ylim, max_ylim = plt.ylim()
        plt.text(np.mean(data)*1.1, max_ylim*0.9, 'Mean: {:.0f}'.format(np.mean(data)),fontsize=12)
    if "title" in kwargs.keys():
        plt.title(kwargs['title'])
    if save_fig == True:
        assert "file_path" in kwargs.keys(), "Please pass in the `file_path`"
        plt.savefig(kwargs['file_path'], dpi=300)
    plt.show()
    if "ax" not in kwargs.keys():
        plt.close(fig)  # only close figures created inside this function
    return ax

def summary_stats(df,export = False,**kwargs):
    """
    Compute summary statistics of stop spacings (segment-, route-, and traversal-weighted
    means, traversal-weighted quantiles, and counts) for a segments dataframe.

    Args:
      df: The segments dataframe to summarize.
      export: If True, the summary will be exported to a csv file. Defaults to False
      **kwargs: Optional `max_spacing` (meters, default 3000) and `file_path` (required when
    export is True).
    
    Returns:
      A dataframe with the summary statistics
    """
    if "max_spacing" not in kwargs.keys():
        max_spacing = 3000
        print("Using max_spacing = 3000")
    else:
        max_spacing = kwargs['max_spacing']
    # Share of total traversals occurring on segments longer than max_spacing
    percent_spacing = round(df[df["distance"] > max_spacing]['traversals'].sum()/df['traversals'].sum() *100,3)
    df = df[df["distance"] <= max_spacing]
    stop_weighted_mean = df.groupby(['segment_id','distance']).first().reset_index()["distance"].mean()
    route_weighted_mean = df.groupby(['route_id','segment_id','distance']).first().reset_index()["distance"].mean()
    weighted_data =  np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)])
    
    df_dict = {
            'Segment Weighted Mean' : stop_weighted_mean,
            'Route Weighted Mean' : route_weighted_mean,
            'Traversal Weighted Mean': round(np.mean(weighted_data),3),
            'Traversal Weighted Std': round(np.std(weighted_data),3),
            'Traversal Weighted 25 % Quantile': round(np.quantile(weighted_data,0.25),3),
            'Traversal Weighted 50 % Quantile': round(np.quantile(weighted_data,0.50),3),
            'Traversal Weighted 75 % Quantile': round(np.quantile(weighted_data,0.75),3),
            'No of Segments':int(len(df)),
            'No of Routes':int(len(df.route_id.unique())),
            'No of Traversals':int(sum(df.traversals)),  
            'Max Spacing':int(max_spacing),
            '% Segments w/ spacing > max_spacing':percent_spacing}
    summary_df = pd.DataFrame([df_dict])
    # df.set_index(summary_df.columns[0],inplace=True)
    if export:
        assert "file_path" in kwargs.keys(), "Please pass in the `file_path`"
        summary_df.to_csv(kwargs['file_path'],index = False)
        print("Saved the summary in "+kwargs['file_path'])
    summary_df = summary_df.T
    return summary_df 
        
def export_segments(df,file_path,output_format, geometry = True):
    """
    Export a GeoDataFrame of segments to a GeoJSON or CSV file.

    For GeoJSON output, the GeoDataFrame is written directly to `file_path + '.json'`. For CSV
    output, the segments are written to `file_path + '.csv'` with derived start- and end-point
    columns: the WKT start/end points and the LineString geometry when `geometry` is True, or
    the start/end longitude and latitude (without the geometry column) when `geometry` is False.
    Both variants keep the route, segment and stop identifiers, the segment distance, and the
    number of traversals.

    Args:
      df: the GeoDataFrame containing the segments
      file_path: The output path, without file extension (the extension is appended based on
    output_format).
      output_format: 'geojson' or 'csv'
      geometry: If True, the CSV output includes the segment geometry column. If False, only
    the start and end points (longitude/latitude) are included. Defaults to True
    """
    ## Output to GeoJSON
    if output_format == 'geojson':
        df.to_file(file_path+'.json', driver="GeoJSON")
    elif output_format == 'csv':
        s_df = df[['route_id','segment_id','stop_id1','stop_id2','distance','traversals','geometry']].copy()
        geom_list =  s_df.geometry.apply(lambda g: np.array(g.coords))
        s_df['start_point'] = [Point(g[0]).wkt for g in geom_list]
        s_df['end_point'] = [Point(g[-1]).wkt for g in geom_list]
        s_df['start_lon'] = [g[0][0] for g in geom_list]
        s_df['start_lat'] = [g[0][1] for g in geom_list]
        s_df['end_lon'] = [g[-1][0] for g in geom_list]
        s_df['end_lat'] = [g[-1][1] for g in geom_list]
        sg_df = s_df[['route_id','segment_id','stop_id1','stop_id2','distance','traversals','start_point','end_point','geometry']]
        if geometry == True:
            ## Output With LS
            sg_df.to_csv(file_path+'.csv',index = False)
        else:
            d_df = s_df[['route_id','segment_id','stop_id1','stop_id2','start_lat','start_lon','end_lat','end_lon','distance','traversals']]
            ## Output without LS
            d_df.to_csv(file_path+'.csv',index = False)


def process(pipeline_gtfs,row,max_spacing):
    """
    Run `pipeline_gtfs` on a single row of the sources dataframe, catching and reporting any
    exception raised by the pipeline.

    Args:
      pipeline_gtfs: The function used to process the GTFS data; called as
    pipeline_gtfs(filename, url, bounds, max_spacing).
      row: A row of the sources dataframe, containing the provider name, the URL of the GTFS
    file, and the bounding box of the area the feed covers.
      max_spacing: Maximum allowed spacing between two consecutive stops.

    Returns:
      The output of pipeline_gtfs, normally a tuple of the form (filename, folder_path, df);
    on failure, the failure message string from failed_pipeline.
    """
    filename = row['provider']
    url = row['urls.latest']
    bounds = [[row['minimum_longitude'],row['minimum_latitude']],[row['maximum_longitude'],row['maximum_latitude']]]
    print(filename)
    try:
        return pipeline_gtfs(filename,url,bounds,max_spacing)
    except Exception:
        traceback.print_exc()
        folder_path = os.path.join('output_files',filename)
        return failed_pipeline("Failed for ",filename,folder_path)

def failed_pipeline(message,filename,folder_path):
    """
    "If the folder path exists, delete it and return the failure message."
    
    Args:
      message: The message to be returned
      filename: The name of the file that is being processed
      folder_path: The path to the folder where the file is located
    
    Returns:
      a string that is the concatenation of the message and the filename, indicating failure
    """

    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    return message + filename

def download_write_file(url,folder_path):
    """
    Download the GTFS file at `url` and write it to `folder_path` as gtfs.zip, creating the
    folder if it does not exist.
    
    Args:
      url: The URL of the GTFS file you want to download
      folder_path: The path to the folder where you want to save the GTFS file.
    
    Returns:
      The path to the downloaded gtfs.zip file.
    """
    # Create a new directory if it does not exist
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    ## Download file from URL
    r = requests.get(url, allow_redirects=True)
    gtfs_file_loc = os.path.join(folder_path,"gtfs.zip")
    
    ## Write file locally
    with open(gtfs_file_loc, "wb") as file:
        file.write(r.content)
    return gtfs_file_loc

Functions

def download_write_file(url, folder_path)

Download the GTFS file at url and write it to folder_path as gtfs.zip, creating the folder if it does not exist.

Args

url
The URL of the GTFS file you want to download
folder_path
The path to the folder where you want to save the GTFS file.

Returns

The path to the downloaded gtfs.zip file.

Expand source code
def download_write_file(url,folder_path):
    """
    Download the GTFS file at `url` and write it to `folder_path` as gtfs.zip, creating the
    folder if it does not exist.
    
    Args:
      url: The URL of the GTFS file you want to download
      folder_path: The path to the folder where you want to save the GTFS file.
    
    Returns:
      The path to the downloaded gtfs.zip file.
    """
    # Create a new directory if it does not exist
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    ## Download file from URL
    r = requests.get(url, allow_redirects=True)
    gtfs_file_loc = os.path.join(folder_path,"gtfs.zip")
    
    ## Write file locally
    with open(gtfs_file_loc, "wb") as file:
        file.write(r.content)
    return gtfs_file_loc
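
A minimal usage sketch; the URL and folder below are placeholders, not part of the library:

from gtfs_segments.utils import download_write_file

# Downloads the feed and writes it as <folder_path>/gtfs.zip, creating the folder if needed.
gtfs_path = download_write_file("https://example.com/gtfs.zip", "output_files/example_provider")
print(gtfs_path)  # e.g. output_files/example_provider/gtfs.zip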
def export_segments(df, file_path, output_format, geometry=True)

Export a GeoDataFrame of segments to a GeoJSON or CSV file.

For GeoJSON output, the GeoDataFrame is written directly to file_path + '.json'. For CSV output, the segments are written to file_path + '.csv' with derived start- and end-point columns: the WKT start/end points and the LineString geometry when geometry is True, or the start/end longitude and latitude (without the geometry column) when geometry is False. Both variants keep the route, segment and stop identifiers, the segment distance, and the number of traversals.

Args

df
the GeoDataFrame containing the segments
file_path
The output path, without file extension (the extension is appended based on output_format).
output_format
'geojson' or 'csv'
geometry
If True, the CSV output includes the segment geometry column. If False, only the start and end points (longitude/latitude) are included. Defaults to True

Expand source code
def export_segments(df,file_path,output_format, geometry = True):
    """
    Export a GeoDataFrame of segments to a GeoJSON or CSV file.

    For GeoJSON output, the GeoDataFrame is written directly to `file_path + '.json'`. For CSV
    output, the segments are written to `file_path + '.csv'` with derived start- and end-point
    columns: the WKT start/end points and the LineString geometry when `geometry` is True, or
    the start/end longitude and latitude (without the geometry column) when `geometry` is False.
    Both variants keep the route, segment and stop identifiers, the segment distance, and the
    number of traversals.

    Args:
      df: the GeoDataFrame containing the segments
      file_path: The output path, without file extension (the extension is appended based on
    output_format).
      output_format: 'geojson' or 'csv'
      geometry: If True, the CSV output includes the segment geometry column. If False, only
    the start and end points (longitude/latitude) are included. Defaults to True
    """
    ## Output to GeoJSON
    if output_format == 'geojson':
        df.to_file(file_path+'.json', driver="GeoJSON")
    elif output_format == 'csv':
        s_df = df[['route_id','segment_id','stop_id1','stop_id2','distance','traversals','geometry']].copy()
        geom_list =  s_df.geometry.apply(lambda g: np.array(g.coords))
        s_df['start_point'] = [Point(g[0]).wkt for g in geom_list]
        s_df['end_point'] = [Point(g[-1]).wkt for g in geom_list]
        s_df['start_lon'] = [g[0][0] for g in geom_list]
        s_df['start_lat'] = [g[0][1] for g in geom_list]
        s_df['end_lon'] = [g[-1][0] for g in geom_list]
        s_df['end_lat'] = [g[-1][1] for g in geom_list]
        sg_df = s_df[['route_id','segment_id','stop_id1','stop_id2','distance','traversals','start_point','end_point','geometry']]
        if geometry == True:
            ## Output With LS
            sg_df.to_csv(file_path+'.csv',index = False)
        else:
            d_df = s_df[['route_id','segment_id','stop_id1','stop_id2','start_lat','start_lon','end_lat','end_lon','distance','traversals']]
            ## Output without LS
            d_df.to_csv(file_path+'.csv',index = False)
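
A minimal usage sketch, assuming a GeoDataFrame with the column layout described above (the identifiers, coordinates, and values are illustrative):

import geopandas as gpd
from shapely.geometry import LineString
from gtfs_segments.utils import export_segments

segments = gpd.GeoDataFrame({
    "route_id": ["A"],
    "segment_id": ["s1"],
    "stop_id1": ["100"],
    "stop_id2": ["101"],
    "distance": [420.0],
    "traversals": [96],
    "geometry": [LineString([(-122.41, 37.77), (-122.41, 37.78)])],
}, crs="EPSG:4326")

# Writes segments_out.csv with start/end longitude and latitude but no geometry column.
export_segments(segments, "segments_out", output_format="csv", geometry=False)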
def failed_pipeline(message, filename, folder_path)

"If the folder path exists, delete it and return the failure message."

Args

message
The failure message prefix
filename
The name of the feed that was being processed
folder_path
The path to the output folder to delete

Returns

A string concatenating the message and the filename, indicating failure

Expand source code
def failed_pipeline(message,filename,folder_path):
    """
    "If the folder path exists, delete it and return the failure message."
    
    Args:
      message: The message to be returned
      filename: The name of the file that is being processed
      folder_path: The path to the folder where the file is located
    
    Returns:
      a string that is the concatenation of the message and the filename, indicating failure
    """

    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    return message + filename
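
A minimal usage sketch; the provider name and folder are hypothetical:

import os
from gtfs_segments.utils import failed_pipeline

folder_path = os.path.join('output_files', 'example_provider')
msg = failed_pipeline("Failed for ", "example_provider", folder_path)
print(msg)  # "Failed for example_provider"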
def plot_hist(df, save_fig=False, show_mean=False, **kwargs)

Plot a traversal-weighted histogram of stop spacings: each value in the distance column (spacing between consecutive stops, in meters) is weighted by the corresponding traversals column (number of trips traversing that segment).

Args

df
The dataframe containing the distance and traversals columns
save_fig
If True, the figure will be saved to file_path. Defaults to False
show_mean
If True, a dashed line marks the mean of the distribution. Defaults to False
**kwargs
Optional max_spacing (meters, default 3000), ax (existing matplotlib axes to draw on), title (plot title) and file_path (required when save_fig is True).

Returns

A matplotlib axis

Expand source code
def plot_hist(df,save_fig = False,show_mean = False,**kwargs):
    """
    Plot a traversal-weighted histogram of stop spacings: each value in the `distance` column
    (spacing between consecutive stops, in meters) is weighted by the corresponding `traversals`
    column (number of trips traversing that segment).

    Args:
      df: The dataframe containing the `distance` and `traversals` columns
      save_fig: If True, the figure will be saved to `file_path`. Defaults to False
      show_mean: If True, a dashed line marks the mean of the distribution. Defaults to False
      **kwargs: Optional `max_spacing` (meters, default 3000), `ax` (existing matplotlib axes
    to draw on), `title` (plot title) and `file_path` (required when save_fig is True).
    
    Returns:
      A matplotlib axis
    """
    if "max_spacing" not in kwargs.keys():
        max_spacing = 3000
        print("Using max_spacing = 3000")
    else:
        max_spacing = kwargs['max_spacing']
    if "ax" in kwargs.keys():
        ax = kwargs['ax']
    else:
        fig, ax = plt.subplots(figsize=(8,6))
    df = df[df['distance'] < max_spacing]
    data = np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)])
    plt.hist(data,range=(0,max_spacing),density = True,bins = int(max_spacing/50),fc=(0, 105/255, 160/255, 0.4),ec = "white",lw =0.8)
    x = np.arange(0,max_spacing,5)
    plt.plot(x,gaussian_kde(data)(x),lw = 1.5,color=(0, 85/255, 120/255, 1))
    # sns.histplot(data,binwidth=50,stat = "density",kde=True,ax=ax)
    plt.xlim([0,max_spacing])
    plt.xlabel('Stop Spacing [m]')
    plt.ylabel('Density - Traversal Weighted')
    plt.title("Histogram of Spacing")
    if show_mean:
        plt.axvline(np.mean(data), color='k', linestyle='dashed', linewidth=2)
        min_ylim, max_ylim = plt.ylim()
        plt.text(np.mean(data)*1.1, max_ylim*0.9, 'Mean: {:.0f}'.format(np.mean(data)),fontsize=12)
    if "title" in kwargs.keys():
        plt.title(kwargs['title'])
    if save_fig == True:
        assert "file_path" in kwargs.keys(), "Please pass in the `file_path`"
        plt.savefig(kwargs['file_path'], dpi=300)
    plt.show()
    if "ax" not in kwargs.keys():
        plt.close(fig)  # only close figures created inside this function
    return ax
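
A minimal usage sketch; the spacing and traversal values are made up, and only the distance and traversals columns are required:

import pandas as pd
from gtfs_segments.utils import plot_hist

segments = pd.DataFrame({
    "distance": [250.0, 400.0, 800.0, 1200.0],   # stop spacing in meters
    "traversals": [120, 80, 40, 10],             # trips traversing each segment
})
ax = plot_hist(segments, show_mean=True, max_spacing=2000, title="Example stop spacings")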
def process(pipeline_gtfs, row, max_spacing)

Run pipeline_gtfs on a single row of the sources dataframe, catching and reporting any exception raised by the pipeline.

Args

pipeline_gtfs
The function used to process the GTFS data; called as pipeline_gtfs(filename, url, bounds, max_spacing).
row
A row of the sources dataframe, containing the provider name, the URL of the GTFS file, and the bounding box of the area the feed covers.
max_spacing
Maximum allowed spacing between two consecutive stops.

Returns

The output of pipeline_gtfs, normally a tuple of the form (filename, folder_path, df); on failure, the failure message string from failed_pipeline.

Expand source code
def process(pipeline_gtfs,row,max_spacing):
    """
    Run `pipeline_gtfs` on a single row of the sources dataframe, catching and reporting any
    exception raised by the pipeline.

    Args:
      pipeline_gtfs: The function used to process the GTFS data; called as
    pipeline_gtfs(filename, url, bounds, max_spacing).
      row: A row of the sources dataframe, containing the provider name, the URL of the GTFS
    file, and the bounding box of the area the feed covers.
      max_spacing: Maximum allowed spacing between two consecutive stops.

    Returns:
      The output of pipeline_gtfs, normally a tuple of the form (filename, folder_path, df);
    on failure, the failure message string from failed_pipeline.
    """
    filename = row['provider']
    url = row['urls.latest']
    bounds = [[row['minimum_longitude'],row['minimum_latitude']],[row['maximum_longitude'],row['maximum_latitude']]]
    print(filename)
    try:
        return pipeline_gtfs(filename,url,bounds,max_spacing)
    except Exception:
        traceback.print_exc()
        folder_path = os.path.join('output_files',filename)
        return failed_pipeline("Failed for ",filename,folder_path)
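
A minimal usage sketch; my_pipeline is a placeholder for any callable with the (filename, url, bounds, max_spacing) signature, and the row values are illustrative:

import pandas as pd
from gtfs_segments.utils import process

def my_pipeline(filename, url, bounds, max_spacing):
    # A real pipeline would download and process the feed here.
    return filename, "output_files/" + filename, None

row = pd.Series({
    "provider": "Example Transit",
    "urls.latest": "https://example.com/gtfs.zip",   # placeholder URL
    "minimum_longitude": -122.5, "minimum_latitude": 37.7,
    "maximum_longitude": -122.3, "maximum_latitude": 37.9,
})
result = process(my_pipeline, row, max_spacing=3000)
# -> ('Example Transit', 'output_files/Example Transit', None)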
def summary_stats(df, export=False, **kwargs)

Compute summary statistics of stop spacings (segment-, route-, and traversal-weighted means, traversal-weighted quantiles, and counts) for a segments dataframe.

Args

df
The segments dataframe to summarize.
export
If True, the summary will be exported to a csv file. Defaults to False
**kwargs
Optional max_spacing (meters, default 3000) and file_path (required when export is True).

Returns

A dataframe with the summary statistics

Expand source code
def summary_stats(df,export = False,**kwargs):
    """
    Compute summary statistics of stop spacings (segment-, route-, and traversal-weighted
    means, traversal-weighted quantiles, and counts) for a segments dataframe.

    Args:
      df: The segments dataframe to summarize.
      export: If True, the summary will be exported to a csv file. Defaults to False
      **kwargs: Optional `max_spacing` (meters, default 3000) and `file_path` (required when
    export is True).
    
    Returns:
      A dataframe with the summary statistics
    """
    if "max_spacing" not in kwargs.keys():
        max_spacing = 3000
        print("Using max_spacing = 3000")
    else:
        max_spacing = kwargs['max_spacing']
    # Share of total traversals occurring on segments longer than max_spacing
    percent_spacing = round(df[df["distance"] > max_spacing]['traversals'].sum()/df['traversals'].sum() *100,3)
    df = df[df["distance"] <= max_spacing]
    stop_weighted_mean = df.groupby(['segment_id','distance']).first().reset_index()["distance"].mean()
    route_weighted_mean = df.groupby(['route_id','segment_id','distance']).first().reset_index()["distance"].mean()
    weighted_data =  np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)])
    
    df_dict = {
            'Segment Weighted Mean' : stop_weighted_mean,
            'Route Weighted Mean' : route_weighted_mean,
            'Traversal Weighted Mean': round(np.mean(weighted_data),3),
            'Traversal Weighted Std': round(np.std(weighted_data),3),
            'Traversal Weighted 25 % Quantile': round(np.quantile(weighted_data,0.25),3),
            'Traversal Weighted 50 % Quantile': round(np.quantile(weighted_data,0.50),3),
            'Traversal Weighted 75 % Quantile': round(np.quantile(weighted_data,0.75),3),
            'No of Segments':int(len(df)),
            'No of Routes':int(len(df.route_id.unique())),
            'No of Traversals':int(sum(df.traversals)),  
            'Max Spacing':int(max_spacing),
            '% Segments w/ spacing > max_spacing':percent_spacing}
    summary_df = pd.DataFrame([df_dict])
    # df.set_index(summary_df.columns[0],inplace=True)
    if export:
        assert "file_path" in kwargs.keys(), "Please pass in the `file_path`"
        summary_df.to_csv(kwargs['file_path'],index = False)
        print("Saved the summary in "+kwargs['file_path'])
    summary_df = summary_df.T
    return summary_df
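
A minimal usage sketch; the values are illustrative, and the route_id, segment_id, distance and traversals columns are required:

import pandas as pd
from gtfs_segments.utils import summary_stats

segments = pd.DataFrame({
    "route_id": ["A", "A", "B"],
    "segment_id": ["s1", "s2", "s3"],
    "distance": [250.0, 400.0, 5000.0],
    "traversals": [120, 80, 10],
})
summary = summary_stats(segments, max_spacing=3000)  # the 5000 m segment is counted in the over-max share
print(summary)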