Package gtfs_segments

Expand source code
__version__ = '0.0.4'
from .gtfs_segments import (
    get_gtfs_segments,
    pipeline_gtfs,
    process_feed
)
from .utils import (
    export_segments,
    plot_hist,
    process,
    summary_stats
)

from .mobility import (
    fetch_gtfs_source,
    summary_stats_mobility,
    download_latest_data
)

# Public API of the package. NOTE: "process" was previously listed twice;
# each name appears exactly once here.
__all__ = [
    "__version__",
    "get_gtfs_segments",
    "pipeline_gtfs",
    "process_feed",
    "export_segments",
    "plot_hist",
    "fetch_gtfs_source",
    "summary_stats",
    "process",
    "summary_stats_mobility",
    "download_latest_data",
]

Sub-modules

gtfs_segments.geom_utils
gtfs_segments.gtfs_segments
gtfs_segments.mobility
gtfs_segments.partridge_func
gtfs_segments.utils

Functions

def download_latest_data(out_folder_path, sources_df)

It iterates over the rows of the dataframe, and for each row, it tries to download the file from the URL in the urls.latest column, and write it to the folder specified in the provider column

Args

out_folder_path
The path to the folder where you want to save the data
sources_df
This is the dataframe that contains the urls for the data.
Expand source code
def download_latest_data(out_folder_path, sources_df):
    """
    Download the latest GTFS file for every source in ``sources_df``.

    Iterates over the rows of the dataframe and, for each row, tries to
    download the file from the URL in the ``urls.latest`` column, writing it
    to ``out_folder_path`` under the name in the ``provider`` column.
    Downloads are best-effort: sources that fail are silently skipped.

    Args:
      out_folder_path: The path to the folder where you want to save the data
      sources_df: The dataframe that contains the urls for the data.
    """
    for _, row in sources_df.iterrows():
        try:
            download_write_file(row['urls.latest'], os.path.join(out_folder_path, row['provider']))
        except Exception:
            # Best-effort: skip sources that fail to download and keep going.
            # (Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.)
            continue
    print("Downloaded the latest data")
def export_segments(df, file_path, output_format, geometry=True)

This function takes a GeoDataFrame of segments, a file path, an output format, and a boolean value for whether or not to include the geometry in the output.

If the output format is GeoJSON, the function will output the GeoDataFrame to a GeoJSON file.

If the output format is CSV, the function will output the GeoDataFrame to a CSV file. If the geometry boolean is set to True, the function will output the CSV file with the geometry column. If the geometry boolean is set to False, the function will output the CSV file without the geometry column.

The function will also add additional columns to the CSV file, including the start and end points of the segments, the start and end longitude and latitude of the segments, and the distance of the segments.

The function will also add a column to the CSV file that indicates the number of times the segment was traversed.

Args

df
the dataframe containing the segments
file_path
The path to the file you want to export to.
output_format
geojson or csv
geometry
If True, the output will include the geometry of the segments. If False, the output will only include the start and end points of the segments. Defaults to True

Expand source code
def export_segments(df, file_path, output_format, geometry=True):
    """
    Export a GeoDataFrame of segments to GeoJSON or CSV.

    For GeoJSON the GeoDataFrame is written as-is. For CSV, columns derived
    from each segment's LineString coordinates are added: either WKT
    start/end points alongside the geometry column (``geometry=True``), or
    plain start/end lon-lat columns without geometry (``geometry=False``).

    Args:
      df: the dataframe containing the segments
      file_path: The path to the file you want to export to (the extension
        '.json' or '.csv' is appended automatically).
      output_format: 'geojson' or 'csv'
      geometry: If True, the CSV output includes the geometry of the
        segments; if False it only includes the start and end coordinates.
        Defaults to True
    """
    ## Output to GeoJSON
    if output_format == 'geojson':
        df.to_file(file_path + '.json', driver="GeoJSON")
    elif output_format == 'csv':
        s_df = df[['route_id', 'segment_id', 'stop_id1', 'stop_id2', 'distance', 'traversals', 'geometry']].copy()
        # One coordinate array per segment; rows 0 and -1 are the endpoints.
        geom_list = s_df.geometry.apply(lambda g: np.array(g.coords))
        # Only compute the derived columns the chosen output actually uses
        # (previously all six were built regardless of the branch taken).
        if geometry:
            ## Output with the LineString geometry
            s_df['start_point'] = [Point(g[0]).wkt for g in geom_list]
            s_df['end_point'] = [Point(g[-1]).wkt for g in geom_list]
            out_df = s_df[['route_id', 'segment_id', 'stop_id1', 'stop_id2', 'distance', 'traversals', 'start_point', 'end_point', 'geometry']]
        else:
            ## Output without the LineString geometry
            s_df['start_lon'] = [g[0][0] for g in geom_list]
            s_df['start_lat'] = [g[0][1] for g in geom_list]
            s_df['end_lon'] = [g[-1][0] for g in geom_list]
            s_df['end_lat'] = [g[-1][1] for g in geom_list]
            out_df = s_df[['route_id', 'segment_id', 'stop_id1', 'stop_id2', 'start_lat', 'start_lon', 'end_lat', 'end_lon', 'distance', 'traversals']]
        out_df.to_csv(file_path + '.csv', index=False)
def fetch_gtfs_source(place='ALL')

It reads the mobility data sources csv file and generates a dataframe with the sources that are of type gtfs and are from the US

Args

place
The place you want to get the GTFS data for. This can be a city, state, or country. Defaults to ALL

Returns

A dataframe with sources

Expand source code
def fetch_gtfs_source(place='ALL'):
    """
    Read the Mobility Database sources CSV and build a dataframe of US GTFS
    sources, each with a de-duplicated human-readable ``provider`` name.

    Args:
      place: The place you want to get the GTFS data for. This can be a city,
        state, or country. Defaults to ALL

    Returns:
      A dataframe with sources, or the string
      "No sources found for the given place" when the filter matches nothing.
    """
    abb_df = pd.read_json(ABBREV_link)
    sources_df = pd.read_csv(MOBILITY_SOURCES_link)
    # Keep only US GTFS (schedule) sources.
    sources_df = sources_df[sources_df['location.country_code'] == 'US']
    sources_df = sources_df[sources_df['data_type'] == 'gtfs']
    # Attach the two-letter state code by joining on the state name.
    sources_df = pd.merge(sources_df, abb_df, how='left', left_on='location.subdivision_name', right_on='state')
    sources_df = sources_df[~sources_df.state_code.isna()]
    # astype("str") turns missing municipalities into the literal 'nan'
    # string, which the fallback logic below relies on.
    sources_df['location.municipality'] = sources_df['location.municipality'].astype("str")
    sources_df.drop(['entity_type','mdb_source_id','data_type','location.country_code','note',
                     'static_reference','urls.direct_download','urls.authentication_type','urls.license','location.bounding_box.extracted_on', 'urls.authentication_info','urls.api_key_parameter_name','features'],axis=1,inplace=True)
    file_names = []
    for _, row in sources_df.iterrows():
        # Prefer "<municipality>-<provider>-<state>"; fall back to the
        # subdivision (state) name when no municipality is set, and include
        # the feed name whenever the shorter form would collide with
        # another source from the same provider.
        if row['location.municipality'] != 'nan':
            if len(sources_df[(sources_df['location.municipality'] == row['location.municipality']) & (sources_df['provider'] == row['provider'])]) <= 1:
                f_name = str(row['location.municipality'])+'-'+str(row['provider'])+'-'+str(row['state_code'])
            else:
                f_name = str(row['location.municipality'])+'-'+str(row['provider'])+'-'+str(row['name'])+'-'+str(row['state_code'])
        else:
            if len(sources_df[(sources_df['location.subdivision_name'] == row['location.subdivision_name']) & (sources_df['provider'] == row['provider'])]) <= 1:
                f_name = str(row['location.subdivision_name'])+'-'+str(row['provider'])+'-'+str(row['state_code'])
            else:
                f_name = str(row['location.subdivision_name'])+'-'+str(row['provider'])+'-'+str(row['name'])+'-'+str(row['state_code'])
        f_name = f_name.replace('/','').strip()
        file_names.append(f_name)
    sources_df.drop(['provider','location.municipality','location.subdivision_name','name','state_code','state'],axis=1,inplace=True)
    sources_df.insert(0,'provider',file_names)
    # NOTE(review): the pattern is treated as a regex, so the '.' characters
    # match any char; in practice it strips the bounding-box prefix.
    sources_df.columns = sources_df.columns.str.replace('location.bounding_box.',"",regex=True)
    if place == 'ALL':
        return sources_df
    # Case-insensitive substring match across every column of each row.
    sources_df = sources_df[sources_df.apply(lambda row: row.astype(str).str.contains(place.lower(), case=False).any(), axis=1)]
    if len(sources_df) == 0:
        return "No sources found for the given place"
    return sources_df
def get_gtfs_segments(path)

It reads a GTFS file, and returns a list of segments

Args

path
the path to the GTFS file

Returns

A list of segments.

Expand source code
def get_gtfs_segments(path):
    """
    Read a GTFS file and return its stop-to-stop segments.

    Args:
      path: the path to the GTFS file

    Returns:
      A GeoDataFrame of segments (see ``process_feed``).
    """
    # The reader also returns busiest-day info, which is not needed here.
    _, feed = ptg_read_file(path)
    return process_feed(feed)
def pipeline_gtfs(filename, url, bounds, max_spacing)

It takes a GTFS file, downloads it, reads it, processes it, and then outputs a bunch of files.

Let's go through the function step by step.

First, we define the function and give it a name. We also give it a few arguments:

  • filename: the name of the file we want to save the output to.
  • url: the url of the GTFS file we want to download.
  • bounds: the bounding box of the area we want to analyze.
  • max_spacing: the maximum spacing we want to analyze.

We then create a folder to save the output to.

Next, we download the GTFS file and save it to the folder we just created.

Then, we read the GTFS file using the ptg_read_file function.

Args

filename
the name of the file you want to save the output to
url
the url of the GTFS file
bounds
the bounding box of the area you want to analyze, in the format [[min_lon, min_lat], [max_lon, max_lat]]
max_spacing
The maximum distance between stops that you want to consider.

Returns

a string with the name of the file that was processed.

Expand source code
def pipeline_gtfs(filename, url, bounds, max_spacing):
    """
    Download, read and process a GTFS feed, then write summary stats, a
    spacing histogram, and segment exports to ``output_files/<filename>``.

    Args:
      filename: the name of the file you want to save the output to
      url: the url of the GTFS file
      bounds: the bounding box of the area you want to analyze, in the form
        [[min_lon, min_lat], [max_lon, max_lat]] (as built by ``process``)
      max_spacing: The maximum distance between stops that you want to consider.

    Returns:
      a string with the name of the file that was processed, or the result of
      ``failed_pipeline`` on failure.
    """
    folder_path = os.path.join('output_files', filename)
    gtfs_file_loc = download_write_file(url, folder_path)

    ## Read the file using GTFS functions
    busiest_day, feed = ptg_read_file(gtfs_file_loc)
    ## Remove null entries; inspect_feed returns True on success, or an
    ## error-message string (hence the != True comparison, kept deliberately).
    message = inspect_feed(feed)
    if message != True:
        return failed_pipeline(message, filename, folder_path)

    df = process_feed(feed)
    # NOTE(review): 3000 is hard-coded here while the rest of the pipeline
    # uses max_spacing -- confirm whether this check should use max_spacing.
    df_sub = df[df['distance'] < 3000].copy().reset_index(drop=True)
    if len(df_sub) == 0:
        return failed_pipeline('Only Long Bus Routes in ', filename, folder_path)
    ## Output files and stats
    summary_stats_mobility(df, folder_path, filename, busiest_day, url, bounds, max_spacing, export=True)

    plot_hist(df, file_path=os.path.join(folder_path, 'spacings.png'), title=filename.split(".")[0], max_spacing=max_spacing, save_fig=True)
    export_segments(df, os.path.join(folder_path, 'geojson'), output_format='geojson', geometry=True)
    export_segments(df, os.path.join(folder_path, 'spacings_with_geometry'), output_format='csv', geometry=True)
    export_segments(df, os.path.join(folder_path, 'spacings'), output_format='csv', geometry=False)
    return "Success for " + filename
def plot_hist(df, save_fig=False, show_mean=False, **kwargs)

It takes a dataframe with two columns, one with the distance between stops and the other with the number of traversals between those stops, and plots a weighted histogram of the distances

Args

df
The dataframe that contains the data
save_fig
If True, the figure will be saved to the file_path. Defaults to False
show_mean
If True, will show the mean of the distribution. Defaults to False

Returns

A matplotlib axis

Expand source code
def plot_hist(df, save_fig=False, show_mean=False, **kwargs):
    """
    Plot a traversal-weighted histogram of stop spacings.

    Each distance is repeated by its traversal count, then drawn as a density
    histogram with a KDE overlay.

    Args:
      df: The dataframe that contains the data ('distance' and 'traversals' columns)
      save_fig: If True, the figure will be saved to the file_path. Defaults to False
      show_mean: If True, will show the mean of the distribution. Defaults to False
      **kwargs: optional 'max_spacing' (default 3000), 'ax' (existing axis to
        draw on), 'title', and 'file_path' (required when save_fig is True).

    Returns:
      A matplotlib axis
    """
    if "max_spacing" not in kwargs:
        max_spacing = 3000
        print("Using max_spacing = 3000")
    else:
        max_spacing = kwargs['max_spacing']
    # Track whether we created the figure: only figures we own are closed.
    fig = None
    if "ax" in kwargs:
        ax = kwargs['ax']
    else:
        fig, ax = plt.subplots(figsize=(8, 6))
    df = df[df['distance'] < max_spacing]
    # Weight each spacing by its number of traversals.
    data = np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)])
    plt.hist(data, range=(0, max_spacing), density=True, bins=int(max_spacing / 50), fc=(0, 105/255, 160/255, 0.4), ec="white", lw=0.8)
    x = np.arange(0, max_spacing, 5)
    plt.plot(x, gaussian_kde(data)(x), lw=1.5, color=(0, 85/255, 120/255, 1))
    plt.xlim([0, max_spacing])
    plt.xlabel('Stop Spacing [m]')
    plt.ylabel('Density - Traversal Weighted')
    plt.title("Histogram of Spacing")
    if show_mean:
        plt.axvline(np.mean(data), color='k', linestyle='dashed', linewidth=2)
        min_ylim, max_ylim = plt.ylim()
        plt.text(np.mean(data) * 1.1, max_ylim * 0.9, 'Mean: {:.0f}'.format(np.mean(data)), fontsize=12)
    if "title" in kwargs:
        plt.title(kwargs['title'])
    if save_fig:
        assert "file_path" in kwargs, "Please pass in the `file_path`"
        plt.savefig(kwargs['file_path'], dpi=300)
    plt.show()
    # BUG FIX: previously plt.close(fig) ran unconditionally and raised
    # NameError when the caller supplied `ax` (fig was never bound).
    if fig is not None:
        plt.close(fig)
    return ax
def process(pipeline_gtfs, row, max_spacing)

It takes a pipeline, a row from the sources_df, and a max_spacing, and returns the output of the pipeline

Args

pipeline_gtfs
This is the function that will be used to process the GTFS data.
row
This is a row in the sources_df dataframe. It contains the name of the provider, the url to the gtfs file, and the bounding box of the area that the gtfs file covers.
max_spacing
Maximum Allowed Spacing between two consecutive stops.

Returns

The return value is a tuple of the form (filename,folder_path,df)

Expand source code
def process(pipeline_gtfs, row, max_spacing):
    """
    Run ``pipeline_gtfs`` on one source row, reporting failures.

    Args:
      pipeline_gtfs: This is the function that will be used to process the GTFS data.
      row: This is a row in the sources_df dataframe. It contains the name of
        the provider ('provider'), the url to the gtfs file ('urls.latest'),
        and the bounding box columns of the area that the gtfs file covers.
      max_spacing: Maximum Allowed Spacing between two consecutive stops.

    Returns:
      Whatever the pipeline returns on success; on error, the result of
      ``failed_pipeline("Failed for ", filename, folder_path)``.
    """
    filename = row['provider']
    url = row['urls.latest']
    # Bounds format consumed downstream: [[min_lon, min_lat], [max_lon, max_lat]]
    bounds = [[row['minimum_longitude'], row['minimum_latitude']],
              [row['maximum_longitude'], row['maximum_latitude']]]
    print(filename)
    try:
        return pipeline_gtfs(filename, url, bounds, max_spacing)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; the traceback is printed for diagnosis.
        traceback.print_exc()
        folder_path = os.path.join('output_files', filename)
        return failed_pipeline("Failed for ", filename, folder_path)
def process_feed(feed)

It takes a GTFS feed, merges the trip and shape data, filters the stop_times data to only include the trips that are in the feed, merges the stop_times data with the stop data, creates a segment for each stop pair, gets the EPSG zone for the feed, creates a GeoDataFrame, and calculates the length of each segment

Args

feed
a GTFS feed object

Returns

A GeoDataFrame with the following columns:

Expand source code
def process_feed(feed):
    """
    Build a GeoDataFrame of stop-to-stop segments from a GTFS feed.

    Merges trip and shape data, filters stop_times to the feed's trips,
    attaches stop geometry, cuts each trip into stop-pair segments, and
    computes each segment's length in the feed's local projected CRS.

    Args:
      feed: a GTFS feed object

    Returns:
      A GeoDataFrame with one row per segment and a 'distance' column in the
      units of the projected CRS (presumably meters -- confirm that
      get_zone_epsg returns a metric zone).
    """
    trip_df = merge_trip_geom(feed.trips, feed.shapes)
    trip_ids = trip_df.trip_id.unique()
    stop_df = filter_stop_df(feed.stop_times, trip_ids)
    stop_loc_df = feed.stops[['stop_id', 'geometry']]
    stop_df = merge_stop_geom(stop_df, stop_loc_df)
    stop_df = stop_df.merge(trip_df, on='trip_id', how='left')
    stop_df = create_segments(stop_df)
    epsg_zone = get_zone_epsg(stop_df)
    # Keep only the columns that actually exist in this feed's output.
    subset_list = np.array(['route_id', 'shape_id', 'service_id', 'segment_id', 'stop_id1', 'stop_id2', 'direction_id', 'traversals', 'geometry'])
    # np.isin replaces the deprecated np.in1d with identical semantics here.
    col_subset = subset_list[np.isin(subset_list, stop_df.columns)]
    stop_df = stop_df[col_subset]
    stop_df = make_gdf(stop_df)
    # Project to the zone CRS so .length yields real distances.
    stop_df['distance'] = stop_df.to_crs(epsg_zone).geometry.length
    return stop_df
def summary_stats(df, export=False, **kwargs)

It takes in a dataframe, and returns a dataframe with summary statistics

Args

df
The dataframe that you want to get the summary statistics for.
export
If True, the summary will be exported to a csv file. Defaults to False

Returns

A dataframe with the summary statistics

Expand source code
def summary_stats(df, export=False, **kwargs):
    """
    Compute summary statistics of stop spacings for a segments dataframe.

    Args:
      df: The dataframe that you want to get the summary statistics for
        (requires 'distance', 'traversals', 'segment_id', 'route_id' columns).
      export: If True, the summary will be exported to a csv file
        (kwargs['file_path'] is required). Defaults to False
      **kwargs: optional 'max_spacing' (default 3000) and 'file_path'.

    Returns:
      A transposed single-column dataframe with the summary statistics.
    """
    if "max_spacing" not in kwargs:
        max_spacing = 3000
        print("Using max_spacing = 3000")
    else:
        max_spacing = kwargs['max_spacing']
    # Share of traversals on segments longer than max_spacing (computed
    # before filtering them out below).
    percent_spacing = round(df[df["distance"] > max_spacing]['traversals'].sum() / df['traversals'].sum() * 100, 3)
    df = df[df["distance"] <= max_spacing]
    # Each unique (segment, distance) / (route, segment, distance) counted once.
    stop_weighted_mean = df.groupby(['segment_id', 'distance']).first().reset_index()["distance"].mean()
    route_weighted_mean = df.groupby(['route_id', 'segment_id', 'distance']).first().reset_index()["distance"].mean()
    # Repeat each spacing by its traversal count for the weighted stats.
    weighted_data = np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)])

    df_dict = {
            'Segment Weighted Mean': stop_weighted_mean,
            'Route Weighted Mean': route_weighted_mean,
            'Traversal Weighted Mean': round(np.mean(weighted_data), 3),
            # BUG FIX: this previously reported np.mean again instead of the
            # standard deviation (cf. summary_stats_mobility, which uses np.std).
            'Traversal Weighted Std': round(np.std(weighted_data), 3),
            'Traversal Weighted 25 % Quantile': round(np.quantile(weighted_data, 0.25), 3),
            'Traversal Weighted 50 % Quantile': round(np.quantile(weighted_data, 0.50), 3),
            'Traversal Weighted 75 % Quantile': round(np.quantile(weighted_data, 0.75), 3),
            'No of Segments': int(len(df)),
            'No of Routes': int(len(df.route_id.unique())),
            'No of Traversals': int(sum(df.traversals)),
            'Max Spacing': int(max_spacing),
            '% Segments w/ spacing > max_spacing': percent_spacing}
    summary_df = pd.DataFrame([df_dict])
    if export:
        assert "file_path" in kwargs, "Please pass in the `file_path`"
        summary_df.to_csv(kwargs['file_path'], index=False)
        print("Saved the summary in " + kwargs['file_path'])
    summary_df = summary_df.T
    return summary_df
def summary_stats_mobility(df, folder_path, filename, b_day, link, bounds, max_spacing=3000, export=False)

It takes in a dataframe, a folder path, a filename, a busiest day, a link, a bounding box, a max spacing, and a boolean for exporting the summary to a csv.

It then calculates the percentage of segments that have a spacing greater than the max spacing. It then filters the dataframe to only include segments with a spacing less than the max spacing. It then calculates the segment weighted mean, route weighted mean, traversal weighted mean, traversal weighted standard deviation, traversal weighted 25th percentile, traversal weighted 50th percentile, traversal weighted 75th percentile, number of segments, number of routes, number of traversals, and the max spacing. It then creates a dictionary with all of the above values and creates a dataframe from the dictionary. It then exports the dataframe to a csv if the export boolean is true. If the export boolean is false, it transposes the dataframe and returns it.

Args

df
the dataframe containing the mobility data
folder_path
The path to the folder where you want to save the summary.csv file.
filename
The name of the file you want to save the data as.
b_day
The busiest day of the week
link
The link of the map you want to use.
bounds
The bounding box of the area you want to analyze.
max_spacing
The maximum distance between two stops that you want to consider. Defaults to 3000
export
If True, the summary will be saved as a csv file in the folder_path. If False, the summary will be returned as a dataframe. Defaults to False

Returns

A dataframe with the summary statistics of the mobility data.

Expand source code
def summary_stats_mobility(df, folder_path, filename, b_day, link, bounds, max_spacing=3000, export=False):
    """
    Build a one-row summary of stop-spacing statistics for a mobility feed.

    Computes the share of traversals above ``max_spacing``, filters the
    dataframe to spacings at or below it, and derives segment-, route- and
    traversal-weighted statistics plus feed metadata.

    Args:
      df: the dataframe containing the mobility data
      folder_path: The path to the folder where you want to save the summary.csv file.
      filename: The name of the file you want to save the data as.
      b_day: The busiest day of the week
      link: The link of the map you want to use.
      bounds: The bounding box of the area you want to analyze, indexed as
        bounds[0] = (min_lon, min_lat), bounds[1] = (max_lon, max_lat).
      max_spacing: The maximum distance between two stops that you want to consider. Defaults to 3000
      export: If True, the summary will be saved as a csv file in the
        folder_path; if False, the summary is returned as a transposed
        dataframe. Defaults to False

    Returns:
      A dataframe with the summary statistics of the mobility data, or a
      confirmation string when export is True.
    """
    total_traversals = df['traversals'].sum()
    over_limit = df[df["distance"] > max_spacing]['traversals'].sum()
    percent_spacing = round(over_limit / total_traversals * 100, 3)
    df = df[df["distance"] <= max_spacing]
    csv_path = os.path.join(folder_path, 'summary.csv')

    def _dedup_mean(keys):
        # Mean distance with duplicates collapsed over the given key columns.
        return df.groupby(keys).first().reset_index()["distance"].mean()

    seg_mean = _dedup_mean(['segment_id', 'distance'])
    route_mean = _dedup_mean(['route_id', 'segment_id', 'distance'])
    # Repeat each spacing by its traversal count for the weighted stats.
    spacings = np.hstack([np.repeat(d, t) for d, t in zip(df['distance'], df.traversals)])
    stats = {
        "Name": filename,
        'Busiest Day': b_day,
        'Link': link,
        'Min Latitude': bounds[0][1],
        'Min Longitude': bounds[0][0],
        'Max Latitude': bounds[1][1],
        'Max Longitude': bounds[1][0],
        'Segment Weighted Mean': seg_mean,
        'Route Weighted Mean': route_mean,
        'Traversal Weighted Mean': round(np.mean(spacings), 3),
        'Traversal Weighted Std': round(np.std(spacings), 3),
        'Traversal Weighted 25 % Quantile': round(np.quantile(spacings, 0.25), 3),
        'Traversal Weighted 50 % Quantile': round(np.quantile(spacings, 0.5), 3),
        'Traversal Weighted 75 % Quantile': round(np.quantile(spacings, 0.75), 3),
        'No of Segments': len(df),
        'No of Routes': len(df.route_id.unique()),
        'No of Traversals': sum(df.traversals),
        'Max Spacing': max_spacing,
        '% Segments w/ spacing > max_spacing': percent_spacing,
    }
    summary_df = pd.DataFrame([stats])
    if export:
        summary_df.to_csv(csv_path, index=False)
        return "Saved the summary.csv in " + folder_path
    return summary_df.T