Module `gtfs_segments.gtfs_segments`

Expand source code

import geopandas as gpd
from .partridge_func import ptg_read_file
from .geom_utils import *
from .utils import *
from .mobility import *


def merge_trip_geom(trip_df,shape_df):
    """
    Collapse the trips to one representative row per route/shape (and direction, when usable) and
    attach those rows to the shape geometries.
    
    Args:
      trip_df: a dataframe of trips; it must contain `route_id`, `shape_id` and `trip_id`
      shape_df: a GeoDataFrame of the shapes.txt file; it must contain `shape_id`
    
    Returns:
      The result of `make_gdf` applied to `shape_df` left-merged with the collapsed trips. Matched
      rows carry a `traversals` column holding the number of non-null `trip_id` values in the group.
    """
    group_keys = ['route_id','shape_id']
    ## `direction_id` is optional: group by it only when the column exists and has no nulls
    if 'direction_id' in trip_df.columns and trip_df['direction_id'].notna().all():
        group_keys.append('direction_id')
    grp = trip_df.groupby(group_keys)
    ## The first non-null value of every remaining column represents the group
    trip_df = grp.first().reset_index()
    ## `first()` and `count()` emit the groups in the same order, so the counts line up by position
    trip_df['traversals'] = grp.count().reset_index(drop=True)['trip_id']
    wanted_cols = ['route_id','trip_id','shape_id','service_id','direction_id','traversals']
    ## Plain membership test instead of `np.in1d`, which NumPy 2.0 deprecated
    col_subset = [col for col in wanted_cols if col in trip_df.columns]
    trip_df = trip_df[col_subset]
    ## Drop columns that carry no information at all
    trip_df = trip_df.dropna(how='all', axis=1)
    trip_df = shape_df.merge(trip_df, on='shape_id',how='left')
    return make_gdf(trip_df)

def create_segments(stop_df):
    """
    Turn a table of stop visits into one row per pair of consecutive stops of a trip.
    
    Args:
      stop_df: a dataframe with (at least) the columns `trip_id`, `stop_id`, `geometry` and `start`
    
    Returns:
      The dataframe with `stop_id` renamed to `stop_id1`, with `stop_id2`, `end`, `segment_id`,
      `snapped_start_id` and `snapped_end_id` added, and with `geometry` rebuilt from the WKT
      returned by `split_route`. Rows containing any null value are dropped.
    """
    stop_df = stop_df.rename(columns={'stop_id':'stop_id1'})
    # Replace every `start` by the geometry parsed from what `nearest_snap` returns for that row
    snapped_wkts = stop_df.apply(lambda row: nearest_snap(row['geometry'],row['start']), axis = 1)
    stop_df['start'] = gpd.GeoSeries.from_wkt(snapped_wkts)
    # Shift each trip's rows up by one so that a row can see the following stop of the same trip
    next_rows = stop_df.groupby('trip_id').apply(lambda trip: trip.shift(-1)).reset_index(drop=True)
    stop_df[['stop_id2','end']] = next_rows[['stop_id1','start']]
    # The last stop of a trip has no successor; it is removed here together with any other null row
    stop_df = stop_df.dropna().reset_index(drop=True)
    stop_df['segment_id'] = stop_df.apply(lambda row: f"{row['stop_id1']}-{row['stop_id2']}", axis = 1)
    # Record whether each end point lies within the row's `geometry`
    for flag_col, point_col in (('snapped_start_id','start'),('snapped_end_id','end')):
        stop_df[flag_col] = stop_df.apply(lambda row, col=point_col: row[col].within(row['geometry']), axis = 1)
    segment_wkts = stop_df.apply(split_route, axis = 1)
    stop_df['geometry'] = gpd.GeoSeries.from_wkt(segment_wkts)
    return stop_df

def filter_stop_df(stop_df,trip_ids):
    """
    Keep only the stop rows that belong to the requested trips, ordered by trip and stop sequence.
    
    Args:
      stop_df: the dataframe of all stops; needs `trip_id`, `stop_id` and `stop_sequence` columns
      trip_ids: the trip_ids to keep
    
    Returns:
      A dataframe holding only `trip_id`, `stop_id` and `stop_sequence` for the selected trips,
      sorted by `trip_id` then `stop_sequence`, with a fresh 0..n-1 index.
    """
    wanted_cols = ['trip_id','stop_id','stop_sequence']
    subset = stop_df[wanted_cols]
    belongs_to_selection = subset['trip_id'].isin(trip_ids)
    ordered = subset[belongs_to_selection].sort_values(by=['trip_id','stop_sequence'])
    return ordered.reset_index(drop=True)

def merge_stop_geom(stop_df,stop_loc_df):
    """
    > Attach each stop's location to the stop table as a `start` geometry column
    
    The caller's `stop_df` is left untouched; the new column is added to a copy.
    
    Args:
      stop_df: a dataframe of stops with a `stop_id` column
      stop_loc_df: a GeoDataFrame of the stops with `stop_id` and `geometry` columns
    
    Returns:
      The result of `make_gdf` applied to a GeoDataFrame whose geometry column is `start`
    """
    # Copy first: assigning the new column on the argument itself would modify the caller's frame
    stop_df = stop_df.copy()
    located = stop_df.merge(stop_loc_df,how='left',on='stop_id')
    # The assignment aligns on index labels and the merge result has a fresh 0..n-1 index, so
    # `stop_df` is expected to carry a default index as well
    stop_df['start'] = located['geometry']
    stop_df = gpd.GeoDataFrame(stop_df,geometry='start')
    return make_gdf(stop_df)
    
def process_feed(feed):
    """
    Build the stop-to-stop segment table of a GTFS feed.
    
    The trips are merged with the shapes, the stop_times are restricted to the trips that survived
    that merge, the stop locations are attached, a segment is created for each pair of consecutive
    stops and finally the length of each segment is computed.
    
    Args:
      feed: a GTFS feed object exposing the `trips`, `shapes`, `stop_times` and `stops` tables
    
    Returns:
      The output of `make_gdf` for the segments, restricted to whichever of `route_id`, `shape_id`,
      `service_id`, `segment_id`, `stop_id1`, `stop_id2`, `direction_id`, `traversals` and
      `geometry` are present, plus a `distance` column with each segment's length measured in the
      CRS returned by `get_zone_epsg`.
    """
    trip_df = merge_trip_geom(feed.trips,feed.shapes)
    trip_ids = trip_df.trip_id.unique()
    stop_df = filter_stop_df(feed.stop_times,trip_ids)
    stop_loc_df = feed.stops[['stop_id','geometry']]
    stop_df = merge_stop_geom(stop_df,stop_loc_df)
    stop_df = stop_df.merge(trip_df,on='trip_id',how='left')
    stop_df = create_segments(stop_df)
    # The zone is derived from the full segment table, before the columns are trimmed
    epsg_zone = get_zone_epsg(stop_df)
    wanted_cols = ['route_id','shape_id','service_id','segment_id','stop_id1','stop_id2','direction_id','traversals','geometry']
    # Plain membership test instead of `np.in1d`, which NumPy 2.0 deprecated
    col_subset = [col for col in wanted_cols if col in stop_df.columns]
    stop_df = make_gdf(stop_df[col_subset])
    # Lengths are taken after reprojecting to `epsg_zone`
    stop_df['distance'] = stop_df.to_crs(epsg_zone).geometry.length
    return stop_df

def inspect_feed(feed):
    """
    Check that the feed has stop_times rows and that its `trips` table has a `shape_id` column.
    
    Args:
      feed: The feed object that you want to inspect.
    
    Returns:
      `True` when both checks pass, otherwise a message prefix describing the problem. When both
      checks fail, the missing `shape_id` message is the one returned.
    """
    # Both tables are looked at up front, stop_times first
    no_stop_times = len(feed.stop_times) == 0
    lacks_shape_id = "shape_id" not in feed.trips.columns
    if lacks_shape_id:
        return "Missing `shape_id` column in "
    if no_stop_times:
        return 'No Bus Routes in '
    return True

def get_gtfs_segments(path):
    """
    > Read a GTFS file and build its segments
    
    Args:
      path: the path to the GTFS file, handed to `ptg_read_file`
    
    Returns:
      Whatever `process_feed` returns for the feed that was read.
    """
    # `ptg_read_file` yields a pair; only the feed (second item) is needed here
    _, feed = ptg_read_file(path)
    return process_feed(feed)

def pipeline_gtfs(filename,url,bounds,max_spacing):
    """
    Download a GTFS feed, build its segments and export summary statistics, a histogram and the
    segment files.
    
    All outputs use the folder `output_files/<filename>`. The run stops early through
    `failed_pipeline` when `inspect_feed` reports a problem, or when no segment has a `distance`
    below 3000.
    
    Args:
      filename: name of the sub-folder of `output_files` used for the outputs; the text before the
    first "." is also used as the histogram title
      url: the url of the GTFS file, handed to `download_write_file`
      bounds: forwarded unchanged to `summary_stats_mobility` (earlier documentation describes it as
    [min_lat,min_lon,max_lat,max_lon]; this function does not inspect it)
      max_spacing: forwarded to `summary_stats_mobility` and `plot_hist`
    
    Returns:
      "Success for " followed by `filename`, or the return value of `failed_pipeline` when the run
      stops early.
    """
    out_dir = os.path.join('output_files',filename)
    gtfs_path = download_write_file(url,out_dir)
    
    ## Read the file using the GTFS functions
    busiest_day, feed = ptg_read_file(gtfs_path)
    ## Stop here when the feed does not pass inspection
    status = inspect_feed(feed)
    if status != True:
        return failed_pipeline(status,filename,out_dir)
    
    segments = process_feed(feed)
    ## At least one segment must be shorter than 3000 for the feed to be exported
    has_short_segment = (segments['distance'] < 3000).any()
    if not has_short_segment:
        return failed_pipeline('Only Long Bus Routes in ',filename,out_dir)
    ## Output files and Stats
    summary_stats_mobility(segments,out_dir,filename,busiest_day,url,bounds,max_spacing,export=True)

    hist_path = os.path.join(out_dir,'spacings.png')
    hist_title = filename.split(".")[0]
    plot_hist(segments,file_path = hist_path,title = hist_title,max_spacing = max_spacing,save_fig=True)
    ## (target name, output format, include geometry) for each exported file
    exports = (
        ('geojson','geojson',True),
        ('spacings_with_geometry','csv',True),
        ('spacings','csv',False),
    )
    for target, fmt, with_geometry in exports:
        export_segments(segments,os.path.join(out_dir,target), output_format = fmt,geometry = with_geometry)
    return "Success for "+filename

Functions

def create_segments(stop_df)

It takes a dataframe of stops and returns a dataframe of segments

Args

stop_df: a dataframe of stop visits with (at least) the columns trip_id, stop_id, geometry and start

Returns

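The dataframe with stop_id renamed to stop_id1, with the columns stop_id2, end, segment_id, snapped_start_id and snapped_end_id added, and with geometry rebuilt from the output of split_route. Rows containing any null value are dropped.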

Expand source code

def create_segments(stop_df):
    """
    Turn a table of stop visits into one row per pair of consecutive stops of a trip.
    
    Args:
      stop_df: a dataframe with (at least) the columns `trip_id`, `stop_id`, `geometry` and `start`
    
    Returns:
      The dataframe with `stop_id` renamed to `stop_id1`, with `stop_id2`, `end`, `segment_id`,
      `snapped_start_id` and `snapped_end_id` added, and with `geometry` rebuilt from the WKT
      returned by `split_route`. Rows containing any null value are dropped.
    """
    stop_df = stop_df.rename(columns={'stop_id':'stop_id1'})
    # Replace every `start` by the geometry parsed from what `nearest_snap` returns for that row
    snapped_wkts = stop_df.apply(lambda row: nearest_snap(row['geometry'],row['start']), axis = 1)
    stop_df['start'] = gpd.GeoSeries.from_wkt(snapped_wkts)
    # Shift each trip's rows up by one so that a row can see the following stop of the same trip
    next_rows = stop_df.groupby('trip_id').apply(lambda trip: trip.shift(-1)).reset_index(drop=True)
    stop_df[['stop_id2','end']] = next_rows[['stop_id1','start']]
    # The last stop of a trip has no successor; it is removed here together with any other null row
    stop_df = stop_df.dropna().reset_index(drop=True)
    stop_df['segment_id'] = stop_df.apply(lambda row: f"{row['stop_id1']}-{row['stop_id2']}", axis = 1)
    # Record whether each end point lies within the row's `geometry`
    for flag_col, point_col in (('snapped_start_id','start'),('snapped_end_id','end')):
        stop_df[flag_col] = stop_df.apply(lambda row, col=point_col: row[col].within(row['geometry']), axis = 1)
    segment_wkts = stop_df.apply(split_route, axis = 1)
    stop_df['geometry'] = gpd.GeoSeries.from_wkt(segment_wkts)
    return stop_df

def filter_stop_df(stop_df, trip_ids)

It takes a dataframe of stops and a list of trip IDs and returns a dataframe of stops that are in the list of trip IDs

Args

stop_df: the dataframe of all stops
trip_ids: a list of trip_ids that you want to filter the stop_df by

Returns

A dataframe with the trip_id, stop_id, and stop_sequence for the trips in the trip_ids list.

Expand source code

def filter_stop_df(stop_df,trip_ids):
    """
    Keep only the stop rows that belong to the requested trips, ordered by trip and stop sequence.
    
    Args:
      stop_df: the dataframe of all stops; needs `trip_id`, `stop_id` and `stop_sequence` columns
      trip_ids: the trip_ids to keep
    
    Returns:
      A dataframe holding only `trip_id`, `stop_id` and `stop_sequence` for the selected trips,
      sorted by `trip_id` then `stop_sequence`, with a fresh 0..n-1 index.
    """
    wanted_cols = ['trip_id','stop_id','stop_sequence']
    subset = stop_df[wanted_cols]
    belongs_to_selection = subset['trip_id'].isin(trip_ids)
    ordered = subset[belongs_to_selection].sort_values(by=['trip_id','stop_sequence'])
    return ordered.reset_index(drop=True)

def get_gtfs_segments(path)

It reads a GTFS file with ptg_read_file and returns the segments that process_feed builds from it

Args

path: the path to the GTFS file

Returns

The segments, exactly as returned by process_feed (a table of segments rather than a Python list).

Expand source code

def get_gtfs_segments(path):
    """
    > Read a GTFS file and build its segments
    
    Args:
      path: the path to the GTFS file, handed to `ptg_read_file`
    
    Returns:
      Whatever `process_feed` returns for the feed that was read.
    """
    # `ptg_read_file` yields a pair; only the feed (second item) is needed here
    _, feed = ptg_read_file(path)
    return process_feed(feed)

def inspect_feed(feed)

It checks to see if the feed has any bus routes and if it has a shape_id column in the trips table

Args

feed: The feed object that you want to inspect.

Returns

A message

Expand source code

def inspect_feed(feed):
    """
    Check that the feed has stop_times rows and that its `trips` table has a `shape_id` column.
    
    Args:
      feed: The feed object that you want to inspect.
    
    Returns:
      `True` when both checks pass, otherwise a message prefix describing the problem. When both
      checks fail, the missing `shape_id` message is the one returned.
    """
    # Both tables are looked at up front, stop_times first
    no_stop_times = len(feed.stop_times) == 0
    lacks_shape_id = "shape_id" not in feed.trips.columns
    if lacks_shape_id:
        return "Missing `shape_id` column in "
    if no_stop_times:
        return 'No Bus Routes in '
    return True

def merge_stop_geom(stop_df, stop_loc_df)

Merge the stop_loc_df with the stop_df, and then convert the result to a GeoDataFrame

Args

stop_df: a dataframe of stops
stop_loc_df: a GeoDataFrame of the stops

Returns

A GeoDataFrame

Expand source code

def merge_stop_geom(stop_df,stop_loc_df):
    """
    > Attach each stop's location to the stop table as a `start` geometry column
    
    The caller's `stop_df` is left untouched; the new column is added to a copy.
    
    Args:
      stop_df: a dataframe of stops with a `stop_id` column
      stop_loc_df: a GeoDataFrame of the stops with `stop_id` and `geometry` columns
    
    Returns:
      The result of `make_gdf` applied to a GeoDataFrame whose geometry column is `start`
    """
    # Copy first: assigning the new column on the argument itself would modify the caller's frame
    stop_df = stop_df.copy()
    located = stop_df.merge(stop_loc_df,how='left',on='stop_id')
    # The assignment aligns on index labels and the merge result has a fresh 0..n-1 index, so
    # `stop_df` is expected to carry a default index as well
    stop_df['start'] = located['geometry']
    stop_df = gpd.GeoDataFrame(stop_df,geometry='start')
    return make_gdf(stop_df)

def merge_trip_geom(trip_df, shape_df)

It takes a dataframe of trips and a dataframe of shapes, and returns a geodataframe of trips with the geometry of the shapes

Args

trip_df: a dataframe of trips
shape_df: a GeoDataFrame of the shapes.txt file

Returns

A GeoDataFrame

Expand source code

def merge_trip_geom(trip_df,shape_df):
    """
    Collapse the trips to one representative row per route/shape (and direction, when usable) and
    attach those rows to the shape geometries.
    
    Args:
      trip_df: a dataframe of trips; it must contain `route_id`, `shape_id` and `trip_id`
      shape_df: a GeoDataFrame of the shapes.txt file; it must contain `shape_id`
    
    Returns:
      The result of `make_gdf` applied to `shape_df` left-merged with the collapsed trips. Matched
      rows carry a `traversals` column holding the number of non-null `trip_id` values in the group.
    """
    group_keys = ['route_id','shape_id']
    ## `direction_id` is optional: group by it only when the column exists and has no nulls
    if 'direction_id' in trip_df.columns and trip_df['direction_id'].notna().all():
        group_keys.append('direction_id')
    grp = trip_df.groupby(group_keys)
    ## The first non-null value of every remaining column represents the group
    trip_df = grp.first().reset_index()
    ## `first()` and `count()` emit the groups in the same order, so the counts line up by position
    trip_df['traversals'] = grp.count().reset_index(drop=True)['trip_id']
    wanted_cols = ['route_id','trip_id','shape_id','service_id','direction_id','traversals']
    ## Plain membership test instead of `np.in1d`, which NumPy 2.0 deprecated
    col_subset = [col for col in wanted_cols if col in trip_df.columns]
    trip_df = trip_df[col_subset]
    ## Drop columns that carry no information at all
    trip_df = trip_df.dropna(how='all', axis=1)
    trip_df = shape_df.merge(trip_df, on='shape_id',how='left')
    return make_gdf(trip_df)

def pipeline_gtfs(filename, url, bounds, max_spacing)

It takes a GTFS file, downloads it, reads it, processes it, and then outputs a bunch of files.

Let's go through the function step by step.

First, we define the function and give it a name. We also give it a few arguments:

filename: the name of the file we want to save the output to.
url: the url of the GTFS file we want to download.
bounds: the bounding box of the area we want to analyze.
max_spacing: the maximum spacing we want to analyze.

We then create a folder to save the output to.

Next, we download the GTFS file and save it to the folder we just created.

Then, we read the GTFS file using the ptg_read_file function.

Args

filename: the name of the file you want to save the output to
url: the url of the GTFS file
bounds: the bounding box of the area you want to analyze. This is in the format [min_lat,min_lon,max_lat,max_lon]
max_spacing: The maximum distance between stops that you want to consider.

Returns

a string with the name of the file that was processed.

Expand source code

def pipeline_gtfs(filename,url,bounds,max_spacing):
    """
    Download a GTFS feed, build its segments and export summary statistics, a histogram and the
    segment files.
    
    All outputs use the folder `output_files/<filename>`. The run stops early through
    `failed_pipeline` when `inspect_feed` reports a problem, or when no segment has a `distance`
    below 3000.
    
    Args:
      filename: name of the sub-folder of `output_files` used for the outputs; the text before the
    first "." is also used as the histogram title
      url: the url of the GTFS file, handed to `download_write_file`
      bounds: forwarded unchanged to `summary_stats_mobility` (earlier documentation describes it as
    [min_lat,min_lon,max_lat,max_lon]; this function does not inspect it)
      max_spacing: forwarded to `summary_stats_mobility` and `plot_hist`
    
    Returns:
      "Success for " followed by `filename`, or the return value of `failed_pipeline` when the run
      stops early.
    """
    out_dir = os.path.join('output_files',filename)
    gtfs_path = download_write_file(url,out_dir)
    
    ## Read the file using the GTFS functions
    busiest_day, feed = ptg_read_file(gtfs_path)
    ## Stop here when the feed does not pass inspection
    status = inspect_feed(feed)
    if status != True:
        return failed_pipeline(status,filename,out_dir)
    
    segments = process_feed(feed)
    ## At least one segment must be shorter than 3000 for the feed to be exported
    has_short_segment = (segments['distance'] < 3000).any()
    if not has_short_segment:
        return failed_pipeline('Only Long Bus Routes in ',filename,out_dir)
    ## Output files and Stats
    summary_stats_mobility(segments,out_dir,filename,busiest_day,url,bounds,max_spacing,export=True)

    hist_path = os.path.join(out_dir,'spacings.png')
    hist_title = filename.split(".")[0]
    plot_hist(segments,file_path = hist_path,title = hist_title,max_spacing = max_spacing,save_fig=True)
    ## (target name, output format, include geometry) for each exported file
    exports = (
        ('geojson','geojson',True),
        ('spacings_with_geometry','csv',True),
        ('spacings','csv',False),
    )
    for target, fmt, with_geometry in exports:
        export_segments(segments,os.path.join(out_dir,target), output_format = fmt,geometry = with_geometry)
    return "Success for "+filename

def process_feed(feed)

It takes a GTFS feed, merges the trip and shape data, filters the stop_times data to only include the trips that are in the feed, merges the stop_times data with the stop data, creates a segment for each stop pair, gets the EPSG zone for the feed, creates a GeoDataFrame, and calculates the length of each segment

Args

feed: a GTFS feed object

Returns

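A GeoDataFrame restricted to whichever of the columns route_id, shape_id, service_id, segment_id, stop_id1, stop_id2, direction_id, traversals and geometry are present, plus a distance column holding the length of each segment.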

Expand source code

def process_feed(feed):
    """
    Build the stop-to-stop segment table of a GTFS feed.
    
    The trips are merged with the shapes, the stop_times are restricted to the trips that survived
    that merge, the stop locations are attached, a segment is created for each pair of consecutive
    stops and finally the length of each segment is computed.
    
    Args:
      feed: a GTFS feed object exposing the `trips`, `shapes`, `stop_times` and `stops` tables
    
    Returns:
      The output of `make_gdf` for the segments, restricted to whichever of `route_id`, `shape_id`,
      `service_id`, `segment_id`, `stop_id1`, `stop_id2`, `direction_id`, `traversals` and
      `geometry` are present, plus a `distance` column with each segment's length measured in the
      CRS returned by `get_zone_epsg`.
    """
    trip_df = merge_trip_geom(feed.trips,feed.shapes)
    trip_ids = trip_df.trip_id.unique()
    stop_df = filter_stop_df(feed.stop_times,trip_ids)
    stop_loc_df = feed.stops[['stop_id','geometry']]
    stop_df = merge_stop_geom(stop_df,stop_loc_df)
    stop_df = stop_df.merge(trip_df,on='trip_id',how='left')
    stop_df = create_segments(stop_df)
    # The zone is derived from the full segment table, before the columns are trimmed
    epsg_zone = get_zone_epsg(stop_df)
    wanted_cols = ['route_id','shape_id','service_id','segment_id','stop_id1','stop_id2','direction_id','traversals','geometry']
    # Plain membership test instead of `np.in1d`, which NumPy 2.0 deprecated
    col_subset = [col for col in wanted_cols if col in stop_df.columns]
    stop_df = make_gdf(stop_df[col_subset])
    # Lengths are taken after reprojecting to `epsg_zone`
    stop_df['distance'] = stop_df.to_crs(epsg_zone).geometry.length
    return stop_df