Module gtfs_segments.gtfs_segments

Expand source code
import geopandas as gpd
from .partridge_func import ptg_read_file
from .geom_utils import *
from .utils import *
from .mobility import *


def merge_trip_geom(trip_df,shape_df):
    """
    Collapse the trips to one row per route/shape (and direction) and attach the shape geometry.

    Trips are grouped by `route_id` and `shape_id`, plus `direction_id` when that column exists
    and holds no nulls. Each group is reduced with `first()` (the first non-null value of every
    other column, e.g. `trip_id`) and the number of trips in the group is stored in `traversals`.
    The result is left-merged onto `shape_df`, so every shape row is kept even when no trip
    references it.

    Args:
      trip_df: a dataframe of trips; the code reads `route_id`, `shape_id` and `trip_id`
      shape_df: a GeoDataFrame of the shapes.txt file, merged on `shape_id`

    Returns:
      The merged frame passed through `make_gdf`
    """
    group_keys = ['route_id','shape_id']
    ## `direction_id` is optional: only group on it when it is present and fully populated
    if 'direction_id' in trip_df.columns and trip_df['direction_id'].isnull().sum() == 0:
        group_keys.append('direction_id')
    grp = trip_df.groupby(group_keys)
    trip_df = grp.first().reset_index()
    ## first() and count() list the groups in the same order, so after dropping the group index
    ## the counts line up with the rows positionally
    trip_df['traversals'] = grp.count().reset_index(drop=True)['trip_id']
    ## Keep only the columns this feed actually provides. A plain membership test is used
    ## instead of `np.in1d`, which is deprecated in NumPy 2.0
    wanted = ['route_id','trip_id','shape_id','service_id','direction_id','traversals']
    col_subset = [col for col in wanted if col in trip_df.columns]
    trip_df = trip_df[col_subset]
    ## Drop optional columns that are entirely empty (e.g. an all-null `direction_id`)
    trip_df = trip_df.dropna(how='all', axis=1)
    trip_df = shape_df.merge(trip_df, on='shape_id',how='left')
    return make_gdf(trip_df)

def create_segments(stop_df):
    """
    Turn a table of stop visits into one row per pair of consecutive stops of a trip.

    Args:
      stop_df: a dataframe with one row per stop visit. The code reads the columns `stop_id`,
        `trip_id`, `start` (the stop location) and `geometry` (the shape the stop is snapped to).
        Rows are presumably already ordered by trip and stop sequence -- verify against caller.

    Returns:
      A dataframe with `stop_id` renamed to `stop_id1` and these columns added: `stop_id2` and
      `end` (the next row of the same trip), `segment_id` ("<stop_id1>-<stop_id2>"),
      `snapped_start_id` and `snapped_end_id` (whether the snapped point lies within the shape).
      `geometry` is replaced by the output of `split_route`. Rows with any missing value are dropped.
    """
    def snap_to_shape(row):
        return nearest_snap(row['geometry'],row['start'])

    def next_rows_of_trip(trip_rows):
        return trip_rows.shift(-1)

    def build_segment_id(row):
        return str(row['stop_id1']) +'-'+ str(row['stop_id2'])

    def start_on_shape(row):
        return row['start'].within(row['geometry'])

    def end_on_shape(row):
        return row['end'].within(row['geometry'])

    segments = stop_df.rename(columns={'stop_id':'stop_id1'})
    ## nearest_snap output is parsed as WKT to rebuild the snapped start points
    snapped_wkts = segments.apply(snap_to_shape, axis=1)
    segments['start'] = gpd.GeoSeries.from_wkt(snapped_wkts)
    ## Shift every trip up by one row so each stop sees the stop that follows it
    following = segments.groupby('trip_id').apply(next_rows_of_trip)
    following = following.reset_index(drop=True)
    segments[['stop_id2','end']] = following[['stop_id1','start']]
    ## The last stop of a trip has no successor; dropna removes it (and any other row with a gap)
    segments = segments.dropna()
    segments = segments.reset_index(drop=True)
    segments['segment_id'] = segments.apply(build_segment_id, axis=1)
    segments['snapped_start_id'] = segments.apply(start_on_shape, axis=1)
    segments['snapped_end_id'] = segments.apply(end_on_shape, axis=1)
    ## split_route output is parsed as WKT and becomes the segment geometry
    segment_wkts = segments.apply(split_route, axis=1)
    segments['geometry'] = gpd.GeoSeries.from_wkt(segment_wkts)
    return segments

def filter_stop_df(stop_df,trip_ids):
    """
    Keep the stop visits of the requested trips, ordered by trip and stop sequence.

    Args:
      stop_df: the dataframe of all stop visits; must contain `trip_id`, `stop_id` and
        `stop_sequence`
      trip_ids: the trip_ids to keep

    Returns:
      A new dataframe with only the columns `trip_id`, `stop_id` and `stop_sequence`, restricted
      to the given trips, sorted by (`trip_id`, `stop_sequence`) and re-indexed from 0.
    """
    wanted_cols = ['trip_id','stop_id','stop_sequence']
    in_selected_trips = stop_df['trip_id'].isin(trip_ids)
    selected = stop_df.loc[in_selected_trips, wanted_cols]
    ordered = selected.sort_values(by=['trip_id','stop_sequence'])
    return ordered.reset_index(drop=True)

def merge_stop_geom(stop_df,stop_loc_df):
    """
    > Attach each stop's location to the stop rows as a `start` column, and then convert the
    result to a GeoDataFrame

    The work is done on a copy, so the caller's `stop_df` is left untouched.

    Args:
      stop_df: a dataframe of stops; must contain `stop_id`
      stop_loc_df: a GeoDataFrame of the stops with the columns `stop_id` and `geometry`

    Returns:
      A GeoDataFrame (via `make_gdf`) whose geometry column is `start`; stops without a match
      in `stop_loc_df` get a missing `start`
    """
    ## Keep one location per stop_id so the left merge returns exactly one row per stop_df row
    unique_locations = stop_loc_df.drop_duplicates(subset='stop_id')
    located = stop_df.merge(unique_locations,how='left',on='stop_id')
    stop_df = stop_df.copy()
    ## Assign by position: merge returns a fresh 0..n-1 index, so aligning on index labels
    ## would misplace locations whenever stop_df carries a non-default index
    stop_df['start'] = located['geometry'].values
    stop_df = gpd.GeoDataFrame(stop_df,geometry='start')
    return make_gdf(stop_df)
    
def process_feed(feed):
    """
    Build the stop-to-stop segments of a GTFS feed and measure their length.

    Steps: merge the trip and shape data, keep the stop_times rows of the resulting trips, attach
    the stop locations, join the trip/shape data back on `trip_id`, create a segment for each
    consecutive stop pair, then compute each segment's length in the CRS returned by
    `get_zone_epsg`.

    Args:
      feed: a GTFS feed object exposing `trips`, `shapes`, `stop_times` and `stops`

    Returns:
      A GeoDataFrame with whichever of these columns are available: `route_id`, `shape_id`,
      `service_id`, `segment_id`, `stop_id1`, `stop_id2`, `direction_id`, `traversals`,
      `geometry`, plus `distance` (segment length in the units of the `get_zone_epsg` CRS,
      presumably metres -- confirm)
    """
    trip_df = merge_trip_geom(feed.trips,feed.shapes)
    trip_ids = trip_df.trip_id.unique()
    stop_df = filter_stop_df(feed.stop_times,trip_ids)
    stop_loc_df = feed.stops[['stop_id','geometry']]
    stop_df = merge_stop_geom(stop_df,stop_loc_df)
    stop_df = stop_df.merge(trip_df,on='trip_id',how='left')
    stop_df = create_segments(stop_df)
    ## The zone is looked up before the columns are trimmed, while the full frame is available
    epsg_zone = get_zone_epsg(stop_df)
    ## Keep only the output columns this feed provides. A plain membership test is used instead
    ## of `np.in1d`, which is deprecated in NumPy 2.0
    wanted = ['route_id','shape_id','service_id','segment_id','stop_id1','stop_id2','direction_id','traversals','geometry']
    col_subset = [col for col in wanted if col in stop_df.columns]
    stop_df = stop_df[col_subset]
    stop_df = make_gdf(stop_df)
    ## Lengths are measured after reprojecting to the feed's zone CRS
    stop_df['distance'] = stop_df.to_crs(epsg_zone).geometry.length
    return stop_df

def inspect_feed(feed):
    """
    Check whether a feed has stop times and a `shape_id` column in the `trips` table

    Args:
      feed: The feed object to inspect; its `stop_times` and `trips` tables are read.

    Returns:
      True when both checks pass. Otherwise a message prefix: "Missing `shape_id` column in "
      when `trips` has no `shape_id` column (this one wins when both checks fail), or
      'No Bus Routes in ' when `stop_times` is empty.
    """
    ## Evaluate both checks up front, stop_times first and trips second
    has_no_stop_times = len(feed.stop_times) == 0
    lacks_shape_id = "shape_id" not in feed.trips.columns
    if lacks_shape_id:
        return "Missing `shape_id` column in "
    if has_no_stop_times:
        return 'No Bus Routes in '
    return True

def get_gtfs_segments(path):
    """
    > Read a GTFS file and build its stop-to-stop segments

    Args:
      path: the path to the GTFS file, passed straight to `ptg_read_file`

    Returns:
      The result of `process_feed` for the feed that was read.
    """
    ## ptg_read_file returns a pair; only the feed (second item) is used here
    _, feed = ptg_read_file(path)
    segments = process_feed(feed)
    return segments

def pipeline_gtfs(filename,url,bounds,max_spacing):
    """
    Download a GTFS file, read it, process it, and write the summary, plot and export files.

    Steps:

    - the output folder is `output_files/<filename>`
    - the GTFS file is downloaded into that folder with `download_write_file`
    - the file is read with `ptg_read_file` and checked with `inspect_feed`
    - the segments are computed with `process_feed`
    - the stats, the `spacings.png` histogram and the geojson/csv exports are written

    Args:
      filename: the name of the file you want to save the output to
      url: the url of the GTFS file
      bounds: the bounding box of the area you want to analyze. This is in the format
        [min_lat,min_lon,max_lat,max_lon]; it is only forwarded to `summary_stats_mobility`
      max_spacing: The maximum distance between stops that you want to consider; forwarded to
        `summary_stats_mobility` and `plot_hist`

    Returns:
      "Success for " + filename when everything was written, otherwise the return value of
      `failed_pipeline` for the rejected feed.
    """
    out_dir = os.path.join('output_files',filename)
    feed_path = download_write_file(url,out_dir)

    ## Read the downloaded file using the GTFS functions
    busiest_day, feed = ptg_read_file(feed_path)
    ## inspect_feed returns True for a usable feed, otherwise the failure message
    verdict = inspect_feed(feed)
    if verdict is not True:
        return failed_pipeline(verdict,filename,out_dir)

    segments = process_feed(feed)
    ## Reject the feed when no segment is shorter than 3000 (fixed threshold, independent of
    ## max_spacing; units of `distance` are presumably metres -- confirm)
    has_short_segment = (segments['distance'] < 3000).any()
    if not has_short_segment:
        return failed_pipeline('Only Long Bus Routes in ',filename,out_dir)

    ## Output files and stats
    summary_stats_mobility(segments,out_dir,filename,busiest_day,url,bounds,max_spacing,export=True)
    histogram_path = os.path.join(out_dir,'spacings.png')
    plot_hist(segments,file_path = histogram_path,title = filename.split(".")[0],max_spacing = max_spacing,save_fig=True)
    ## (output name, format, include geometry) for each export, written in this order
    export_jobs = [
        ('geojson','geojson',True),
        ('spacings_with_geometry','csv',True),
        ('spacings','csv',False),
    ]
    for out_name, out_format, with_geometry in export_jobs:
        export_segments(segments,os.path.join(out_dir,out_name), output_format = out_format,geometry = with_geometry)
    return "Success for "+filename

Functions

def create_segments(stop_df)

It takes a dataframe of stops and returns a dataframe of segments

Args

stop_df
a dataframe with the following columns:

Returns

A dataframe with the following columns:

Expand source code
def create_segments(stop_df):
    """
    Turn a table of stop visits into one row per pair of consecutive stops of a trip.

    Args:
      stop_df: a dataframe with one row per stop visit. The code reads the columns `stop_id`,
        `trip_id`, `start` (the stop location) and `geometry` (the shape the stop is snapped to).
        Rows are presumably already ordered by trip and stop sequence -- verify against caller.

    Returns:
      A dataframe with `stop_id` renamed to `stop_id1` and these columns added: `stop_id2` and
      `end` (the next row of the same trip), `segment_id` ("<stop_id1>-<stop_id2>"),
      `snapped_start_id` and `snapped_end_id` (whether the snapped point lies within the shape).
      `geometry` is replaced by the output of `split_route`. Rows with any missing value are dropped.
    """
    def snap_to_shape(row):
        return nearest_snap(row['geometry'],row['start'])

    def next_rows_of_trip(trip_rows):
        return trip_rows.shift(-1)

    def build_segment_id(row):
        return str(row['stop_id1']) +'-'+ str(row['stop_id2'])

    def start_on_shape(row):
        return row['start'].within(row['geometry'])

    def end_on_shape(row):
        return row['end'].within(row['geometry'])

    segments = stop_df.rename(columns={'stop_id':'stop_id1'})
    ## nearest_snap output is parsed as WKT to rebuild the snapped start points
    snapped_wkts = segments.apply(snap_to_shape, axis=1)
    segments['start'] = gpd.GeoSeries.from_wkt(snapped_wkts)
    ## Shift every trip up by one row so each stop sees the stop that follows it
    following = segments.groupby('trip_id').apply(next_rows_of_trip)
    following = following.reset_index(drop=True)
    segments[['stop_id2','end']] = following[['stop_id1','start']]
    ## The last stop of a trip has no successor; dropna removes it (and any other row with a gap)
    segments = segments.dropna()
    segments = segments.reset_index(drop=True)
    segments['segment_id'] = segments.apply(build_segment_id, axis=1)
    segments['snapped_start_id'] = segments.apply(start_on_shape, axis=1)
    segments['snapped_end_id'] = segments.apply(end_on_shape, axis=1)
    ## split_route output is parsed as WKT and becomes the segment geometry
    segment_wkts = segments.apply(split_route, axis=1)
    segments['geometry'] = gpd.GeoSeries.from_wkt(segment_wkts)
    return segments
def filter_stop_df(stop_df, trip_ids)

It takes a dataframe of stops and a list of trip IDs and returns a dataframe of stops that are in the list of trip IDs

Args

stop_df
the dataframe of all stops
trip_ids
a list of trip_ids that you want to filter the stop_df by

Returns

A dataframe with the trip_id, stop_id, and stop_sequence for the trips in the trip_ids list.

Expand source code
def filter_stop_df(stop_df,trip_ids):
    """
    Keep the stop visits of the requested trips, ordered by trip and stop sequence.

    Args:
      stop_df: the dataframe of all stop visits; must contain `trip_id`, `stop_id` and
        `stop_sequence`
      trip_ids: the trip_ids to keep

    Returns:
      A new dataframe with only the columns `trip_id`, `stop_id` and `stop_sequence`, restricted
      to the given trips, sorted by (`trip_id`, `stop_sequence`) and re-indexed from 0.
    """
    wanted_cols = ['trip_id','stop_id','stop_sequence']
    in_selected_trips = stop_df['trip_id'].isin(trip_ids)
    selected = stop_df.loc[in_selected_trips, wanted_cols]
    ordered = selected.sort_values(by=['trip_id','stop_sequence'])
    return ordered.reset_index(drop=True)
def get_gtfs_segments(path)

It reads a GTFS file, and returns a list of segments

Args

path
the path to the GTFS file

Returns

A GeoDataFrame of segments, as produced by process_feed.

Expand source code
def get_gtfs_segments(path):
    """
    > Read a GTFS file and build its stop-to-stop segments

    Args:
      path: the path to the GTFS file, passed straight to `ptg_read_file`

    Returns:
      The result of `process_feed` for the feed that was read.
    """
    ## ptg_read_file returns a pair; only the feed (second item) is used here
    _, feed = ptg_read_file(path)
    segments = process_feed(feed)
    return segments
def inspect_feed(feed)

It checks to see if the feed has any bus routes and if it has a shape_id column in the trips table

Args

feed
The feed object that you want to inspect.

Returns

A message

Expand source code
def inspect_feed(feed):
    """
    Check whether a feed has stop times and a `shape_id` column in the `trips` table

    Args:
      feed: The feed object to inspect; its `stop_times` and `trips` tables are read.

    Returns:
      True when both checks pass. Otherwise a message prefix: "Missing `shape_id` column in "
      when `trips` has no `shape_id` column (this one wins when both checks fail), or
      'No Bus Routes in ' when `stop_times` is empty.
    """
    ## Evaluate both checks up front, stop_times first and trips second
    has_no_stop_times = len(feed.stop_times) == 0
    lacks_shape_id = "shape_id" not in feed.trips.columns
    if lacks_shape_id:
        return "Missing `shape_id` column in "
    if has_no_stop_times:
        return 'No Bus Routes in '
    return True
def merge_stop_geom(stop_df, stop_loc_df)

Merge the stop_loc_df with the stop_df, and then convert the result to a GeoDataFrame

Args

stop_df
a dataframe of stops
stop_loc_df
a GeoDataFrame of the stops

Returns

A GeoDataFrame

Expand source code
def merge_stop_geom(stop_df,stop_loc_df):
    """
    > Attach each stop's location to the stop rows as a `start` column, and then convert the
    result to a GeoDataFrame

    The work is done on a copy, so the caller's `stop_df` is left untouched.

    Args:
      stop_df: a dataframe of stops; must contain `stop_id`
      stop_loc_df: a GeoDataFrame of the stops with the columns `stop_id` and `geometry`

    Returns:
      A GeoDataFrame (via `make_gdf`) whose geometry column is `start`; stops without a match
      in `stop_loc_df` get a missing `start`
    """
    ## Keep one location per stop_id so the left merge returns exactly one row per stop_df row
    unique_locations = stop_loc_df.drop_duplicates(subset='stop_id')
    located = stop_df.merge(unique_locations,how='left',on='stop_id')
    stop_df = stop_df.copy()
    ## Assign by position: merge returns a fresh 0..n-1 index, so aligning on index labels
    ## would misplace locations whenever stop_df carries a non-default index
    stop_df['start'] = located['geometry'].values
    stop_df = gpd.GeoDataFrame(stop_df,geometry='start')
    return make_gdf(stop_df)
def merge_trip_geom(trip_df, shape_df)

It takes a dataframe of trips and a dataframe of shapes, and returns a geodataframe of trips with the geometry of the shapes

Args

trip_df
a dataframe of trips
shape_df
a GeoDataFrame of the shapes.txt file

Returns

A GeoDataFrame

Expand source code
def merge_trip_geom(trip_df,shape_df):
    """
    Collapse the trips to one row per route/shape (and direction) and attach the shape geometry.

    Trips are grouped by `route_id` and `shape_id`, plus `direction_id` when that column exists
    and holds no nulls. Each group is reduced with `first()` (the first non-null value of every
    other column, e.g. `trip_id`) and the number of trips in the group is stored in `traversals`.
    The result is left-merged onto `shape_df`, so every shape row is kept even when no trip
    references it.

    Args:
      trip_df: a dataframe of trips; the code reads `route_id`, `shape_id` and `trip_id`
      shape_df: a GeoDataFrame of the shapes.txt file, merged on `shape_id`

    Returns:
      The merged frame passed through `make_gdf`
    """
    group_keys = ['route_id','shape_id']
    ## `direction_id` is optional: only group on it when it is present and fully populated
    if 'direction_id' in trip_df.columns and trip_df['direction_id'].isnull().sum() == 0:
        group_keys.append('direction_id')
    grp = trip_df.groupby(group_keys)
    trip_df = grp.first().reset_index()
    ## first() and count() list the groups in the same order, so after dropping the group index
    ## the counts line up with the rows positionally
    trip_df['traversals'] = grp.count().reset_index(drop=True)['trip_id']
    ## Keep only the columns this feed actually provides. A plain membership test is used
    ## instead of `np.in1d`, which is deprecated in NumPy 2.0
    wanted = ['route_id','trip_id','shape_id','service_id','direction_id','traversals']
    col_subset = [col for col in wanted if col in trip_df.columns]
    trip_df = trip_df[col_subset]
    ## Drop optional columns that are entirely empty (e.g. an all-null `direction_id`)
    trip_df = trip_df.dropna(how='all', axis=1)
    trip_df = shape_df.merge(trip_df, on='shape_id',how='left')
    return make_gdf(trip_df)
def pipeline_gtfs(filename, url, bounds, max_spacing)

It takes a GTFS file, downloads it, reads it, processes it, and then outputs a bunch of files.

Let's go through the function step by step.

First, we define the function and give it a name. We also give it a few arguments:

  • filename: the name of the file we want to save the output to.
  • url: the url of the GTFS file we want to download.
  • bounds: the bounding box of the area we want to analyze.
  • max_spacing: the maximum spacing we want to analyze.

We then create a folder to save the output to.

Next, we download the GTFS file and save it to the folder we just created.

Then, we read the GTFS file using the ptg_read_file function.

Args

filename
the name of the file you want to save the output to
url
the url of the GTFS file
bounds
the bounding box of the area you want to analyze. This is in the format

[min_lat,min_lon,max_lat,max_lon]

max_spacing
The maximum distance between stops that you want to consider.

Returns

a string with the name of the file that was processed.

Expand source code
def pipeline_gtfs(filename,url,bounds,max_spacing):
    """
    Download a GTFS file, read it, process it, and write the summary, plot and export files.

    Steps:

    - the output folder is `output_files/<filename>`
    - the GTFS file is downloaded into that folder with `download_write_file`
    - the file is read with `ptg_read_file` and checked with `inspect_feed`
    - the segments are computed with `process_feed`
    - the stats, the `spacings.png` histogram and the geojson/csv exports are written

    Args:
      filename: the name of the file you want to save the output to
      url: the url of the GTFS file
      bounds: the bounding box of the area you want to analyze. This is in the format
        [min_lat,min_lon,max_lat,max_lon]; it is only forwarded to `summary_stats_mobility`
      max_spacing: The maximum distance between stops that you want to consider; forwarded to
        `summary_stats_mobility` and `plot_hist`

    Returns:
      "Success for " + filename when everything was written, otherwise the return value of
      `failed_pipeline` for the rejected feed.
    """
    out_dir = os.path.join('output_files',filename)
    feed_path = download_write_file(url,out_dir)

    ## Read the downloaded file using the GTFS functions
    busiest_day, feed = ptg_read_file(feed_path)
    ## inspect_feed returns True for a usable feed, otherwise the failure message
    verdict = inspect_feed(feed)
    if verdict is not True:
        return failed_pipeline(verdict,filename,out_dir)

    segments = process_feed(feed)
    ## Reject the feed when no segment is shorter than 3000 (fixed threshold, independent of
    ## max_spacing; units of `distance` are presumably metres -- confirm)
    has_short_segment = (segments['distance'] < 3000).any()
    if not has_short_segment:
        return failed_pipeline('Only Long Bus Routes in ',filename,out_dir)

    ## Output files and stats
    summary_stats_mobility(segments,out_dir,filename,busiest_day,url,bounds,max_spacing,export=True)
    histogram_path = os.path.join(out_dir,'spacings.png')
    plot_hist(segments,file_path = histogram_path,title = filename.split(".")[0],max_spacing = max_spacing,save_fig=True)
    ## (output name, format, include geometry) for each export, written in this order
    export_jobs = [
        ('geojson','geojson',True),
        ('spacings_with_geometry','csv',True),
        ('spacings','csv',False),
    ]
    for out_name, out_format, with_geometry in export_jobs:
        export_segments(segments,os.path.join(out_dir,out_name), output_format = out_format,geometry = with_geometry)
    return "Success for "+filename
def process_feed(feed)

It takes a GTFS feed, merges the trip and shape data, filters the stop_times data to only include the trips that are in the feed, merges the stop_times data with the stop data, creates a segment for each stop pair, gets the EPSG zone for the feed, creates a GeoDataFrame, and calculates the length of each segment

Args

feed
a GTFS feed object

Returns

A GeoDataFrame with the following columns:

Expand source code
def process_feed(feed):
    """
    Build the stop-to-stop segments of a GTFS feed and measure their length.

    Steps: merge the trip and shape data, keep the stop_times rows of the resulting trips, attach
    the stop locations, join the trip/shape data back on `trip_id`, create a segment for each
    consecutive stop pair, then compute each segment's length in the CRS returned by
    `get_zone_epsg`.

    Args:
      feed: a GTFS feed object exposing `trips`, `shapes`, `stop_times` and `stops`

    Returns:
      A GeoDataFrame with whichever of these columns are available: `route_id`, `shape_id`,
      `service_id`, `segment_id`, `stop_id1`, `stop_id2`, `direction_id`, `traversals`,
      `geometry`, plus `distance` (segment length in the units of the `get_zone_epsg` CRS,
      presumably metres -- confirm)
    """
    trip_df = merge_trip_geom(feed.trips,feed.shapes)
    trip_ids = trip_df.trip_id.unique()
    stop_df = filter_stop_df(feed.stop_times,trip_ids)
    stop_loc_df = feed.stops[['stop_id','geometry']]
    stop_df = merge_stop_geom(stop_df,stop_loc_df)
    stop_df = stop_df.merge(trip_df,on='trip_id',how='left')
    stop_df = create_segments(stop_df)
    ## The zone is looked up before the columns are trimmed, while the full frame is available
    epsg_zone = get_zone_epsg(stop_df)
    ## Keep only the output columns this feed provides. A plain membership test is used instead
    ## of `np.in1d`, which is deprecated in NumPy 2.0
    wanted = ['route_id','shape_id','service_id','segment_id','stop_id1','stop_id2','direction_id','traversals','geometry']
    col_subset = [col for col in wanted if col in stop_df.columns]
    stop_df = stop_df[col_subset]
    stop_df = make_gdf(stop_df)
    ## Lengths are measured after reprojecting to the feed's zone CRS
    stop_df['distance'] = stop_df.to_crs(epsg_zone).geometry.length
    return stop_df