Package gtfs_segments
Expand source code
__version__ = '0.0.4'

from .gtfs_segments import (
    get_gtfs_segments,
    pipeline_gtfs,
    process_feed,
)
from .utils import (
    export_segments,
    plot_hist,
    process,
    summary_stats,
)
from .mobility import (
    fetch_gtfs_source,
    summary_stats_mobility,
    download_latest_data,
)

# Public API of the package. NOTE: "process" was previously listed twice.
__all__ = [
    "__version__",
    "get_gtfs_segments",
    "pipeline_gtfs",
    "process_feed",
    "export_segments",
    "plot_hist",
    "fetch_gtfs_source",
    "summary_stats",
    "process",
    "summary_stats_mobility",
    "download_latest_data",
]
Sub-modules
gtfs_segments.geom_utils
gtfs_segments.gtfs_segments
gtfs_segments.mobility
gtfs_segments.partridge_func
gtfs_segments.utils
Functions
def download_latest_data(out_folder_path, sources_df)
-
It iterates over the rows of the dataframe, and for each row, it tries to download the file from the URL in the
urls.latest
column, and write it to the folder specified in the provider
column.

Args
out_folder_path
- The path to the folder where you want to save the data
sources_df
- This is the dataframe that contains the urls for the data.
Expand source code
def download_latest_data(out_folder_path,sources_df): """ It iterates over the rows of the dataframe, and for each row, it tries to download the file from the URL in the `urls.latest` column, and write it to the folder specified in the `provider` column Args: out_folder_path: The path to the folder where you want to save the data sources_df: This is the dataframe that contains the urls for the data. """ for i,row in sources_df.iterrows(): try: download_write_file(row['urls.latest'],os.path.join(out_folder_path,row['provider'])) except: continue print("Downloaded the latest data")
def export_segments(df, file_path, output_format, geometry=True)
-
This function takes a GeoDataFrame of segments, a file path, an output format, and a boolean value for whether or not to include the geometry in the output.
If the output format is GeoJSON, the function will output the GeoDataFrame to a GeoJSON file.
If the output format is CSV, the function will output the GeoDataFrame to a CSV file. If the geometry boolean is set to True, the function will output the CSV file with the geometry column. If the geometry boolean is set to False, the function will output the CSV file without the geometry column.
The function will also add additional columns to the CSV file, including the start and end points of the segments, the start and end longitude and latitude of the segments, and the distance of the segments.
The function will also add a column to the CSV file that indicates the number of times the segment was traversed.
Args
df
- the dataframe containing the segments
file_path
- The path to the file you want to export to.
output_format
- geojson or csv
geometry
- If True, the output will include the geometry of the segments. If False, the output will
only include the start and end points of the segments. Defaults to True
Expand source code
def export_segments(df, file_path, output_format, geometry=True):
    """Export a GeoDataFrame of segments to GeoJSON or CSV.

    If ``output_format`` is ``'geojson'``, the GeoDataFrame is written to a
    GeoJSON file. If it is ``'csv'``, the data is written to a CSV file with
    derived columns: the WKT start/end points of each segment, the start/end
    longitude and latitude, the segment distance, and the number of times the
    segment was traversed. When ``geometry`` is True the CSV keeps the WKT
    start/end point columns plus the geometry column; when False it keeps the
    plain lat/lon columns instead.

    Args:
        df: the dataframe containing the segments.
        file_path: The path (without extension) to the file to export to.
        output_format: 'geojson' or 'csv'.
        geometry: If True, the output includes the geometry of the segments.
            If False, the output only includes the start and end coordinates.
            Defaults to True.
    """
    if output_format == 'geojson':
        ## GeoJSON output carries full geometry natively.
        df.to_file(file_path + '.json', driver="GeoJSON")
    elif output_format == 'csv':
        s_df = df[['route_id', 'segment_id', 'stop_id1', 'stop_id2',
                   'distance', 'traversals', 'geometry']].copy()
        # One coordinate array per segment; first/last entries are endpoints.
        geom_list = s_df.geometry.apply(lambda g: np.array(g.coords))
        s_df['start_point'] = [Point(g[0]).wkt for g in geom_list]
        s_df['end_point'] = [Point(g[-1]).wkt for g in geom_list]
        s_df['start_lon'] = [g[0][0] for g in geom_list]
        s_df['start_lat'] = [g[0][1] for g in geom_list]
        s_df['end_lon'] = [g[-1][0] for g in geom_list]
        s_df['end_lat'] = [g[-1][1] for g in geom_list]
        sg_df = s_df[['route_id', 'segment_id', 'stop_id1', 'stop_id2',
                      'distance', 'traversals', 'start_point', 'end_point',
                      'geometry']]
        if geometry:
            ## Output with the LineString geometry column
            sg_df.to_csv(file_path + '.csv', index=False)
        else:
            ## Output without geometry; endpoints as plain lat/lon columns
            d_df = s_df[['route_id', 'segment_id', 'stop_id1', 'stop_id2',
                         'start_lat', 'start_lon', 'end_lat', 'end_lon',
                         'distance', 'traversals']]
            d_df.to_csv(file_path + '.csv', index=False)
def fetch_gtfs_source(place='ALL')
-
It reads the mobility data sources csv file and generates a dataframe with the sources that are of type gtfs and are from the US
Args
place
- The place you want to get the GTFS data for. This can be a city, state, or country.
Defaults to ALL
Returns
A dataframe with sources
Expand source code
def fetch_gtfs_source(place='ALL'):
    """Fetch the mobility-database catalog of US GTFS sources.

    Reads the mobility data sources csv file and generates a dataframe with
    the sources that are of type gtfs and are from the US. Each source gets a
    unique, filesystem-safe `provider` name built from municipality/state,
    provider, and (when needed to disambiguate) the feed name.

    Args:
        place: The place you want to get the GTFS data for. This can be a
            city, state, or country. Defaults to ALL.

    Returns:
        A dataframe with sources, or a message string if no source matches
        ``place``.
    """
    abb_df = pd.read_json(ABBREV_link)
    sources_df = pd.read_csv(MOBILITY_SOURCES_link)
    # Keep only US GTFS feeds.
    sources_df = sources_df[sources_df['location.country_code'] == 'US']
    sources_df = sources_df[sources_df['data_type'] == 'gtfs']
    # Attach the two-letter state code via the state-name abbreviation table.
    sources_df = pd.merge(sources_df, abb_df, how='left',
                          left_on='location.subdivision_name', right_on='state')
    sources_df = sources_df[~sources_df.state_code.isna()]
    # Cast so missing municipalities become the string 'nan' (tested below).
    sources_df['location.municipality'] = sources_df['location.municipality'].astype("str")
    sources_df.drop(['entity_type', 'mdb_source_id', 'data_type',
                     'location.country_code', 'note', 'static_reference',
                     'urls.direct_download', 'urls.authentication_type',
                     'urls.license', 'location.bounding_box.extracted_on',
                     'urls.authentication_info', 'urls.api_key_parameter_name',
                     'features'], axis=1, inplace=True)
    file_names = []
    for _, row in sources_df.iterrows():
        if row['location.municipality'] != 'nan':
            # City-level feed: disambiguate with the feed name only when the
            # same provider has multiple feeds in the same municipality.
            if len(sources_df[(sources_df['location.municipality'] == row['location.municipality'])
                              & (sources_df['provider'] == row['provider'])]) <= 1:
                f_name = str(row['location.municipality']) + '-' + str(row['provider']) + '-' + str(row['state_code'])
            else:
                f_name = str(row['location.municipality']) + '-' + str(row['provider']) + '-' + str(row['name']) + '-' + str(row['state_code'])
        else:
            # No municipality: fall back to the state/subdivision name.
            if len(sources_df[(sources_df['location.subdivision_name'] == row['location.subdivision_name'])
                              & (sources_df['provider'] == row['provider'])]) <= 1:
                f_name = str(row['location.subdivision_name']) + '-' + str(row['provider']) + '-' + str(row['state_code'])
            else:
                f_name = str(row['location.subdivision_name']) + '-' + str(row['provider']) + '-' + str(row['name']) + '-' + str(row['state_code'])
        # Slashes would break the name when used as a file path.
        f_name = f_name.replace('/', '').strip()
        file_names.append(f_name)
    sources_df.drop(['provider', 'location.municipality',
                     'location.subdivision_name', 'name', 'state_code',
                     'state'], axis=1, inplace=True)
    sources_df.insert(0, 'provider', file_names)
    # Strip the 'location.bounding_box.' prefix from column names. This is a
    # literal prefix, so use regex=False (previously regex=True let the dots
    # match any character).
    sources_df.columns = sources_df.columns.str.replace('location.bounding_box.', "", regex=False)
    if place == 'ALL':
        return sources_df
    else:
        # case=False already makes the match case-insensitive; no need to
        # lowercase `place` first.
        sources_df = sources_df[sources_df.apply(
            lambda row: row.astype(str).str.contains(place, case=False).any(), axis=1)]
        if len(sources_df) == 0:
            return "No sources found for the given place"
        else:
            return sources_df
def get_gtfs_segments(path)
-
It reads a GTFS file, and returns a list of segments
Args
path
- the path to the GTFS file
Returns
A list of segments.
Expand source code
def get_gtfs_segments(path): """ > It reads a GTFS file, and returns a list of segments Args: path: the path to the GTFS file Returns: A list of segments. """ bday ,feed = ptg_read_file(path) return process_feed(feed)
def pipeline_gtfs(filename, url, bounds, max_spacing)
-
It takes a GTFS file, downloads it, reads it, processes it, and then outputs a bunch of files.
Let's go through the function step by step.
First, we define the function and give it a name. We also give it a few arguments:
- filename: the name of the file we want to save the output to.
- url: the url of the GTFS file we want to download.
- bounds: the bounding box of the area we want to analyze.
- max_spacing: the maximum spacing we want to analyze.
We then create a folder to save the output to.
Next, we download the GTFS file and save it to the folder we just created.
Then, we read the GTFS file using the
ptg_read_file
function.

Args
filename
- the name of the file you want to save the output to
url
- the url of the GTFS file
bounds
- the bounding box of the area you want to analyze. This is in the format
[min_lat,min_lon,max_lat,max_lon]
max_spacing
- The maximum distance between stops that you want to consider.
Returns
a string with the name of the file that was processed.
Expand source code
def pipeline_gtfs(filename, url, bounds, max_spacing):
    """Download, read, and process a GTFS feed, then write out all artifacts.

    The pipeline: create an output folder, download the GTFS file into it,
    read it with `ptg_read_file`, validate it, compute segments, and export
    summary stats, a histogram, GeoJSON, and CSVs.

    Args:
        filename: the name of the file you want to save the output to.
        url: the url of the GTFS file.
        bounds: the bounding box of the area you want to analyze, in the
            format [[min_lon, min_lat], [max_lon, max_lat]].
        max_spacing: The maximum distance between stops that you want to
            consider.

    Returns:
        a string with the name of the file that was processed (or a failure
        message from `failed_pipeline`).
    """
    folder_path = os.path.join('output_files', filename)
    gtfs_file_loc = download_write_file(url, folder_path)

    ## read file using GTFS functions (also yields the busiest service day)
    busiest_day, feed = ptg_read_file(gtfs_file_loc)

    ## Validate the feed before doing any work
    message = inspect_feed(feed)
    if message != True:
        return failed_pipeline(message, filename, folder_path)

    df = process_feed(feed)
    # BUG FIX: this threshold was hard-coded to 3000 even though the caller
    # supplies max_spacing; use the parameter so the "all routes too long"
    # check matches the requested analysis window.
    df_sub = df[df['distance'] < max_spacing].copy().reset_index(drop=True)
    if len(df_sub) == 0:
        return failed_pipeline('Only Long Bus Routes in ', filename, folder_path)

    ## Output files and Stats
    summary_stats_mobility(df, folder_path, filename, busiest_day, url, bounds,
                           max_spacing, export=True)
    plot_hist(df, file_path=os.path.join(folder_path, 'spacings.png'),
              title=filename.split(".")[0], max_spacing=max_spacing, save_fig=True)
    export_segments(df, os.path.join(folder_path, 'geojson'),
                    output_format='geojson', geometry=True)
    export_segments(df, os.path.join(folder_path, 'spacings_with_geometry'),
                    output_format='csv', geometry=True)
    export_segments(df, os.path.join(folder_path, 'spacings'),
                    output_format='csv', geometry=False)
    return "Success for " + filename
def plot_hist(df, save_fig=False, show_mean=False, **kwargs)
-
It takes a dataframe with two columns, one with the distance between stops and the other with the number of traversals between those stops, and plots a weighted histogram of the distances
Args
df
- The dataframe that contains the data
save_fig
- If True, the figure will be saved to the file_path. Defaults to False
show_mean
- If True, will show the mean of the distribution. Defaults to False
Returns
A matplotlib axis
Expand source code
def plot_hist(df,save_fig = False,show_mean = False,**kwargs): """ It takes a dataframe with two columns, one with the distance between stops and the other with the number of traversals between those stops, and plots a weighted histogram of the distances Args: df: The dataframe that contains the data save_fig: If True, the figure will be saved to the file_path. Defaults to False show_mean: If True, will show the mean of the distribution. Defaults to False Returns: A matplotlib axis """ if "max_spacing" not in kwargs.keys(): max_spacing = 3000 print("Using max_spacing = 3000") else: max_spacing = kwargs['max_spacing'] if "ax" in kwargs.keys(): ax = kwargs['ax'] else: fig, ax = plt.subplots(figsize=(8,6)) df = df[df['distance'] < max_spacing] data = np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)]) plt.hist(data,range=(0,max_spacing),density = True,bins = int(max_spacing/50),fc=(0, 105/255, 160/255, 0.4),ec = "white",lw =0.8) x = np.arange(0,max_spacing,5) plt.plot(x,gaussian_kde(data)(x),lw = 1.5,color=(0, 85/255, 120/255, 1)) # sns.histplot(data,binwidth=50,stat = "density",kde=True,ax=ax) plt.xlim([0,max_spacing]) plt.xlabel('Stop Spacing [m]') plt.ylabel('Density - Traversal Weighted') plt.title("Histogram of Spacing") if show_mean: plt.axvline(np.mean(data), color='k', linestyle='dashed', linewidth=2) min_ylim, max_ylim = plt.ylim() plt.text(np.mean(data)*1.1, max_ylim*0.9, 'Mean: {:.0f}'.format(np.mean(data)),fontsize=12) if "title" in kwargs.keys(): plt.title(kwargs['title']) if save_fig == True: assert "file_path" in kwargs.keys(), "Please pass in the `file_path`" plt.savefig(kwargs['file_path'], dpi=300) plt.show() plt.close(fig) return ax
def process(pipeline_gtfs, row, max_spacing)
-
It takes a pipeline, a row from the sources_df, and a max_spacing, and returns the output of the pipeline
Args
pipeline_gtfs
- This is the function that will be used to process the GTFS data.
row
- This is a row in the sources_df dataframe. It contains the name of the provider, the url to
the gtfs file, and the bounding box of the area that the gtfs file covers.
max_spacing
- Maximum Allowed Spacing between two consecutive stops.
Returns
The return value is a tuple of the form (filename,folder_path,df)
Expand source code
def process(pipeline_gtfs,row,max_spacing): """ It takes a pipeline, a row from the sources_df, and a max_spacing, and returns the output of the pipeline Args: pipeline_gtfs: This is the function that will be used to process the GTFS data. row: This is a row in the sources_df dataframe. It contains the name of the provider, the url to the gtfs file, and the bounding box of the area that the gtfs file covers. max_spacing: Maximum Allowed Spacing between two consecutive stops. Returns: The return value is a tuple of the form (filename,folder_path,df) """ filename = row['provider'] url = row['urls.latest'] bounds = [[row['minimum_longitude'],row['minimum_latitude']],[row['maximum_longitude'],row['maximum_latitude']]] print(filename) try: return pipeline_gtfs(filename,url,bounds,max_spacing) except: traceback.print_exc() folder_path = os.path.join('output_files',filename) return failed_pipeline("Failed for ",filename,folder_path)
def process_feed(feed)
-
It takes a GTFS feed, merges the trip and shape data, filters the stop_times data to only include the trips that are in the feed, merges the stop_times data with the stop data, creates a segment for each stop pair, gets the EPSG zone for the feed, creates a GeoDataFrame, and calculates the length of each segment
Args
feed
- a GTFS feed object
Returns
A GeoDataFrame with the following columns:
Expand source code
def process_feed(feed): """ It takes a GTFS feed, merges the trip and shape data, filters the stop_times data to only include the trips that are in the feed, merges the stop_times data with the stop data, creates a segment for each stop pair, gets the EPSG zone for the feed, creates a GeoDataFrame, and calculates the length of each segment Args: feed: a GTFS feed object Returns: A GeoDataFrame with the following columns: """ trip_df = merge_trip_geom(feed.trips,feed.shapes) trip_ids = trip_df.trip_id.unique() stop_df = filter_stop_df(feed.stop_times,trip_ids) stop_loc_df = feed.stops[['stop_id','geometry']] stop_df = merge_stop_geom(stop_df,stop_loc_df) stop_df = stop_df.merge(trip_df,on='trip_id',how='left') stop_df = create_segments(stop_df) # return stop_df epsg_zone = get_zone_epsg(stop_df) subset_list = np.array(['route_id','shape_id','service_id','segment_id','stop_id1','stop_id2','direction_id','traversals','geometry']) col_subset = subset_list[np.in1d(subset_list,stop_df.columns)] stop_df = stop_df[col_subset] stop_df = make_gdf(stop_df) stop_df['distance'] = stop_df.to_crs(epsg_zone).geometry.length return stop_df
def summary_stats(df, export=False, **kwargs)
-
It takes in a dataframe, and returns a dataframe with summary statistics
Args
df
- The dataframe that you want to get the summary statistics for.
export
- If True, the summary will be exported to a csv file. Defaults to False
Returns
A dataframe with the summary statistics
Expand source code
def summary_stats(df,export = False,**kwargs): """ It takes in a dataframe, and returns a dataframe with summary statistics Args: df: The dataframe that you want to get the summary statistics for. export: If True, the summary will be exported to a csv file. Defaults to False Returns: A dataframe with the summary statistics """ if "max_spacing" not in kwargs.keys(): max_spacing = 3000 print("Using max_spacing = 3000") else: max_spacing = kwargs['max_spacing'] percent_spacing = round(df[df["distance"] > max_spacing]['traversals'].sum()/df['traversals'].sum() *100,3) df = df[df["distance"] <= max_spacing] stop_weighted_mean = df.groupby(['segment_id','distance']).first().reset_index()["distance"].mean() route_weighted_mean = df.groupby(['route_id','segment_id','distance']).first().reset_index()["distance"].mean() weighted_data = np.hstack([np.repeat(x, y) for x, y in zip(df['distance'], df.traversals)]) df_dict = { 'Segment Weighted Mean' : stop_weighted_mean, 'Route Weighted Mean' : route_weighted_mean, 'Traversal Weighted Mean': round(np.mean(weighted_data),3), 'Traversal Weighted Std': round(np.mean(weighted_data),3), 'Traversal Weighted 25 % Quantile': round(np.quantile(weighted_data,0.25),3), 'Traversal Weighted 50 % Quantile': round(np.quantile(weighted_data,0.50),3), 'Traversal Weighted 75 % Quantile': round(np.quantile(weighted_data,0.75),3), 'No of Segments':int(len(df)), 'No of Routes':int(len(df.route_id.unique())), 'No of Traversals':int(sum(df.traversals)), 'Max Spacing':int(max_spacing), '% Segments w/ spacing > max_spacing':percent_spacing} summary_df = pd.DataFrame([df_dict]) # df.set_index(summary_df.columns[0],inplace=True) if export: assert "file_path" in kwargs.keys(), "Please pass in the `file_path`" summary_df.to_csv(kwargs['file_path'],index = False) print("Saved the summary in "+kwargs['file_path']) summary_df = summary_df.T return summary_df
def summary_stats_mobility(df, folder_path, filename, b_day, link, bounds, max_spacing=3000, export=False)
-
It takes in a dataframe, a folder path, a filename, a busiest day, a link, a bounding box, a max spacing, and a boolean for exporting the summary to a csv.
It then calculates the percentage of segments that have a spacing greater than the max spacing. It then filters the dataframe to only include segments with a spacing less than the max spacing. It then calculates the segment weighted mean, route weighted mean, traversal weighted mean, traversal weighted standard deviation, traversal weighted 25th percentile, traversal weighted 50th percentile, traversal weighted 75th percentile, number of segments, number of routes, number of traversals, and the max spacing. It then creates a dictionary with all of the above values and creates a dataframe from the dictionary. It then exports the dataframe to a csv if the export boolean is true. If the export boolean is false, it transposes the dataframe and returns it.
Args
df
- the dataframe containing the mobility data
folder_path
- The path to the folder where you want to save the summary.csv file.
filename
- The name of the file you want to save the data as.
b_day
- The busiest day of the week
link
- The link of the map you want to use.
bounds
- The bounding box of the area you want to analyze.
max_spacing
- The maximum distance between two stops that you want to consider. Defaults to 3000
export
- If True, the summary will be saved as a csv file in the folder_path. If False, the summary
will be returned as a dataframe. Defaults to False
Returns
A dataframe with the summary statistics of the mobility data.
Expand source code
def summary_stats_mobility(df, folder_path, filename, b_day, link, bounds, max_spacing=3000, export=False):
    """Build a summary-statistics table for a processed mobility feed.

    Computes the share of segments whose spacing exceeds ``max_spacing``,
    filters the dataframe down to segments within that threshold, and then
    derives segment-, route-, and traversal-weighted spacing statistics
    together with feed metadata (name, busiest day, link, bounding box).

    Args:
        df: the dataframe containing the mobility data.
        folder_path: The path to the folder where you want to save the
            summary.csv file.
        filename: The name of the file you want to save the data as.
        b_day: The busiest day of the week.
        link: The link of the map you want to use.
        bounds: The bounding box of the area you want to analyze, as
            [[min_lon, min_lat], [max_lon, max_lat]].
        max_spacing: The maximum distance between two stops that you want to
            consider. Defaults to 3000.
        export: If True, the summary will be saved as a csv file in the
            folder_path. If False, the summary will be returned as a
            dataframe. Defaults to False.

    Returns:
        A dataframe with the summary statistics of the mobility data, or a
        status string when ``export`` is True.
    """
    # Traversal share above the threshold, measured before filtering.
    over_threshold = df[df["distance"] > max_spacing]['traversals'].sum()
    percent_spacing = round(over_threshold / df['traversals'].sum() * 100, 3)
    df = df[df["distance"] <= max_spacing]
    csv_path = os.path.join(folder_path, 'summary.csv')

    # De-duplicate per segment (and per route+segment) before averaging.
    seg_mean = df.groupby(['segment_id', 'distance']).first().reset_index()["distance"].mean()
    route_mean = df.groupby(['route_id', 'segment_id', 'distance']).first().reset_index()["distance"].mean()
    # Repeat each spacing by its traversal count to weight the statistics.
    spacings = np.hstack([np.repeat(d, t) for d, t in zip(df['distance'], df.traversals)])

    summary = {
        "Name": filename,
        'Busiest Day': b_day,
        'Link': link,
        'Min Latitude': bounds[0][1],
        'Min Longitude': bounds[0][0],
        'Max Latitude': bounds[1][1],
        'Max Longitude': bounds[1][0],
        'Segment Weighted Mean': seg_mean,
        'Route Weighted Mean': route_mean,
        'Traversal Weighted Mean': round(np.mean(spacings), 3),
        'Traversal Weighted Std': round(np.std(spacings), 3),
        'Traversal Weighted 25 % Quantile': round(np.quantile(spacings, 0.25), 3),
        'Traversal Weighted 50 % Quantile': round(np.quantile(spacings, 0.5), 3),
        'Traversal Weighted 75 % Quantile': round(np.quantile(spacings, 0.75), 3),
        'No of Segments': len(df),
        'No of Routes': len(df.route_id.unique()),
        'No of Traversals': sum(df.traversals),
        'Max Spacing': max_spacing,
        '% Segments w/ spacing > max_spacing': percent_spacing,
    }
    summary_df = pd.DataFrame([summary])
    if export:
        summary_df.to_csv(csv_path, index=False)
        return "Saved the summary.csv in " + folder_path
    return summary_df.T