# import the requisite libraries
import pandas as pd
import csv
from tabulate import tabulate
# Load the four source dataframes from the project's GitHub repository.
_REPO_BASE = 'https://raw.githubusercontent.com/lshpaner/sd_streets/'

# streets overall condition index (OCI)
df1 = pd.read_csv(_REPO_BASE + 'main/data/oci_2015_datasd.csv')
# street repair projects
df2 = pd.read_csv(_REPO_BASE + 'main/data/sd_paving_datasd.csv')
# traffic volumes
df3 = pd.read_csv(_REPO_BASE + 'main/data/traffic_counts_datasd.csv')
# pre-merged dataframe of the three sources
df4 = pd.read_csv(_REPO_BASE + 'main/merged_data/sd_roads_dataframe.csv')

# output location for the generated report files
path = '/content/drive/My Drive/Python Projects/san_diego_street_conditions'
def build_report(df1, df2, df3, df4):
    """Profile the four street-condition dataframes and build a text report.

    Each section is printed to stdout and simultaneously collected, so the
    on-screen output and the returned report are identical. Sections:
    file shapes, distinct counts of every ``*_id`` column per file, columns
    shared between the repairs and OCI files, and ``pve_id`` / ``seg_id``
    presence and match counts against the Streets OCI file.

    Parameters
    ----------
    df1 : pd.DataFrame
        Streets OCI (Overall Condition Index) data.
    df2 : pd.DataFrame
        Street repair projects data.
    df3 : pd.DataFrame
        Traffic volumes data.
    df4 : pd.DataFrame
        Merged dataframe of the three sources.

    Returns
    -------
    pd.DataFrame
        One column, 'San Diego Street Conditions Classification Report',
        holding every report line in display order.
    """
    log_txt = []  # accumulates every printed line for the returned report

    def _log(line):
        # Echo a line to stdout and keep a copy for the report.
        print(line)
        log_txt.append(line)

    def _shape(df, title, tag):
        # Report a section title plus the file's row/column counts.
        _log(title)
        _log(f'No. of Rows in {tag}: {df.shape[0]:,}')
        _log(f'No. of Columns in {tag}: {df.shape[1]:,}')
        _log(' ')

    def _id_counts(df, missing_msg):
        # Tabulate a distinct count of every column whose name contains
        # '_id'; if the file has none, report `missing_msg` instead.
        id_cols = df.filter(like='_id').columns
        if id_cols.any():
            counts = df[id_cols].nunique().apply(lambda x: "{:,}".format(x))
            counts = pd.DataFrame(counts).reset_index()
            counts = counts.rename(columns={0: 'Distinct Count',
                                            'index': 'ID Columns'})
            _log(tabulate(counts, headers='keys', tablefmt='psql'))
        else:
            _log(missing_msg)
        _log(' ')

    def _match_count(series, key):
        # Count how many values of `series` appear in Streets OCI (df1).
        # BUG FIX: the original compared the series with *itself*
        # (`dataframe[key].isin(dataframe[key])`), which is trivially
        # all-True, so the reported "matches in Streets OCI" was just the
        # row count. It also indexed `.value_counts()[1]`, a KeyError when
        # nothing matches; `.sum()` on the boolean mask is safe.
        ref = df1[key] if key in df1 else pd.Series([], dtype=object)
        return int(series.isin(ref).sum())

    # file names and their dataframes, kept in lockstep for the zip loops
    names = ['Streets OCI', 'Street Repair Projects',
             'Traffic Volumes', 'Merged Dataframe']
    dataframes = df1, df2, df3, df4

    _log(' ')

    # shape of each file, then the distinct-id tables per source file
    _shape(df4, 'Merged File', 'Merged File')
    _shape(df1, 'Streets Overall Condition Index (OCI)', 'OCI File')
    _id_counts(df1, 'Street OCI IDs DO NOT exist.')
    _shape(df2, 'Street Repair Projects', 'Street Repairs File')
    # BUG FIX: the original gated this section on `df2[oci_id]` — df1's id
    # columns — which raises a KeyError whenever df2 lacks them and tests
    # the wrong file's columns regardless; df2's own id columns are used.
    _id_counts(df2, 'Street Repairs IDs DO NOT exist.')
    _shape(df3, 'Traffic Volumes', 'Traffic Volumes File')
    _id_counts(df3, 'Traffic Volume IDs DO NOT exist.')

    ###########################
    ### Cross-file matching ###
    ###########################
    # columns common to the repairs and OCI files (set order is arbitrary)
    shared = list(set(df1.columns).intersection(set(df2.columns)))
    if len(shared) != 0:
        shared_df = pd.DataFrame(shared)
        shared_df = shared_df.rename(
            columns={0: 'Shared Columns Between Street Repairs'
                        + ' and OCI File'})
        _log(tabulate(shared_df, headers='keys', tablefmt='psql'))
    else:
        _log('There are no shared columns between Street Repairs file'
             + ' and OCI file.')
    _log(' ')

    # presence of `pve_id` in each dataframe
    _log(tabulate({'Matching Pve_IDs Across Files':
                   ['Matching Pve_IDs Across Files']}, headers='keys'))
    for name, dataframe in zip(names, dataframes):
        # `in` on a DataFrame tests column membership
        _log(name + ': pve_id' + ' = ' + str('pve_id' in dataframe))
    _log(' ')

    # count of each file's pve_ids that also appear in Streets OCI
    for name, dataframe in zip(names, dataframes):
        if 'pve_id' in dataframe:
            matches = _match_count(dataframe['pve_id'], 'pve_id')
            _log('There are ' + f"{matches:,}" + ' ' + name +
                 ' pve_ids in Streets OCI.')
        else:
            _log('There are no ' + name + ' pve_ids ' + 'in Streets OCI.')
    _log(' ')

    # presence of `seg_id` in each dataframe, with unique-value counts
    _log(tabulate({'Matching Seg_IDs Across Files':
                   ['Matching Seg_IDs Across Files']}, headers='keys'))
    for name, dataframe in zip(names, dataframes):
        if 'seg_id' in dataframe:
            _log(name + ': seg_id' + ' = ' + str(True))
            _log('There are ' + f"{dataframe['seg_id'].nunique():,}"
                 + ' unique seg_id values in ' + name + '.')
        else:
            _log(name + ': seg_id' + ' = ' + str(False))
    _log(' ')

    # count of each file's seg_ids that also appear in Streets OCI
    for name, dataframe in zip(names, dataframes):
        if 'seg_id' in dataframe:
            matches = _match_count(dataframe['seg_id'], 'seg_id')
            _log('There are ' + f"{matches:,}" + ' ' + name +
                 ' seg_ids in Street OCI.')
        else:
            _log('There are no ' + name + ' seg_ids ' + 'in Streets OCI.')

    report = pd.DataFrame({'San Diego Street Conditions Classification Report'
                           : log_txt})
    return report
# Generate the report once from the four loaded dataframes.
report = build_report(df1, df2, df3, df4)

# Persist the same tab-delimited report under both .txt and .rtf names.
for filename in ('/report.txt', '/report.rtf'):
    report.to_csv(path + filename, index=False, sep="\t",
                  quoting=csv.QUOTE_NONE, quotechar='', escapechar='\t')