San Diego Street Conditions Classification - Data Inspection Report
Leonid Shpaner
The following code reads in the three source dataframes plus the merged dataframe and checks each one's shape (row and column counts).
It also looks for unique id values across all files and writes the resulting report out
in both .txt and .rtf
file formats.
This is a good working/reproducible example of a sound report-building strategy.
# import the requisite libraries
import csv

import pandas as pd
from tabulate import tabulate

# read in the dataframes from GitHub
BASE_URL = 'https://raw.githubusercontent.com/lshpaner/sd_streets/'
df1 = pd.read_csv(BASE_URL + 'main/data/oci_2015_datasd.csv')
df2 = pd.read_csv(BASE_URL + 'main/data/sd_paving_datasd.csv')
df3 = pd.read_csv(BASE_URL + 'main/data/traffic_counts_datasd.csv')
df4 = pd.read_csv(BASE_URL + 'main/merged_data/sd_roads_dataframe.csv')

# output directory for the generated report files (Colab/Drive mount path)
path = '/content/drive/My Drive/Python Projects/san_diego_street_conditions'


def build_report(df1, df2, df3, df4):
    """Build a data-inspection report for the San Diego street datasets.

    For each input dataframe the report records its shape, a distinct
    count of every ``*_id`` column, the columns shared between the OCI
    and Street Repairs files, and how many ``pve_id``/``seg_id`` values
    in each file also appear in the Streets OCI file.  Every line is
    echoed to stdout and collected into the returned dataframe.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Streets OCI (overall condition index) data.
    df2 : pandas.DataFrame
        Street repair projects data.
    df3 : pandas.DataFrame
        Traffic volumes data.
    df4 : pandas.DataFrame
        Pre-merged dataframe of the files above.

    Returns
    -------
    pandas.DataFrame
        Single-column dataframe whose rows are the report lines.
    """
    log_txt = []  # accumulated report lines

    def log(line):
        # every report line is both printed and captured for the output file
        print(line)
        log_txt.append(line)

    def shape_section(title, df, label):
        # report one file's row/column counts
        log(title)
        log('No. of Rows in ' + label + ': ' + f"{df.shape[0]:,}")
        log('No. of Columns in ' + label + ': ' + f"{df.shape[1]:,}")
        log(' ')

    def id_section(df, missing_msg):
        # distinct counts for every column whose name contains '_id';
        # falls back to `missing_msg` when the file has no such column.
        # NOTE: the original tested df2[oci_id] in the Street Repairs
        # section (a copy/paste slip); this helper uses `df` throughout.
        id_cols = df.filter(like='_id').columns
        if len(id_cols):
            counts = df[id_cols].nunique().apply(lambda x: f"{x:,}")
            counts = (pd.DataFrame(counts)
                      .reset_index()
                      .rename(columns={0: 'Distinct Count',
                                       'index': 'ID Columns'}))
            log(tabulate(counts, headers='keys', tablefmt='psql'))
        else:
            log(missing_msg)
        log(' ')

    def match_section(header, col, oci_values):
        # flag which files carry `col`, then count how many of each
        # file's values also occur in the Streets OCI file
        log(tabulate({header: [header]}, headers='keys'))
        for name, frame in zip(names, dataframes):
            has_col = col in frame
            log(name + ': ' + col + ' = ' + str(has_col))
            if has_col and col == 'seg_id':
                # the original report also summarized unique seg_ids here
                log('There are ' + f"{frame[col].nunique():,}"
                    + ' unique seg_id values in ' + name + '.')
        log(' ')
        for name, frame in zip(names, dataframes):
            if col in frame:
                # BUG FIX: the original tested frame[col].isin(frame[col]),
                # which is vacuously all-True and merely reported the row
                # count; compare against the Streets OCI values instead, as
                # the message states.  Using .sum() also avoids the KeyError
                # the original's value_counts()[1] raised on zero matches.
                n_match = int(frame[col].isin(oci_values).sum())
                log('There are ' + f"{n_match:,}" + ' ' + name + ' '
                    + col + 's in Streets OCI.')
            else:
                log('There are no ' + name + ' ' + col + 's '
                    + 'in Streets OCI.')
        log(' ')

    # file names paired with their dataframes
    names = ['Streets OCI', 'Street Repair Projects', 'Traffic Volumes',
             'Merged Dataframe']
    dataframes = df1, df2, df3, df4

    log(' ')

    # per-file shapes and *_id distinct counts
    shape_section('Merged File', df4, 'Merged File')
    shape_section('Streets Overall Condition Index (OCI)', df1, 'OCI File')
    id_section(df1, 'Street OCI IDs DO NOT exist.')
    shape_section('Street Repair Projects', df2, 'Street Repairs File')
    id_section(df2, 'Street Repairs IDs DO NOT exist.')
    shape_section('Traffic Volumes', df3, 'Traffic Volumes File')
    id_section(df3, 'Traffic Volume IDs DO NOT exist.')

    ###########################
    ### Cross-file matching ###
    ###########################
    # sorted() makes the report deterministic (set iteration order varies)
    shared = sorted(set(df1.columns).intersection(set(df2.columns)))
    if shared:
        shared_df = pd.DataFrame(shared).rename(
            columns={0: 'Shared Columns Between Street Repairs'
                        ' and OCI File'})
        log(tabulate(shared_df, headers='keys', tablefmt='psql'))
    else:
        log('There are no shared columns between Street Repairs file'
            ' and OCI file.')
    log(' ')

    # reference values from the Streets OCI file; an empty series when the
    # column is absent so .isin() simply yields zero matches
    oci_pve = df1['pve_id'] if 'pve_id' in df1 else pd.Series(dtype=object)
    oci_seg = df1['seg_id'] if 'seg_id' in df1 else pd.Series(dtype=object)
    match_section('Matching Pve_IDs Across Files', 'pve_id', oci_pve)
    match_section('Matching Seg_IDs Across Files', 'seg_id', oci_seg)

    return pd.DataFrame(
        {'San Diego Street Conditions Classification Report': log_txt})


# build the report, then persist it as both .txt and .rtf
report = build_report(df1, df2, df3, df4)
for ext in ('txt', 'rtf'):
    report.to_csv(path + '/report.' + ext, index=False, sep="\t",
                  quoting=csv.QUOTE_NONE, quotechar='', escapechar='\t')