San Diego Street Conditions Classification - Data Inspection Report

The following code reads in four dataframes (the three source files plus a merged dataframe) and checks the shape (row and column counts) of each.
It also looks for unique ID values across all files and writes the resulting report to .txt and .rtf file formats, respectively.
This is a working, reproducible example of a simple report-building strategy.

# import the requisite libraries
import pandas as pd
import csv
from tabulate import tabulate
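# note: tabulate is a third-party package; if it is not already
# installed, pip install tabulate will provide it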

# read in the dataframes from GitHub
df1 = pd.read_csv('https://raw.githubusercontent.com/lshpaner/sd_streets/' \
                  + 'main/data/oci_2015_datasd.csv')
df2 = pd.read_csv('https://raw.githubusercontent.com/lshpaner/sd_streets/' \
                  + 'main/data/sd_paving_datasd.csv')
df3 = pd.read_csv('https://raw.githubusercontent.com/lshpaner/sd_streets/' \
                  + 'main/data/traffic_counts_datasd.csv')
df4 = pd.read_csv('https://raw.githubusercontent.com/lshpaner/sd_streets/' \
                  + 'main/merged_data/sd_roads_dataframe.csv')
path = '/content/drive/My Drive/Python Projects/san_diego_street_conditions'
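# note: the Colab path above assumes Google Drive has already been
# mounted, e.g. with:
#   from google.colab import drive
#   drive.mount('/content/drive')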

def build_report(df1, df2, df3, df4):
  # create an empty log container for appending to output dataframe 
  log_txt = []

  # file names
  names = ['Streets OCI', 'Street Repair Projects', 
          'Traffic Volumes', 'Merged Dataframe']
  dataframes = df1, df2, df3, df4 # dataframes for files

  print(' ')
  # append header to log_txt container defined above
  log_txt.append(' ')
  
  # shape of merged dataframes file
  print('Merged File')
  log_txt.append('Merged File')
  print('No. of Rows in Merged File: ' + str(f"{df4.shape[0]:,}"))
  log_txt.append('No. of Rows in Merged File: ' + str(f"{df4.shape[0]:,}"))
  print('No. of Columns in Merged File: ' + str(f"{df4.shape[1]:,}"))
  log_txt.append('No. of Columns in Merged File: ' + str(f"{df4.shape[1]:,}"))
  print(' ')
  log_txt.append(' ')

  # OCI file
  print('Streets Overall Condition Index (OCI)')
  log_txt.append('Streets Overall Condition Index (OCI)')
  print('No. of Rows in OCI File: ' + str(f"{df1.shape[0]:,}"))
  log_txt.append('No. of Rows in OCI File: ' + str(f"{df1.shape[0]:,}"))
  print('No. of Columns in OCI File: ' + str(f"{df1.shape[1]:,}"))
  log_txt.append('No. of Columns in OCI File: ' + str(f"{df1.shape[1]:,}"))

  print(' ')
  log_txt.append(' ')

  # filter for any columns whose name contains the '_id' string
  oci_id = df1.filter(like='_id').columns
  # if any '_id' columns exist in the OCI file, print each one alongside
  # a distinct count of its values; otherwise, report that no OCI IDs exist.
  if len(oci_id) > 0:
    df1_print = df1[oci_id].nunique().apply(lambda x : "{:,}".format(x))
    df1_print = pd.DataFrame(df1_print)
    df1_print.reset_index(inplace=True)
    df1_print = df1_print.rename(columns={0: 'Distinct Count',
                                            'index':'ID Columns'})
    # encapsulate this distinct count within a table
    df1_tab = tabulate(df1_print, headers='keys', tablefmt='psql')
    print(df1_tab)
    log_txt.append(df1_tab)
  else:
    df1_notab = 'Street OCI IDs DO NOT exist.'
    print(df1_notab)
    log_txt.append(df1_notab)

  print(' ')
  log_txt.append(' ')

  # Street Repair Projects File
  print('Street Repair Projects')
  log_txt.append('Street Repair Projects')
  print('No. of Rows in Street Repairs File: ' + \
        str(f"{df2.shape[0]:,}"))
  log_txt.append('No. of Rows in Street Repairs File: ' + \
                 str(f"{df2.shape[0]:,}"))
  print('No. of Columns in Street Repairs File: ' + \
        str(f"{df2.shape[1]:,}"))
  log_txt.append('No. of Columns in Street Repairs File: ' + \
                 str(f"{df2.shape[1]:,}"))

  print(' ')
  log_txt.append(' ')

  streets_id = df2.filter(like='_id').columns
  if len(streets_id) > 0:
    df2_print = df2[streets_id].nunique().apply(lambda x : "{:,}".format(x))
    df2_print = pd.DataFrame(df2_print)
    df2_print.reset_index(inplace=True)
    df2_print = df2_print.rename(columns={0:'Distinct Count',
                                            'index':'ID Columns'})
    df2_tab = tabulate(df2_print, headers='keys',tablefmt='psql')
    print(df2_tab)
    log_txt.append(df2_tab)
  else:
    df2_notab= 'Street Repairs IDs DO NOT exist.'
    print(df2_notab)
    log_txt.append(df2_notab)

  print(' ')
  log_txt.append(' ')

  # shape of Traffic Volumes File
  print('Traffic Volumes')
  log_txt.append('Traffic Volumes')
  print('No. of Rows in Traffic Volumes File: ' + \
        str(f"{df3.shape[0]:,}"))
  log_txt.append('No. of Rows in Traffic Volumes File: ' + \
                 str(f"{df3.shape[0]:,}"))
  print('No. of Columns in Traffic Volumes File: ' + \
        str(f"{df3.shape[1]:,}"))
  log_txt.append('No. of Columns in Traffic Volumes File: ' + \
                 str(f"{df3.shape[1]:,}"))

  print(' ')
  log_txt.append(' ')

  traffic_id = df3.filter(like='_id').columns
  if len(traffic_id) > 0:
    df3_print = df3[traffic_id].nunique().apply(lambda x : "{:,}".format(x))
    df3_print = pd.DataFrame(df3_print)
    df3_print.reset_index(inplace=True)
    df3_print = df3_print.rename(columns={0:'Distinct Count',
                                          'index':'ID Columns'})
    df3_tab = tabulate(df3_print, headers='keys', tablefmt='psql')
    print(df3_tab)
    log_txt.append(df3_tab)
  else:
    df3_notab = 'Traffic Volume IDs DO NOT exist.'
    print(df3_notab)
    log_txt.append(df3_notab)

  print(' ')
  log_txt.append(' ')

  ###########################
  ### Cross-file matching ###
  ###########################

  df1_df2 = set(df1.columns).intersection(set(df2.columns))
  df1_df2 = list(df1_df2)
  if len(df1_df2) != 0:
    df1_df2 = pd.DataFrame(df1_df2)
    df1_df2 = df1_df2.rename(columns={0:'Shared Columns Between Street Repairs'
                                      +' and OCI File'})
    df1_df2_tab = (tabulate(df1_df2, headers='keys', tablefmt='psql'))
    print(df1_df2_tab)
    log_txt.append(df1_df2_tab)
  else:
    df1_df2_notab = 'There are no shared columns between Street Repairs file' + \
                    ' and OCI file.'
    print(df1_df2_notab)                
    log_txt.append(df1_df2_notab)

  print(' ')
  log_txt.append(' ')

  # matching `pve_id` in dataframes
  print(tabulate({'Matching Pve_IDs Across Files':\
                       ['Matching Pve_IDs Across Files']}, headers='keys'))
  log_txt.append(tabulate({'Matching Pve_IDs Across Files':\
                       ['Matching Pve_IDs Across Files']}, headers='keys'))
  for name, dataframe in zip(names, dataframes):
    if 'pve_id' in dataframe:
      pve_id = name+': pve_id' + ' = ' + str(True)
      print(pve_id)
      log_txt.append(pve_id)
    else:
      no_pve_id = name + ': pve_id' + ' = ' + str(False)
      print(no_pve_id)
      log_txt.append(no_pve_id)

  print(' ')
  log_txt.append(' ')

  for name, dataframe in zip(names, dataframes):
    # count how many of each file's pve_id values also appear in the
    # Streets OCI file (df1)
    if 'pve_id' in dataframe and 'pve_id' in df1:
      pve_matches = dataframe['pve_id'].isin(df1['pve_id']).sum()
      pve_id_exists = 'There are ' + f"{pve_matches:,}" + ' ' + name + \
                      ' pve_ids in Streets OCI.'
      print(pve_id_exists)
      log_txt.append(pve_id_exists)
    else:
      no_pve_id = 'There are no ' + name + ' pve_ids in Streets OCI.'
      print(no_pve_id)
      log_txt.append(no_pve_id)
  
  print(' ')
  log_txt.append(' ')


  # matching `seg_id` in dataframes
  print(tabulate({'Matching Seg_IDs Across Files':\
                 ['Matching Seg_IDs Across Files']}, headers='keys'))
  
  log_txt.append(tabulate({'Matching Seg_IDs Across Files':\
                       ['Matching Seg_IDs Across Files']}, headers='keys'))

  for name, dataframe in zip(names, dataframes):
  # check for `seg_ids` in dataframes
    if 'seg_id' in dataframe:
      seg_log = name + ': seg_id' + ' = ' + str(True)
      print(seg_log)
      log_txt.append(seg_log)
      seg_summary = 'There are ' + str(f"{dataframe['seg_id'].nunique():,}") \
      + ' unique seg_id values in ' + name + '.'
      print(seg_summary)
      log_txt.append(seg_summary)
    else:
      no_seg_log = name + ': seg_id' + ' = ' + str(False)
      print(no_seg_log)
      log_txt.append(no_seg_log)

  print(' ')
  log_txt.append(' ')

  for name, dataframe in zip(names, dataframes):
    # count how many of each file's seg_id values also appear in the
    # Streets OCI file (df1)
    if 'seg_id' in dataframe and 'seg_id' in df1:
      seg_matches = dataframe['seg_id'].isin(df1['seg_id']).sum()
      seg_id_exists = 'There are ' + f"{seg_matches:,}" + ' ' + name + \
                      ' seg_ids in Streets OCI.'
      print(seg_id_exists)
      log_txt.append(seg_id_exists)
    else:
      no_seg_id = 'There are no ' + name + ' seg_ids in Streets OCI.'
      print(no_seg_id)
      log_txt.append(no_seg_id)

  report = pd.DataFrame({'San Diego Street Conditions Classification Report'
                          :log_txt})
  
  return report

# call build_report and store the returned report dataframe
report = build_report(df1, df2, df3, df4)

# write the finished report out to disk in two formats

# save report to .txt file (tab-separated plain text)
report.to_csv(path + '/report.txt', index=False, sep="\t",
              quoting=csv.QUOTE_NONE, quotechar='', escapechar='\t')

# save report with an .rtf extension; note that to_csv still writes
# plain text, not true rich-text formatting
report.to_csv(path + '/report.rtf', index=False, sep="\t",
              quoting=csv.QUOTE_NONE, quotechar='', escapechar='\t')
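
As a quick sanity check, the saved report can be read back and previewed. A minimal sketch, assuming the Drive path defined above is readable:

# read the saved report back as plain text and preview the first part
with open(path + '/report.txt') as f:
  print(f.read()[:500])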