San Diego Street Conditions Classification - Data Inspection Report
Leonid Shpaner
The following code reads in the three source dataframes plus the merged dataframe and checks each one's shape (row and column counts).
It also looks for unique id values across all files and writes the resulting report out
in both .txt and .rtf
file formats.
This is a good working/reproducible example of a sound report-building strategy.
# import the requisite libraries
import csv

import pandas as pd
from tabulate import tabulate

# read in the dataframes from GitHub
BASE_URL = 'https://raw.githubusercontent.com/lshpaner/sd_streets/'
df1 = pd.read_csv(BASE_URL + 'main/data/oci_2015_datasd.csv')
df2 = pd.read_csv(BASE_URL + 'main/data/sd_paving_datasd.csv')
df3 = pd.read_csv(BASE_URL + 'main/data/traffic_counts_datasd.csv')
df4 = pd.read_csv(BASE_URL + 'main/merged_data/sd_roads_dataframe.csv')

# output directory for the generated report files (Colab/Drive mount path)
path = '/content/drive/My Drive/Python Projects/san_diego_street_conditions'


def build_report(df1, df2, df3, df4):
    """Build a data-inspection report for the San Diego street datasets.

    For each input dataframe the report records its shape, a distinct
    count of every ``*_id`` column, the columns shared between the OCI
    and Street Repairs files, and how many ``pve_id``/``seg_id`` values
    in each file also appear in the Streets OCI file.  Every line is
    echoed to stdout and collected into the returned dataframe.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Streets OCI (overall condition index) data.
    df2 : pandas.DataFrame
        Street repair projects data.
    df3 : pandas.DataFrame
        Traffic volumes data.
    df4 : pandas.DataFrame
        Pre-merged dataframe of the files above.

    Returns
    -------
    pandas.DataFrame
        Single-column dataframe whose rows are the report lines.
    """
    log_txt = []  # accumulated report lines

    def log(line):
        # every report line is both printed and captured for the output file
        print(line)
        log_txt.append(line)

    def shape_section(title, df, label):
        # report one file's row/column counts
        log(title)
        log('No. of Rows in ' + label + ': ' + f"{df.shape[0]:,}")
        log('No. of Columns in ' + label + ': ' + f"{df.shape[1]:,}")
        log(' ')

    def id_section(df, missing_msg):
        # distinct counts for every column whose name contains '_id';
        # falls back to `missing_msg` when the file has no such column.
        # NOTE: the original tested df2[oci_id] in the Street Repairs
        # section (a copy/paste slip); this helper uses `df` throughout.
        id_cols = df.filter(like='_id').columns
        if len(id_cols):
            counts = df[id_cols].nunique().apply(lambda x: f"{x:,}")
            counts = (pd.DataFrame(counts)
                      .reset_index()
                      .rename(columns={0: 'Distinct Count',
                                       'index': 'ID Columns'}))
            log(tabulate(counts, headers='keys', tablefmt='psql'))
        else:
            log(missing_msg)
        log(' ')

    def match_section(header, col, oci_values):
        # flag which files carry `col`, then count how many of each
        # file's values also occur in the Streets OCI file
        log(tabulate({header: [header]}, headers='keys'))
        for name, frame in zip(names, dataframes):
            has_col = col in frame
            log(name + ': ' + col + ' = ' + str(has_col))
            if has_col and col == 'seg_id':
                # the original report also summarized unique seg_ids here
                log('There are ' + f"{frame[col].nunique():,}"
                    + ' unique seg_id values in ' + name + '.')
        log(' ')
        for name, frame in zip(names, dataframes):
            if col in frame:
                # BUG FIX: the original tested frame[col].isin(frame[col]),
                # which is vacuously all-True and merely reported the row
                # count; compare against the Streets OCI values instead, as
                # the message states.  Using .sum() also avoids the KeyError
                # the original's value_counts()[1] raised on zero matches.
                n_match = int(frame[col].isin(oci_values).sum())
                log('There are ' + f"{n_match:,}" + ' ' + name + ' '
                    + col + 's in Streets OCI.')
            else:
                log('There are no ' + name + ' ' + col + 's '
                    + 'in Streets OCI.')
        log(' ')

    # file names paired with their dataframes
    names = ['Streets OCI', 'Street Repair Projects', 'Traffic Volumes',
             'Merged Dataframe']
    dataframes = df1, df2, df3, df4

    log(' ')

    # per-file shapes and *_id distinct counts
    shape_section('Merged File', df4, 'Merged File')
    shape_section('Streets Overall Condition Index (OCI)', df1, 'OCI File')
    id_section(df1, 'Street OCI IDs DO NOT exist.')
    shape_section('Street Repair Projects', df2, 'Street Repairs File')
    id_section(df2, 'Street Repairs IDs DO NOT exist.')
    shape_section('Traffic Volumes', df3, 'Traffic Volumes File')
    id_section(df3, 'Traffic Volume IDs DO NOT exist.')

    ###########################
    ### Cross-file matching ###
    ###########################
    # sorted() makes the report deterministic (set iteration order varies)
    shared = sorted(set(df1.columns).intersection(set(df2.columns)))
    if shared:
        shared_df = pd.DataFrame(shared).rename(
            columns={0: 'Shared Columns Between Street Repairs'
                        ' and OCI File'})
        log(tabulate(shared_df, headers='keys', tablefmt='psql'))
    else:
        log('There are no shared columns between Street Repairs file'
            ' and OCI file.')
    log(' ')

    # reference values from the Streets OCI file; an empty series when the
    # column is absent so .isin() simply yields zero matches
    oci_pve = df1['pve_id'] if 'pve_id' in df1 else pd.Series(dtype=object)
    oci_seg = df1['seg_id'] if 'seg_id' in df1 else pd.Series(dtype=object)
    match_section('Matching Pve_IDs Across Files', 'pve_id', oci_pve)
    match_section('Matching Seg_IDs Across Files', 'seg_id', oci_seg)

    return pd.DataFrame(
        {'San Diego Street Conditions Classification Report': log_txt})


# build the report, then persist it as both .txt and .rtf
report = build_report(df1, df2, df3, df4)
for ext in ('txt', 'rtf'):
    report.to_csv(path + '/report.' + ext, index=False, sep="\t",
                  quoting=csv.QUOTE_NONE, quotechar='', escapechar='\t')