Connect to Google Drive via Google Colab

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
Mounted at /content/drive
In [2]:
import os
os.getcwd()
Out[2]:
'/content'
In [3]:
%cd '/content/drive/My Drive/Python Projects/san_diego_street_conditions'
/content/drive/My Drive/Python Projects/san_diego_street_conditions

San Diego Street Conditions Classification Report - Full Code

In [4]:
# import the requisite libraries
import pandas as pd
import csv
from tabulate import tabulate

# read in the dataframes from the project's GitHub repository (raw CSVs)
df1 = pd.read_csv('https://raw.githubusercontent.com/lshpaner/sd_streets/'
                  + 'main/data/oci_2015_datasd.csv')           # Streets OCI
df2 = pd.read_csv('https://raw.githubusercontent.com/lshpaner/sd_streets/'
                  + 'main/data/sd_paving_datasd.csv')          # Street repairs
df3 = pd.read_csv('https://raw.githubusercontent.com/lshpaner/sd_streets/'
                  + 'main/data/traffic_counts_datasd.csv')     # Traffic volumes
df4 = pd.read_csv('https://raw.githubusercontent.com/lshpaner/sd_streets/'
                  + 'main/merged_data/sd_roads_dataframe.csv') # Merged frame

# Google Drive directory the report files are written to
path = '/content/drive/My Drive/Python Projects/san_diego_street_conditions'

def build_report(df1, df2, df3, df4):
  """Print and collect a structural summary of the four project dataframes.

  Parameters
  ----------
  df1 : pandas.DataFrame
      Streets Overall Condition Index (OCI) file.
  df2 : pandas.DataFrame
      Street Repair Projects file.
  df3 : pandas.DataFrame
      Traffic Volumes file.
  df4 : pandas.DataFrame
      Merged dataframe built from the three source files.

  Returns
  -------
  pandas.DataFrame
      Single-column frame ('San Diego Street Conditions Classification
      Report') holding every line that was printed, in order.
  """
  # container for every report line; mirrors exactly what is printed
  log_txt = []

  def _emit(text):
    # every report line is both printed and appended to the log
    print(text)
    log_txt.append(text)

  def _shape(title, label, df):
    # report a file's title and its row/column counts, then a spacer line
    _emit(title)
    _emit('No. of Rows in ' + label + ' File: ' + str(f"{df.shape[0]:,}"))
    _emit('No. of Columns in ' + label + ' File: ' + str(f"{df.shape[1]:,}"))
    _emit(' ')

  def _id_table(df, missing_msg):
    # tabulate a distinct count for every column whose name contains '_id',
    # or report that the file has no such columns.
    # BUGFIX: the original tested df2 against df1's id-columns (oci_id),
    # which gated df2's table on the wrong frame; each frame is now checked
    # against its own id-columns.
    id_cols = df.filter(like='_id').columns
    if df[id_cols].columns.any():
      counts = df[id_cols].nunique().apply(lambda x: "{:,}".format(x))
      counts = pd.DataFrame(counts).reset_index()
      counts = counts.rename(columns={0: 'Distinct Count',
                                      'index': 'ID Columns'})
      _emit(tabulate(counts, headers='keys', tablefmt='psql'))
    else:
      _emit(missing_msg)
    _emit(' ')

  # file names, parallel to the dataframes tuple below
  names = ['Streets OCI', 'Street Repair Projects',
           'Traffic Volumes', 'Merged Dataframe']
  dataframes = df1, df2, df3, df4  # dataframes for files

  # leading spacer line
  _emit(' ')

  # shape of each file; the merged file has no '_id' table of its own
  _shape('Merged File', 'Merged', df4)
  _shape('Streets Overall Condition Index (OCI)', 'OCI', df1)
  _id_table(df1, 'Street OCI IDs DO NOT exist.')
  _shape('Street Repair Projects', 'Street Repairs', df2)
  _id_table(df2, 'Street Repairs IDs DO NOT exist.')
  _shape('Traffic Volumes', 'Traffic Volumes', df3)
  _id_table(df3, 'Traffic Volume IDs DO NOT exist.')

  ###########################
  ### Cross-file matching ###
  ###########################

  # columns shared between the OCI and Street Repairs files
  shared_cols = list(set(df1.columns).intersection(set(df2.columns)))
  if len(shared_cols) != 0:
    shared_df = pd.DataFrame(shared_cols)
    shared_df = shared_df.rename(
        columns={0: 'Shared Columns Between Street Repairs and OCI File'})
    _emit(tabulate(shared_df, headers='keys', tablefmt='psql'))
  else:
    _emit('There are no shared columns between Street Repairs file'
          + ' and OCI file.')
  _emit(' ')

  # matching `pve_id` in dataframes
  _emit(tabulate({'Matching Pve_IDs Across Files':
                  ['Matching Pve_IDs Across Files']}, headers='keys'))
  for name, dataframe in zip(names, dataframes):
    # 'pve_id' in dataframe tests column membership
    _emit(name + ': pve_id' + ' = ' + str('pve_id' in dataframe))
  _emit(' ')

  for name, dataframe in zip(names, dataframes):
    if 'pve_id' in dataframe:
      # NOTE(review): the column is tested against itself, so the count is
      # simply the file's row count — kept as-is to preserve the published
      # report text; confirm whether the intent was to match against df1.
      n = (dataframe['pve_id'].isin(dataframe['pve_id'])
           .astype(int).value_counts()[1])
      _emit('There are ' + str(f"{n:,}") + ' ' + name
            + ' pve_ids in Streets OCI.')
    else:
      _emit('There are no ' + name + ' pve_ids ' + 'in Streets OCI.')
  _emit(' ')

  # matching `seg_id` in dataframes
  _emit(tabulate({'Matching Seg_IDs Across Files':
                  ['Matching Seg_IDs Across Files']}, headers='keys'))
  for name, dataframe in zip(names, dataframes):
    # check for `seg_id` in dataframes; report unique values where present
    if 'seg_id' in dataframe:
      _emit(name + ': seg_id' + ' = ' + str(True))
      _emit('There are ' + str(f"{dataframe['seg_id'].nunique():,}")
            + ' unique seg_id values in ' + name + '.')
    else:
      _emit(name + ': seg_id' + ' = ' + str(False))
  _emit(' ')

  for name, dataframe in zip(names, dataframes):
    if 'seg_id' in dataframe:
      # NOTE(review): same self-comparison as the pve_id loop above — the
      # count equals the file's row count; preserved for output parity.
      n = (dataframe['seg_id'].isin(dataframe['seg_id'])
           .astype(int).value_counts()[1])
      _emit('There are ' + str(f"{n:,}") + ' ' + name
            + ' seg_ids in Street OCI.')
    else:
      _emit('There are no ' + name + ' seg_ids ' + 'in Streets OCI.')

  # wrap the collected lines into a single-column report dataframe
  report = pd.DataFrame({'San Diego Street Conditions Classification Report':
                         log_txt})

  return report

# build the report and keep a handle to the resulting dataframe
report = build_report(df1, df2, df3, df4)

# persist the report twice — as a .txt and as a .rtf file.
# Tab-separated with quoting fully disabled so the ASCII table borders in
# the report lines are written through verbatim.
for report_file in ('/report.txt', '/report.rtf'):
  report.to_csv(path + report_file, index=False, sep="\t",
                quoting=csv.QUOTE_NONE, quotechar='', escapechar='\t')
 
Merged File
No. of Rows in Merged File: 22,998
No. of Columns in Merged File: 15
 
Streets Overall Condition Index (OCI)
No. of Rows in OCI File: 30,712
No. of Columns in OCI File: 12
 
+----+--------------+------------------+
|    | ID Columns   | Distinct Count   |
|----+--------------+------------------|
|  0 | seg_id       | 30,712           |
+----+--------------+------------------+
 
Street Repair Projects
No. of Rows in Street Repairs File: 23,433
No. of Columns in Street Repairs File: 19
 
+----+--------------+------------------+
|    | ID Columns   | Distinct Count   |
|----+--------------+------------------|
|  0 | pve_id       | 23,433           |
|  1 | seg_id       | 18,072           |
|  2 | project_id   | 103              |
+----+--------------+------------------+
 
Traffic Volumes
No. of Rows in Traffic Volumes File: 12,390
No. of Columns in Traffic Volumes File: 10
 
Traffic Volume IDs DO NOT exist.
 
+----+------------------------------------------------------+
|    | Shared Columns Between Street Repairs and OCI File   |
|----+------------------------------------------------------|
|  0 | street_to                                            |
|  1 | street_from                                          |
|  2 | seg_id                                               |
+----+------------------------------------------------------+
 
Matching Pve_IDs Across Files
-------------------------------
Matching Pve_IDs Across Files
Streets OCI: pve_id = False
Street Repair Projects: pve_id = True
Traffic Volumes: pve_id = False
Merged Dataframe: pve_id = False
 
There are no Streets OCI pve_ids in Streets OCI.
There are 23,433 Street Repair Projects pve_ids in Streets OCI.
There are no Traffic Volumes pve_ids in Streets OCI.
There are no Merged Dataframe pve_ids in Streets OCI.
 
Matching Seg_IDs Across Files
-------------------------------
Matching Seg_IDs Across Files
Streets OCI: seg_id = True
There are 30,712 unique seg_id values in Streets OCI.
Street Repair Projects: seg_id = True
There are 18,072 unique seg_id values in Street Repair Projects.
Traffic Volumes: seg_id = False
Merged Dataframe: seg_id = False
 
There are 30,712 Streets OCI seg_ids in Street OCI.
There are 23,433 Street Repair Projects seg_ids in Street OCI.
There are no Traffic Volumes seg_ids in Streets OCI.
There are no Merged Dataframe seg_ids in Streets OCI.
In [5]:
!jupyter nbconvert --to html sd_streets_ids.ipynb
[NbConvertApp] Converting notebook sd_streets_ids.ipynb to html
[NbConvertApp] Writing 325274 bytes to sd_streets_ids.html