# import the requisite libraries
import pandas as pd
import csv
from tabulate import tabulate
# read in the dataframes from GitHub
df1 = pd.read_csv('https://raw.githubusercontent.com/lshpaner/sd_streets/' \
                  + 'main/data/oci_2015_datasd.csv')
df2 = pd.read_csv('https://raw.githubusercontent.com/lshpaner/sd_streets/' \
                  + 'main/data/sd_paving_datasd.csv')
df3 = pd.read_csv('https://raw.githubusercontent.com/lshpaner/sd_streets/' \
                  + 'main/data/traffic_counts_datasd.csv')
df4 = pd.read_csv('https://raw.githubusercontent.com/lshpaner/sd_streets/' \
                  + 'main/merged_data/sd_roads_dataframe.csv')
path = '/content/drive/My Drive/Python Projects/san_diego_street_conditions'
def build_report(df1, df2, df3, df4):
  # create an empty log container for appending to output dataframe 
  log_txt = []
  # file names
  names = ['Streets OCI', 'Street Repair Projects', 
          'Traffic Volumes', 'Merged Dataframe']
  dataframes = df1, df2, df3, df4 # dataframes for files
  print(' ')
  # append header to log_txt container defined above
  log_txt.append(' ')
  
  # shape of merged dataframes file
  print('Merged File')
  log_txt.append('Merged File')
  print('No. of Rows in Merged File: ' + str(f"{df4.shape[0]:,}"))
  log_txt.append('No. of Rows in Merged File: ' + str(f"{df4.shape[0]:,}"))
  print('No. of Columns in Merged File: ' + str(f"{df4.shape[1]:,}"))
  log_txt.append('No. of Columns in Merged File: ' + str(f"{df4.shape[1]:,}"))
  print(' ')
  log_txt.append(' ')
  # OCI file
  print('Streets Overall Condition Index (OCI)')
  log_txt.append('Streets Overall Condition Index (OCI)')
  print('No. of Rows in OCI File: ' + str(f"{df1.shape[0]:,}"))
  log_txt.append('No. of Rows in OCI File: ' + str(f"{df1.shape[0]:,}"))
  print('No. of Columns in OCI File: ' + str(f"{df1.shape[1]:,}"))
  log_txt.append('No. of Columns in OCI File: ' + str(f"{df1.shape[1]:,}"))
  print(' ')
  log_txt.append(' ')
  # filter out any columns contain the '_id' string
  oci_id = df1.filter(like='_id').columns
  # if there are any columns that contain the '_id' string in OCI
  # print the number of unique columns and get a distinct count
  # otherwise, report that these OCI Ids do not exist.
  if df1[oci_id].columns.any():
    df1_print = df1[oci_id].nunique().apply(lambda x : "{:,}".format(x))
    df1_print = pd.DataFrame(df1_print)
    df1_print.reset_index(inplace=True)
    df1_print = df1_print.rename(columns={0: 'Distinct Count',
                                            'index':'ID Columns'})
    # encapsulate this distinct count within a table
    df1_tab = tabulate(df1_print, headers='keys', tablefmt='psql')
    print(df1_tab)
    log_txt.append(df1_tab)
  else:
    df1_notab = 'Street OCI IDs DO NOT exist.'
    print(df1_notab)
    log_txt.append(df1_notab)
  print(' ')
  log_txt.append(' ')
  # Street Repair Projects File
  print('Street Repair Projects')
  log_txt.append('Street Repair Projects')
  print('No. of Rows in Street Repairs File: ' + \
        str(f"{df2.shape[0]:,}"))
  log_txt.append('No. of Rows in Street Repairs File: ' + \
                 str(f"{df2.shape[0]:,}"))
  print('No. of Columns in Street Repairs File: ' + \
        str(f"{df2.shape[1]:,}"))
  log_txt.append('No. of Columns in Street Repairs File: ' + \
                 str(f"{df2.shape[1]:,}"))
  print(' ')
  log_txt.append(' ')
  streets_id = df2.filter(like='_id').columns
  if df2[oci_id].columns.any():
    df2_print = df2[streets_id].nunique().apply(lambda x : "{:,}".format(x))
    df2_print = pd.DataFrame(df2_print)
    df2_print.reset_index(inplace=True)
    df2_print = df2_print.rename(columns={0:'Distinct Count',
                                            'index':'ID Columns'})
    df2_tab = tabulate(df2_print, headers='keys',tablefmt='psql')
    print(df2_tab)
    log_txt.append(df2_tab)
  else:
    df2_notab= 'Street Repairs IDs DO NOT exist.'
    print(df2_notab)
    log_txt.append(df2_notab)
  print(' ')
  log_txt.append(' ')
  # shape of Traffic Volumes File
  print('Traffic Volumes')
  log_txt.append('Traffic Volumes')
  print('No. of Rows in Traffic Volumes File: ' + \
        str(f"{df3.shape[0]:,}"))
  log_txt.append('No. of Rows in Traffic Volumes File: ' + \
                 str(f"{df3.shape[0]:,}"))
  print('No. of Columns in Traffic Volumes File: ' + \
        str(f"{df3.shape[1]:,}"))
  log_txt.append('No. of Columns in Traffic Volumes File: ' + \
                 str(f"{df3.shape[1]:,}"))
  print(' ')
  log_txt.append(' ')
  traffic_id = df3.filter(like='_id').columns
  if df3[traffic_id].columns.any():
    df3_print = df3[traffic_id].nunique().apply(lambda x : "{:,}".format(x))
    df3_print = pd.DataFrame(df3_print)
    df3_print.reset_index(inplace=True)
    df3_print = df3_print.rename(columns={0:'Distinct Count',
                                          'index':'ID Columns'})
    df3_tab = tabulate(df3_print, headers='keys', tablefmt='psql')
    print(df3_tab)
    log_txt.append(df3_tab)
  else:
    df3_notab = 'Traffic Volume IDs DO NOT exist.'
    print(df3_notab)
    log_txt.append(df3_notab)
  print(' ')
  log_txt.append(' ')
  ###########################
  ### Cross-file matching ###
  ###########################
  df1_df2 = set(df1.columns).intersection(set(df2.columns))
  df1_df2 = list(df1_df2)
  if len(df1_df2) != 0:
    df1_df2 = pd.DataFrame(df1_df2)
    df1_df2 = df1_df2.rename(columns={0:'Shared Columns Between Street Repairs'
                                      +' and OCI File'})
    df1_df2_tab = (tabulate(df1_df2, headers='keys', tablefmt='psql'))
    print(df1_df2_tab)
    log_txt.append(df1_df2_tab)
  else:
    df1_df2_notab = 'There are no shared columns between Street Repairs file' + \
                    ' and OCI file.'
    print(df1_df2_notab)                
    log_txt.append(df1_df2_notab)
  print(' ')
  log_txt.append(' ')
  # matching `pve_id` in dataframes
  print(tabulate({'Matching Pve_IDs Across Files':\
                       ['Matching Pve_IDs Across Files']}, headers='keys'))
  log_txt.append(tabulate({'Matching Pve_IDs Across Files':\
                       ['Matching Pve_IDs Across Files']}, headers='keys'))
  for name, dataframe in zip(names, dataframes):
    if 'pve_id' in dataframe:
      pve_id = name+': pve_id' + ' = ' + str(True)
      print(pve_id)
      log_txt.append(pve_id)
    else:
      no_pve_id = name + ': pve_id' + ' = ' + str(False)
      print(no_pve_id)
      log_txt.append(no_pve_id)
  print(' ')
  log_txt.append(' ')
  for name, dataframe in zip(names, dataframes):
    if 'pve_id' in dataframe:
      pve_id_exists = 'There are ' + str(f"""{(dataframe['pve_id'].isin(dataframe
                                          ['pve_id']).astype(int).value_counts()
                                          [1]):,}""") + ' ' + name + \
                                           ' pve_ids in Streets OCI.'
      print(pve_id_exists)                
      log_txt.append(pve_id_exists)
    else: 
      no_pve_id = 'There are no ' + name + ' pve_ids ' + 'in Streets OCI.'
      print(no_pve_id)
      log_txt.append(no_pve_id)
  
  print(' ')
  log_txt.append(' ')
  # matching `seg_id` in dataframes
  print(tabulate({'Matching Seg_IDs Across Files':\
                 ['Matching Seg_IDs Across Files']}, headers='keys'))
  
  log_txt.append(tabulate({'Matching Seg_IDs Across Files':\
                       ['Matching Seg_IDs Across Files']}, headers='keys'))
  for name, dataframe in zip(names, dataframes):
  # check for `seg_ids` in dataframes
    if 'seg_id' in dataframe:
      seg_log = name + ': seg_id' + ' = ' + str(True)
      print(seg_log)
      log_txt.append(seg_log)
      seg_summary = 'There are ' + str(f"{dataframe['seg_id'].nunique():,}") \
      + ' unique seg_id values in ' + name + '.'
      print(seg_summary)
      log_txt.append(seg_summary)
    else:
      no_seg_log = name + ': seg_id' + ' = ' + str(False)
      print(no_seg_log)
      log_txt.append(no_seg_log)
  print(' ')
  log_txt.append(' ')
  for name, dataframe in zip(names, dataframes):
    if 'seg_id' in dataframe:
      seg_id_exists = 'There are '+str(f"""{(dataframe['seg_id'].isin(dataframe
                                          ['seg_id']).astype(int).value_counts()
                                          [1]):,}""") + ' ' + name + \
                      ' seg_ids in Street OCI.'
      print(seg_id_exists)
      log_txt.append(seg_id_exists)
    else: 
      no_seg_id = 'There are no ' + name +' seg_ids ' + \
      'in Streets OCI.'
      print(no_seg_id)
      log_txt.append(no_seg_id)
  report = pd.DataFrame({'San Diego Street Conditions Classification Report'
                          :log_txt})
  
  return report
# pass the build_report function to a new variable named report
report = build_report(df1, df2, df3, df4)
## EXTRA FUNCTION DEFINES WHAT GETS SENT OUT
# save report to .txt file
report.to_csv(path + '/report.txt', index=False, sep="\t",
              quoting=csv.QUOTE_NONE, quotechar='', escapechar='\t')
# save report to .rtf file
report.to_csv(path + '/report.rtf', index=False, sep="\t",
              quoting=csv.QUOTE_NONE, quotechar='', escapechar='\t')