Identifying Safer Pedestrian Routes in Los Angeles — Functions (.py)
The following code defines functions for data-type reporting and for the various plots used throughout the project pipeline. It also checks for unique ID values across all files, and the outputs are saved to .txt
and .rtf
file formats, respectively.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 |
#################################### ## Data Types Report Function ## #################################### # import the requisite libraries import pandas as pd # import pandas library # Data Types Report def data_types(df): ''' This function provides a data types report on every column in the dataframe, showing column names, column data types, number of nulls, and percentage of nulls, respectively. Inputs: df: dataframe to run the datatypes report on Outputs: dat_type: report saved out to a dataframe showing column name, data type, count of null values in the dataframe, and percentage of null values in the dataframe ''' # Features' Data Types and Their Respective Null Counts dat_type = df.dtypes # create a new dataframe to inspect data types dat_type = pd.DataFrame(dat_type) # sum the number of nulls per column in df dat_type['Null_Values'] = df.isnull().sum() # reset index w/ inplace = True for more efficient memory usage dat_type.reset_index(inplace=True) # percentage of null values is produced and cast to new variable dat_type['perc_null'] = round(dat_type['Null_Values'] / len(df)*100,0) # columns are renamed for a cleaner appearance dat_type = dat_type.rename(columns={0:'Data Type', 'index': 'Column/Variable', 'Null_Values': '# of Nulls', 'perc_null': 'Percent Null'}) return dat_type ################################### ##### Boxplot Distributions ###### ################################### import seaborn as sns def sns_boxplot(df, title, xlabel, ylabel, column): ''' This function plots boxplots of any column of interest Inputs: df: dataframe to pass into the function title: title of the boxplot ylabel: y-axis label of the boxplot column: column of interest to run the function on ''' fig = plt.figure(figsize = (15,1.5)) # set figure size plt.title(title, fontsize=12) # set plot title plt.xlabel(xlabel, fontsize=12) # set plot x-axis label plt.ylabel(ylabel, fontsize=12) # set plot y-axis label # seaborn boxplot function w/ horizontal orientation boxplot = 
sns.boxplot(df[column], palette="coolwarm", orient='h', linewidth=2.5) print() print('Summarizing', column) # Computing IQR Q1 = df[column].quantile(0.25) # first quartile Q3 = df[column].quantile(0.75) # third quartile IQR = Q3-Q1 # interquartile range # Computing Summary Stats of average_monthly_hours mean = round(df[column].mean(),2) # calculate mean std = round(df[column].std(),2) # calculate standard dev. median = round(df[column].median(),2) # calculate median # print statements for summary statistics Q1_print = print('The first quartile is %s. '%Q1) Q3_print = print('The third quartile is %s. '%Q3) IQR_print = print('The IQR is %s.'%round(IQR,2)) mean_print = print('The mean is %s.'%mean) std_print = print('The standard deviation is %s.'%std) median_print = print('The median is %s.'%median) # if mean is greater than median, (+) skewed; # otherwise (-) skewed. if mean > median: print('The distribution is positively skewed.') else: print('The distribution is negatively skewed.') print() #################################### ### Bar Graph Plotting Function ### #################################### # import the requisite libraries import pandas as pd # import pandas library import matplotlib.pyplot as plt # import plotting library # Bar Graph For Any Column in the Dataframe def bar_plot(x, y, df, asc, kind, title, rotation, xlabel, ylabel, column, n): ''' This function allows for the plotting of a bar graph (regular or horizontal) of any column in the dataframe Inputs: x: passed into figsize as width of the bar graph y: passed into figsize as height of the bar graph df: dataframe to pass into the function asc: ascending order of the data (bool) kind: type of barchart (regular or barh for horizontal) title: title of the plot rotation: rotation of axes labels xlabel: x-axis label ylabel: y-axis label column: column of interest n: top number of rows of interest for inspecting the column; some columns may have a large number of observations; in these particular cases, 
it is best to limit the number of observations for analysis. ''' # set figure size fig, axes = plt.subplots(figsize=(x,y)) # sort values in ascending order and generate top n rows bar_plot = df[column].value_counts().sort_values(ascending=asc).head(n) bar_plot.plot(kind=kind, width=0.9) # plot horizontal bar graph plt.title(title, fontsize=12) # set plot title plt.xticks(rotation=rotation) # rotate x-axis labels to 90 degrees plt.xlabel(xlabel, fontsize=12) # set plot x-axis label plt.ylabel(ylabel, fontsize=12) # set plot y-axis label # Using a contingency table allows for the data in any column of interest to be # summarized by the values in the target column (crime severity). def cont_table(df, col1, lev1, col2, lev2): ''' A function for populating a contingency table is created such that it can be used with the variables of interest and the target column. Inputs: df: dataframe to ingest for the contingency table col1: column of interest from the dataframe; more often than not this is the ground truth (target) column - but this can be replaced with any binary outcome column col2: column that the dataframe is being grouped by lev1: if using ground truth column, these are the less severe crimes lev2: if using ground truth column, these are the more severe crimes Outputs: crime_res_comb.style.format("{:,.0f}"): returns the contingency table as a dataframe with values formatted to two decimal places ''' crime_less = df.loc[df[col1]==lev1].groupby([col2])[[col1]].count() crime_less.rename(columns = {col1:lev1}, inplace=True) crime_more = df.loc[df[col1]==lev2].groupby([col2])[[col1]].count() crime_more.rename(columns={col1:lev2}, inplace=True) crime_res_comb = pd.concat([crime_less, crime_more], axis=1) # sum row totals crime_res_comb['Total']=crime_res_comb.sum(axis=1) crime_res_comb.loc['Total']=crime_res_comb.sum(numeric_only=True, axis=0) # get % total of each row crime_res_comb['% More Serious']=round((crime_res_comb[lev2] / 
(crime_res_comb[lev2]+crime_res_comb[lev1]))*100, 2) crime_res_comb[lev2]=crime_res_comb[lev2].fillna(0) crime_res_comb['% More Serious']=crime_res_comb['% More Serious'].fillna(0) # crime_res_comb.set_index('new_index_name') return crime_res_comb.style.format("{:,.0f}") ########################## ### Summary Statistics ### ########################## def summ_stats(df, var1, var2): ''' A function to provide 5 number summary for any column of interest Inputs: df: dataframe to ingest for the summary stats table var1: column of interest var2: numerical column (i.e., 'Vict_Age') Output: dfsummary: summary statistics report ''' print("\033[1m"+'Summary Statistics by Age'+"\033[1m") pd.options.display.float_format = '{:,.2f}'.format summ_stats = df.groupby(var1)[var2].agg(['mean', 'median', 'std', 'min', 'max']) column_rename = {'mean': 'Mean', 'median': 'Median', 'std': 'Standard Deviation',\ 'min':'Minimum','max': 'Maximum'} dfsummary = summ_stats.rename(columns = column_rename) return dfsummary ########################## ### Stacked Bar Graphs ### ########################## def stacked_plot (x, y, p, df, col, truth, condition, kind, title1, xlabel1, ylabel1, width, rot, title2, xlabel2, ylabel2): ''' This function provides a stacked and normalized bar graph of any column of interest, colored by ground truth column Inputs: x: x-axis figure size y: y-axis figure size df: dataframe to ingest for the stacked plot col: column of interest truth: ground truth column condition: value from ground truth column kind: type of graph title1: title of first graph xlabel1: x-axis label of first graph ylabel1: y-axis label of first graph width: width of first graph rot: rotation of graph title2: title of second graph ylabel2: y-axis label of second graph ''' fig, axes = plt.subplots(nrows=2, ncols=1,figsize=(x, y)) flat = axes.flatten() fig.tight_layout(w_pad=5, pad=p, h_pad=5) flat = axes.flatten() # main title for both plots fig.suptitle('Absolute Distributions vs. 
Normalized Distributions', fontsize=12) # crosstabulation of column of interest and ground truth crosstabdest = pd.crosstab(df[col], df[truth]) \ .sort_values(by=[condition], ascending=False) # normalized crosstabulation crosstabdestnorm = crosstabdest.div(crosstabdest.sum(1), axis = 0) # plotting the first stacked bar graph plotdest = crosstabdest.plot(kind=kind, stacked=True, title=title1, ax=flat[0], color=['#00BFC4', '#F8766D'], width=width, rot=rot, fontsize=12) flat[0].set_title(label=title1, fontsize=12) flat[0].set_xlabel(xlabel1, fontsize=12) flat[0].set_ylabel(ylabel1, fontsize=12) flat[0].legend(fontsize=12) # plotting the second, normalized stacked bar graph plotdestnorm = crosstabdestnorm.plot(kind=kind, stacked=True, title=title2, ylabel='Frequency', ax=flat[1], color=['#00BFC4', '#F8766D'], width=width, rot=rot, fontsize=12) flat[1].set_title(label=title2, fontsize=12) flat[1].set_xlabel(xlabel2, fontsize=12) flat[1].set_ylabel(ylabel2, fontsize=12) flat[1].legend(fontsize=12) fig.align_ylabels() |