San Diego Street Conditions Classification

A Cloud Computing Project by Leonid Shpaner, Jose Luis Estrada, and Kiran Singh

In [1]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request
import io
from sagemaker import get_execution_role
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer
from pyathena import connect
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from prettytable import PrettyTable 
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, \
RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_curve, auc, mean_squared_error,\
precision_score, recall_score, f1_score, accuracy_score,\
confusion_matrix, plot_confusion_matrix, classification_report
from sagemaker.tuner import HyperparameterTuner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import loguniform
import warnings
warnings.filterwarnings('ignore')

Data Wrangling

In [2]:
# create athena database
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
# s3 = boto3.Session().client(service_name="s3", region_name=region)

# ec2 = boto3.Session().client(service_name="ec2", region_name=region)
# sm = boto3.Session().client(service_name="sagemaker", region_name=region)
In [3]:
ingest_create_athena_db_passed = False
In [4]:
# set a database name
database_name = "watersd"
In [5]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)
In [6]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)
In [7]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
pd.read_sql(statement, conn)
CREATE DATABASE IF NOT EXISTS watersd
Out[7]:
In [8]:
water_dir = 's3://waterteam1/raw_files'
In [9]:
# SQL statements to create and preview the street overall condition index (OCI) table

table_name ='oci_2015_datasd'
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name}', conn)


create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name}(
                seg_id string,
                oci float,
                street string,
                street_from string,
                street_to string,
                seg_length_ft float,
                seg_width_ft float,
                func_class string,
                pvm_class string,
                area_sq_ft float,
                oci_desc string,
                oci_wt float
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION '{water_dir}/{table_name}'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""

pd.read_sql(create_table, conn)

pd.read_sql(f'SELECT * FROM {database_name}.{table_name} LIMIT 5', conn)
Out[9]:
seg_id oci street street_from street_to seg_length_ft seg_width_ft func_class pvm_class area_sq_ft oci_desc oci_wt
0 SA-000003 65.14 ALLEY 772.7258 30.0 Alley PCC Jointed Concrete 23181.773 Fair 1510060.80
1 SA-000004 67.45 ALLEY 196.0025 30.0 Alley PCC Jointed Concrete 5880.075 Fair 396611.06
2 SA-000005 70.88 ALLEY 395.0049 30.0 Alley PCC Jointed Concrete 11850.147 Good 839938.44
3 SA-000006 84.00 ALLEY 192.0025 30.0 Alley PCC Jointed Concrete 5760.075 Good 483846.30
4 SA-000008 79.24 ALLEY 251.7540 30.0 Alley PCC Jointed Concrete 7552.620 Good 598469.60
In [10]:
# SQL statements to create and preview the street repair projects (paving) table

table_name2 ='sd_paving_datasd'
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name2}', conn)


create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name2}(
                pve_id int,
                seg_id string,
                project_id string,
                title string,
                project_manager string,
                project_manager_phone string,
                status string,
                type string,
                resident_engineer string,
                address_street string,
                street_from string,
                street_to string,
                seg_cd int,
                length int,
                width int,
                date_moratorium date,
                date_start date,
                date_end date,
                paving_miles float
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION '{water_dir}/{table_name2}'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""

pd.read_sql(create_table, conn)

pd.read_sql(f'SELECT * FROM {database_name}.{table_name2} LIMIT 5', conn)
Out[10]:
pve_id seg_id project_id title project_manager project_manager_phone status type resident_engineer address_street street_from street_to seg_cd length width date_moratorium date_start date_end paving_miles
0 1073577074 SA-000319 UTLY Public Works CIP Engineering@sandiego.gov 858-627-3200 post construction Overlay ECP ALLEY 2 0 NaN 2019-02-02 2019-02-02 2019-02-02 0.000000
1 1792486183 SA-000345 UTLY Public Works CIP Engineering@sandiego.gov 858-627-3200 post construction Slurry ECP ALLEY 2 938 30.0 2019-01-30 2019-01-30 2019-01-30 0.177652
2 1173780646 SA-000375 UTLY Public Works CIP Engineering@sandiego.gov 858-627-3200 post construction Slurry ECP ALLEY 2 674 30.0 2018-08-01 2018-08-01 2018-08-01 0.127652
3 1276790298 SA-000378 UTLY Public Works CIP Engineering@sandiego.gov 858-627-3200 post construction Slurry ECP ALLEY 2 658 30.0 2018-08-01 2018-08-01 2018-08-01 0.124621
4 27170959 SA-001081 UTLY Public Works CIP Engineering@sandiego.gov 858-627-3200 post construction Concrete ECP ALLEY 9 680 30.0 None 2020-08-13 2020-08-13 0.128788
In [11]:
# SQL statements to create and preview the traffic counts table

table_name3 ='traffic_counts_datasd'
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name3}', conn)


create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name3}(
                id string,
                street_name string,
                limits string,
                northbound_count int,
                southbound_count int,
                eastbound_count int,
                westbound_count int,
                total_count int,
                file_no string,
                date_count date
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION '{water_dir}/{table_name3}'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""

pd.read_sql(create_table, conn)

pd.read_sql(f'SELECT * FROM {database_name}.{table_name3} LIMIT 5', conn)
Out[11]:
id street_name limits northbound_count southbound_count eastbound_count westbound_count total_count file_no date_count
0 01AV018207 01 AV A ST - ASH ST 18010 None None None 18010 0182-07 2007-03-13
1 01AV015210 01 AV A ST - ASH ST 20060 None None None 20060 0152-10 2010-03-18
2 01AV018213 01 AV A ST - ASH ST 19597 None None None 19597 0182-13 2013-03-12
3 01AV007721 01 AV A ST - ASH ST 10640 None None None 10640 0077-21 2021-03-10
4 01AV088812 01 AV ASH ST - BEECH ST 2298 None None None 2298 0888-12 2012-12-11
In [12]:
statement = "SHOW DATABASES"
df_show = pd.read_sql(statement, conn)
df_show.head(5)
Out[12]:
database_name
0 default
1 dsoaws
2 watersd
In [13]:
if database_name in df_show.values:
    ingest_create_athena_db_passed = True
In [14]:
%store ingest_create_athena_db_passed
Stored 'ingest_create_athena_db_passed' (bool)
In [15]:
pd.read_sql(f'SELECT * FROM {database_name}.{table_name} t1 INNER JOIN \
                            {database_name}.{table_name2} t2 ON t1.seg_id \
                            = t2.seg_id LIMIT 5', conn)
Out[15]:
seg_id oci street street_from street_to seg_length_ft seg_width_ft func_class pvm_class area_sq_ft ... address_street street_from street_to seg_cd length width date_moratorium date_start date_end paving_miles
0 SA-000345 34.14 ALLEY 937.9261 30.0 Alley AC Improved 28137.783 ... ALLEY 2 938 30 2019-01-30 2019-01-30 2019-01-30 0.177652
1 SA-000375 97.25 ALLEY 673.3209 30.0 Alley PCC Jointed Concrete 20199.627 ... ALLEY 2 674 30 2018-08-01 2018-08-01 2018-08-01 0.127652
2 SA-000378 62.67 ALLEY 657.2000 30.0 Alley PCC Jointed Concrete 19716.000 ... ALLEY 2 658 30 2018-08-01 2018-08-01 2018-08-01 0.124621
3 SA-001081 68.86 ALLEY 679.1060 30.0 Alley PCC Jointed Concrete 20373.180 ... ALLEY 9 680 30 None 2020-08-13 2020-08-13 0.128788
4 SA-001083 28.67 ALLEY 660.0917 30.0 Alley PCC Jointed Concrete 19802.752 ... ALLEY 9 661 30 None 2020-07-31 2020-07-31 0.125189

5 rows × 31 columns

In [16]:
df = pd.read_sql(f'SELECT * FROM (SELECT * FROM {database_name}.{table_name} \
                           t1 INNER JOIN {database_name}.{table_name2} t2 \
                           ON t1.seg_id = t2.seg_id) m1 LEFT JOIN (SELECT street_name, \
                                                                   SUM(total_count) total_count \
                                                                   FROM {database_name}.{table_name3} \
                                                                   GROUP BY street_name) t3 \
                           ON m1.address_street = t3.street_name', conn)
In [17]:
df.head(5)
Out[17]:
seg_id oci street street_from street_to seg_length_ft seg_width_ft func_class pvm_class area_sq_ft ... street_to seg_cd length width date_moratorium date_start date_end paving_miles street_name total_count
0 SA-000345 34.14 ALLEY 937.9261 30.0 Alley AC Improved 28137.783 ... 2 938 30 2019-01-30 2019-01-30 2019-01-30 0.177652 None NaN
1 SA-000375 97.25 ALLEY 673.3209 30.0 Alley PCC Jointed Concrete 20199.627 ... 2 674 30 2018-08-01 2018-08-01 2018-08-01 0.127652 None NaN
2 SA-000378 62.67 ALLEY 657.2000 30.0 Alley PCC Jointed Concrete 19716.000 ... 2 658 30 2018-08-01 2018-08-01 2018-08-01 0.124621 None NaN
3 SA-001081 68.86 ALLEY 679.1060 30.0 Alley PCC Jointed Concrete 20373.180 ... 9 680 30 None 2020-08-13 2020-08-13 0.128788 None NaN
4 SA-001083 28.67 ALLEY 660.0917 30.0 Alley PCC Jointed Concrete 19802.752 ... 9 661 30 None 2020-07-31 2020-07-31 0.125189 None NaN

5 rows × 33 columns

In [18]:
# remove duplicated columns
df = df.loc[:,~df.columns.duplicated()]
In [19]:
# create flat .csv file from originally
# merged dataframe
# df.to_csv('original_merge.csv')

Exploratory Data Analysis (EDA)

In [20]:
# get number of rows and columns
print('Number of Rows:', df.shape[0])
print('Number of Columns:', df.shape[1], '\n')

# inspect datatypes and nulls
data_types = df.dtypes
data_types = pd.DataFrame(data_types)
data_types = data_types.assign(Null_Values = 
                               df.isnull().sum())
data_types.reset_index(inplace = True)
data_types.rename(columns={0:'Data Type',
                          'index': 'Column/Variable',
                          'Null_Values': "# of Nulls"})
Number of Rows: 23005
Number of Columns: 30 

Out[20]:
Column/Variable Data Type # of Nulls
0 seg_id object 0
1 oci float64 0
2 street object 0
3 street_from object 0
4 street_to object 0
5 seg_length_ft float64 0
6 seg_width_ft float64 0
7 func_class object 0
8 pvm_class object 0
9 area_sq_ft float64 0
10 oci_desc object 0
11 oci_wt float64 0
12 pve_id int64 0
13 project_id object 0
14 title object 0
15 project_manager object 0
16 project_manager_phone object 0
17 status object 0
18 type object 0
19 resident_engineer object 0
20 address_street object 0
21 seg_cd int64 0
22 length int64 0
23 width int64 0
24 date_moratorium object 4426
25 date_start object 1
26 date_end object 7
27 paving_miles float64 0
28 street_name object 16874
29 total_count float64 16874

Bias Exploration

To explore potential sources of bias, we examine class imbalance in the target feature, "oci_desc."

In [21]:
oci_desc_fair = df['oci_desc'].value_counts()['Fair']
oci_desc_good = df['oci_desc'].value_counts()['Good']
oci_desc_poor = df['oci_desc'].value_counts()['Poor']
oci_desc_total = oci_desc_fair  + oci_desc_good + oci_desc_poor

table1 = PrettyTable() # build a table
table1.field_names = ['Fair Condition', 'Good Condition', 
                      'Poor Condition', 'Total']
table1.add_row([oci_desc_fair, oci_desc_good, oci_desc_poor, 
                oci_desc_total])
table1
Out[21]:
Fair Condition Good Condition Poor Condition Total
6105 15758 1142 23005
In [22]:
perc_good = oci_desc_good /(oci_desc_total) 
perc_fair = oci_desc_fair /(oci_desc_total)
perc_poor = oci_desc_poor /(oci_desc_total)
print(round(perc_good, 2)*100, '% of streets '
                       'are in good condition ')
print(round(perc_fair, 2)*100, '% of streets '
                       'are in fair condition ')
print(round(perc_poor, 2)*100, '% of streets '
                       'are in poor condition ')
68.0 % of streets are in good condition 
27.0 % of streets are in fair condition 
5.0 % of streets are in poor condition 

Considerably more than half of the streets are in good condition. A little less than a third are in fair condition. Only 5% are in poor condition.

In [23]:
# street conditions bar graph
conditions = df['oci_desc'].value_counts()
fig = plt.figure()
conditions.plot.bar(x ='lab', y='val', rot=0, width=0.99, 
                         color="steelblue")
plt.title ('Bar Graph of San Diego Street Conditions')
plt.xlabel('Condition') 
plt.ylabel('Count')
plt.show()

conditions
Out[23]:
Good    15758
Fair     6105
Poor     1142
Name: oci_desc, dtype: int64

While street conditions could be modeled as a multi-class problem, it is simpler to re-class streets in “fair” and “good” condition into a single category and contrast it with the “poor” class. This, in turn, becomes a binary classification problem. Thus, there are now 21,863 streets in good condition and 1,142 in poor condition (only 5% of all streets). This presents a definitive example of class imbalance.

In [24]:
df['oci_cat'] = df['oci_desc'].map({'Good':1, 'Fair':1,
                                    'Poor':0})
cond = df['oci_cat'].value_counts()
cond
Out[24]:
1    21863
0     1142
Name: oci_cat, dtype: int64
In [25]:
# oci ratings bar graph
fig = plt.figure()
cond.plot.bar(x ='lab', y='val', rot=0, width=0.99, 
                         color="steelblue")
plt.title ('Bar Graph of San Diego Street Conditions')
plt.xlabel('Condition') 
plt.ylabel('Count')
plt.show()

cond
Out[25]:
1    21863
0     1142
Name: oci_cat, dtype: int64
In [26]:
# cast oci values into labeled ranges
# (note: each bin below is 10 points wide; the generated labels show only i to i+5)
labels = [ "{0} - {1}".format(i, i + 5) for i in range(0, 100, 10) ]
df['OCI Range'] = pd.cut(df.oci, range(0, 105, 10), 
                                right=False, 
                                labels=labels).astype(object)
# inspect the new dataframe with this info
df[['oci', 'OCI Range']]
Out[26]:
oci OCI Range
0 34.14 30 - 35
1 97.25 90 - 95
2 62.67 60 - 65
3 68.86 60 - 65
4 28.67 20 - 25
... ... ...
23000 93.40 90 - 95
23001 91.01 90 - 95
23002 97.26 90 - 95
23003 95.00 90 - 95
23004 80.83 80 - 85

23005 rows × 2 columns

In [27]:
print("\033[1m"+'Street Conditions by Condition Index Range'+"\033[1m")
def oci_cond():
    oci_desc_good = df.loc[df.oci_desc == 'Good'].groupby(
                               ['OCI Range'])[['oci_desc']].count()
    oci_desc_good.rename(columns = {'oci_desc':'Good'}, inplace=True)
    oci_desc_fair = df.loc[df.oci_desc == 'Fair'].groupby(
                               ['OCI Range'])[['oci_desc']].count()
    oci_desc_fair.rename(columns = {'oci_desc':'Fair'}, inplace=True)
    oci_desc_poor = df.loc[df.oci_desc == 'Poor'].groupby(
                               ['OCI Range'])[['oci_desc']].count()
    oci_desc_poor.rename(columns = {'oci_desc':'Poor'}, inplace=True)
    oci_desc_comb = pd.concat([oci_desc_good, oci_desc_fair, oci_desc_poor], 
    axis = 1)
    # sum row totals
    oci_desc_comb.loc['Total']= oci_desc_comb.sum(numeric_only=True, axis=0)
    # sum column totals
    oci_desc_comb.loc[:,'Total'] = oci_desc_comb.sum(numeric_only=True, axis=1)
    oci_desc_comb.fillna(0, inplace = True)
    return oci_desc_comb.style.format("{:,.0f}")

oci_cond = oci_cond().data # retrieve dataframe
oci_cond
Street Conditions by Condition Index Range
Out[27]:
Good Fair Poor Total
70 - 75 4766.0 3.0 0.0 4769.0
80 - 85 7341.0 0.0 0.0 7341.0
90 - 95 3541.0 0.0 0.0 3541.0
40 - 45 0.0 1095.0 0.0 1095.0
50 - 55 0.0 1685.0 0.0 1685.0
60 - 65 0.0 3322.0 0.0 3322.0
0 - 5 0.0 0.0 37.0 37.0
10 - 15 0.0 0.0 135.0 135.0
20 - 25 0.0 0.0 259.0 259.0
30 - 35 0.0 0.0 711.0 711.0
Total 15648.0 6105.0 1142.0 22895.0
In [28]:
oci_plt = oci_cond['Total'][0:8].sort_values(ascending=False)
oci_plt.plot(kind='bar', width=0.90)
plt.title('Street Conditions by Index Range')
plt.xlabel('Index Range')
plt.ylabel('# of Streets')
plt.show()

Summary Statistics

In [29]:
# summary statistics
summ_stats = pd.DataFrame(df['oci'].describe()).T
summ_stats
Out[29]:
count mean std min 25% 50% 75% max
oci 23005.0 74.791413 16.784048 0.0 66.3 79.06 87.3 100.0
In [30]:
IQR = summ_stats['75%'][0] - summ_stats['25%'][0]
low_outlier = summ_stats['25%'][0] - 1.5*(IQR)
high_outlier = summ_stats['75%'][0] + 1.5*(IQR)
print('Low Outlier:', low_outlier)
print('High Outlier:', high_outlier)
Low Outlier: 34.8
High Outlier: 118.8
In [31]:
print("\033[1m"+'Overall Condition Index (OCI) Summary'+"\033[1m")
def oci_by_range():
    pd.options.display.float_format = '{:,.2f}'.format
    new = df.groupby('OCI Range')['oci']\
    .agg(["mean", "median", "std", "min", "max"])
    new.loc['Total'] = new.sum(numeric_only=True, axis=0)
    column_rename = {'mean': 'Mean', 'median': 'Median',
                     'std': 'Standard Deviation',\
                     'min':'Minimum','max': 'Maximum'}
    dfsummary = new.rename(columns = column_rename)
    return dfsummary
oci_by_range = oci_by_range()
oci_by_range
Overall Condition Index (OCI) Summary
Out[31]:
Mean Median Standard Deviation Minimum Maximum
OCI Range
0 - 5 6.13 8.00 3.70 0.00 9.69
10 - 15 15.66 16.40 2.82 10.11 19.84
20 - 25 25.77 26.17 2.91 20.12 29.96
30 - 35 35.63 36.04 2.80 30.04 39.98
40 - 45 45.37 45.58 2.88 40.00 49.98
50 - 55 55.62 56.00 2.88 50.00 59.98
60 - 65 65.56 65.80 2.82 60.00 69.99
70 - 75 75.11 75.16 2.97 70.00 79.99
80 - 85 85.14 85.15 2.84 80.00 89.99
90 - 95 93.44 92.89 2.57 90.00 99.33
Total 503.42 507.19 29.18 450.27 548.73

Histogram Distributions

In [32]:
# histograms
df.hist(grid=False, figsize=(18,12))
plt.show()

Boxplot Distribution (OCI)

In [33]:
# selected boxplot distribution for oci values
print("\033[1m"+'Boxplot Distribution'+"\033[1m")

# Boxplot of OCI as one way of showing distribution
fig = plt.figure(figsize = (10,1.5))
plt.title ('Boxplot: Overall Condition Index (OCI)')
plt.xlabel('OCI')
plt.ylabel('Value')
sns.boxplot(data=df['oci'], 
            palette="coolwarm", orient='h', 
            linewidth=2.5)
plt.show()

IQR = summ_stats['75%'][0] - summ_stats['25%'][0]

print('The first quartile is %s. '%summ_stats['25%'][0])
print('The third quartile is %s. '%summ_stats['75%'][0])
print('The IQR is %s.'%round(IQR,2))
print('The mean is %s. '%round(summ_stats['mean'][0],2))
print('The standard deviation is %s. '%round(summ_stats['std'][0],2))
print('The median is %s. '%round(summ_stats['50%'][0],2))
Boxplot Distribution
The first quartile is 66.3. 
The third quartile is 87.3. 
The IQR is 21.0.
The mean is 74.79. 
The standard deviation is 16.78. 
The median is 79.06. 

Correlation Matrix

In [34]:
# assign correlation function to new variable
corr = df.corr()
matrix = np.triu(corr) # for triangular matrix
plt.figure(figsize=(10,10))
# parse corr variable intro triangular matrix
sns.heatmap(df.corr(method='pearson'), 
            annot=True, linewidths=.5, 
            cmap="coolwarm", mask=matrix,
            square = True, 
            cbar_kws={'label': 'Correlation Index'}, 
            vmin=-1, vmax=1)
plt.show()

Multicollinearity

Let us narrow our focus by identifying highly correlated predictors so they can be removed and the remaining features passed into a new dataframe.

In [35]:
cor_matrix = df.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),
                                     k=1).astype(bool))

to_drop = [column for column in upper_tri.columns if 
           any(upper_tri[column] > 0.75)]
print('These are the columns prescribed to be dropped: %s'%to_drop)
These are the columns prescribed to be dropped: ['area_sq_ft', 'oci_wt', 'length', 'width', 'paving_miles']

Pre-Processing

Based on the multicollinearity analysis, area_sq_ft, oci_wt, length, width, and paving_miles are candidates for removal. However, area in square feet is derived from the length × width values and is itself converted to paving miles, so removing all of these features is unnecessary. We can keep area in square feet as long as we remove the rest.
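
As a quick sanity check of this reasoning, the derived relationship can be verified directly before the columns are dropped, as in the following minimal sketch:

# sketch: confirm that area_sq_ft is effectively seg_length_ft x seg_width_ft,
# which justifies keeping the area feature and dropping its components
derived_area = df['seg_length_ft'] * df['seg_width_ft']
print('Max absolute difference:', (derived_area - df['area_sq_ft']).abs().max())
print('Correlation with area_sq_ft:', derived_area.corr(df['area_sq_ft']))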

Feature Engineering

The start date is subtracted from the end date, and the difference is stored as a number of days in a new column.

In [36]:
df['date_end'] = pd.to_datetime(df['date_end'])
df['date_start'] = pd.to_datetime(df['date_start'])
# 7 rows with missing values are dropped in the following line
df.dropna(subset=['date_end', 'date_start'], inplace=True)
df['day_diff'] = (df['date_end'] - df['date_start']).dt.days.astype(int)
In [37]:
zero_days = df['day_diff'].value_counts()[0]
percent_days = round(zero_days/len(df), 2)*100
print('There are', zero_days, 'rows with "0".')
print('That is roughly', percent_days, '% of the data.')
There are 18451 rows with "0".
That is roughly 80.0 % of the data.

The residential, collector, major, prime, local, and alley functional classes are mapped to numeric category codes.

In [38]:
df['func_class'].value_counts()
df['func_cat'] = df['func_class'].map({'Residential': 1, 
                                       'Collector': 2,
                                       'Major': 3, 'Prime':4,
                                       'Local':5, 'Alley':6})

The AC Improved, PCC Jointed Concrete, AC Unimproved, and UnSurfaced pavement classes are mapped to numeric category codes.

In [39]:
df['pvm_class'].value_counts()
df['pvm_cat'] = df['pvm_class'].map({'AC Improved': 1, 
                                     'PCC Jointed Concrete': 2,
                                     'AC Unimproved': 3, 
                                     'UnSurfaced':4})

The current status of the job (i.e., post construction, design, bid/award, construction, and planning) is also mapped to numeric category codes.

In [40]:
df['status'].value_counts()
df['status_cat'] = df['status'].map({'post construction': 1, 
                                     'design': 2,
                                     'bid / award': 3, 
                                     'construction':4,
                                     'planning': 5})

Dropping Non-Useful/Re-classed Columns

Columns containing explicit identifiers (i.e., names) and non-convertible or non-meaningful strings are dropped. Columns made redundant by the numeric re-coding above are also dropped, along with the index column, which serves no purpose for this experiment.

In [41]:
# drop unnecessary columns

df = df.drop(columns=['street_from','street_to', 
                      'street_name','seg_id', 'street', 
                      'pve_id', 'title','project_manager', 
                      'project_manager_phone', 'project_id',
                      'resident_engineer', 'address_street', 
                      'date_moratorium', 'OCI Range', 'total_count'])

df = df.reset_index(drop=True)

# drop variables exhibiting multicollinearity
df = df.drop(columns=['seg_length_ft', 'seg_width_ft', 
                      'length', 'width', 'paving_miles',
                      'oci_wt'])

# drop re-classed columns
df = df.drop(columns=['func_class', 'pvm_class', 'status', 
                      'type','date_end', 'date_start', 
                      'oci_desc'])

The original dataframe is copied into a new dataframe, df1, for the final preprocessing steps. This avoids any missteps or unintended effects on the original dataframe.

In [42]:
# create new dataframe for final pre-processing steps
df1 = df.copy()

One consequence of pre-processing data is that additional missing values may be introduced, so one final sanity check for this phenomenon is performed as follows.

In [43]:
df_check = df.isna().sum()
df_check[df_check>0]
Out[43]:
Series([], dtype: int64)
In [44]:
cor_matrix = df.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),
                                     k=1).astype(bool))

to_drop = [column for column in upper_tri.columns if 
           any(upper_tri[column] > 0.75)]
print('These are the columns prescribed to be dropped: %s'%to_drop)
These are the columns prescribed to be dropped: []

Handling Class Imbalance

Multiple methods exist for balancing a dataset, such as “undersampling the majority classes” (Fregly & Barth, 2021, p. 178). To account for the large imbalance (roughly 95% vs. 5%) against the “poor” condition class, “oversampling the minority class up to the majority class” (p. 179) is performed. Before doing so, the unsupervised dimensionality reduction technique of Principal Component Analysis (PCA) is carried out “to compact the dataset and eliminate irrelevant features” (Naseriparsa & Kashani, 2014, p. 33). In this case, the data is reduced to its first two principal components, since they explain the largest percentage of the variance, and these two components are used to visualize the class imbalance.

In [45]:
# the first two principal components are used
pca = PCA(n_components=2, random_state=777)
data_2d = pd.DataFrame(pca.fit_transform(df1.iloc[:,0:9]))
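
As a quick check on how much variance the two retained components capture, the fitted PCA object's explained variance ratio can be inspected, as in this small sketch:

# sketch: proportion of variance captured by the two retained components
print('Explained variance ratio:', pca.explained_variance_ratio_)
print('Total variance explained:', pca.explained_variance_ratio_.sum())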

The dataframe is prepared for scatterplot analysis as follows.

In [46]:
data_2d = pd.concat([data_2d, df1['oci_cat']], axis=1)
data_2d.columns = ['x', 'y', 'oci_cat']; data_2d
Out[46]:
x y oci_cat
0 7,986.11 -38.95 0
1 47.96 -40.17 1
2 -435.67 -39.63 1
3 221.51 -39.71 1
4 -348.92 -39.08 0
... ... ... ...
22993 -15,801.67 -40.53 1
22994 12,768.33 114.26 1
22995 9,128.33 114.06 1
22996 -12,991.67 -40.48 1
22997 -12,800.19 -40.25 1

22998 rows × 3 columns

In [47]:
sns.lmplot('x','y', data_2d, fit_reg=False, hue='oci_cat',
                    palette=['#00BFC4', '#F8766D'])
plt.title('Class Imbalance in Street Condition Index'); plt.show()

The dataset is oversampled into a new dataframe df2.

The adaptive synthetic sampling approach (ADASYN) is leveraged, “where more synthetic data is generated for minority class examples that are harder to learn compared to those minority examples that are easier to learn” (He et al., 2008). This allows the minority class to be up-sampled to approximately match the majority class, yielding a roughly even 50/50 weight distribution.

In [48]:
ada = ADASYN(random_state=777)
X_resampled, y_resampled = ada.fit_resample(df1.iloc[:,0:7], 
                                            df1['oci_cat'])
In [49]:
df2 = pd.concat([pd.DataFrame(X_resampled), 
                 pd.DataFrame(y_resampled)], axis=1)
df2.columns = df1.columns

The classes are re-balanced in a new dataframe using oversampling:

In [50]:
# rebalanced classes in new df
df2['oci_cat'].value_counts()
zero_count = df2['oci_cat'].value_counts()[0]
one_count = df2['oci_cat'].value_counts()[1]
zero_plus_one = zero_count + one_count

print('Poor Condition Size:', zero_count)
print('Good Condition Size:', one_count)
print('Total Condition Size:', zero_plus_one)
print('Percent in Poor Condition:', round(zero_count/zero_plus_one,2))
print('Percent in Good Condition:', round(one_count/zero_plus_one,2))
Poor Condition Size: 21714
Good Condition Size: 21858
Total Condition Size: 43572
Percent in Poor Condition: 0.5
Percent in Good Condition: 0.5

The dataframe can now be prepared as a flat .csv file if so desired.
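
For example, a minimal sketch of that step (with a hypothetical filename) would be:

# sketch: write the balanced dataframe to a local flat file (hypothetical filename)
df2.to_csv('df2_balanced.csv', index=False)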

Train-Test-Validation Split

In [51]:
# split the data: 70% train, 15% validation, 15% test
size_train = 30500
size_valid = 6536
size_test = 6536
size_total = size_test + size_valid + size_train
train, test = train_test_split(df2, train_size = size_train,\
                               random_state = 777)
valid, test = train_test_split(test, train_size = size_valid,\
                               random_state = 777)

print('Training size:', size_train)
print('Validation size:', size_valid)
print('Test size:', size_test)
print('Total size:', size_train + size_valid + size_test)
print('Training percentage:', round(size_train/(size_total),2))
print('Validation percentage:', round(size_valid/(size_total),2))
print('Test percentage:', round(size_test/(size_total),2))
Training size: 30500
Validation size: 6536
Test size: 6536
Total size: 43572
Training percentage: 0.7
Validation percentage: 0.15
Test percentage: 0.15
In [52]:
# define (list) the features
X_var = list(df2.columns)

# define the target
target ='oci_cat'
X_var.remove(target)
X_train = train[X_var]
y_train = train[target]
X_test = test[X_var] 
y_test = test[target]
X_valid = valid[X_var]
y_valid = valid[target]
In [53]:
# rearrange columns so that the target column is set up first
# for later training
df2 = df2[['oci_cat', 'oci', 'area_sq_ft', 'seg_cd', 'day_diff', 
           'func_cat', 'pvm_cat', 'status_cat']]
In [54]:
# reinspect the dataframe
df2.head()
Out[54]:
oci_cat oci area_sq_ft seg_cd day_diff func_cat pvm_cat status_cat
0 0 34.14 28,137.78 2 0 6 1 0
1 1 97.25 20,199.63 2 0 6 2 1
2 1 62.67 19,716.00 2 0 6 2 1
3 1 68.86 20,373.18 9 0 6 2 1
4 0 28.67 19,802.75 9 0 6 2 0

Transfer The Final Dataframe (df2) to S3 Bucket

In [55]:
s3_client = boto3.client("s3")
BUCKET='waterteam1'
KEY='raw_files/df2/df2.csv'
response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

with io.StringIO() as csv_buffer:
    df2.to_csv(csv_buffer, index=False, header=True)

    response = s3_client.put_object(
        Bucket=BUCKET, Key=KEY, Body=csv_buffer.getvalue()
    )

Modeling and Training

Logistic Regression

Herein, the standard scikit-learn approach is leveraged to train the logistic regression model on the training set and evaluate it on the validation set.

In [56]:
# Un-Tuned Logistic Regression Model
logit_reg = LogisticRegression(random_state=777) 
logit_reg.fit(X_train, y_train)

# Predict on validation set
logit_reg_pred1 = logit_reg.predict(X_valid)

# accuracy and classification report (Untuned Model)
print('Untuned Logistic Regression Model')
print('Accuracy Score')
print(accuracy_score(y_valid, logit_reg_pred1))
print('Classification Report \n', 
       classification_report(y_valid, logit_reg_pred1))
Untuned Logistic Regression Model
Accuracy Score
0.8959608323133414
Classification Report 
               precision    recall  f1-score   support

           0       0.93      0.85      0.89      3286
           1       0.86      0.94      0.90      3250

    accuracy                           0.90      6536
   macro avg       0.90      0.90      0.90      6536
weighted avg       0.90      0.90      0.90      6536

Next, the logistic regression model is tuned using RandomizedSearchCV() and cross-validated using repeated stratified k-fold with five splits and two repeats. A search space of hyperparameters is defined, producing an overall best accuracy score together with a set of optimal hyperparameters.

In [57]:
model1 = LogisticRegression(random_state=777)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, 
                             random_state=777)
space = dict()

# define search space
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = loguniform(1e-5, 100)

# define search
search = RandomizedSearchCV(model1, space, 
                            scoring='accuracy', 
n_jobs=-1, cv=cv, random_state=777)

# execute search
result = search.fit(X_train, y_train)

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
Best Score: 0.9518524590163935
Best Hyperparameters: {'C': 0.005639439254142048, 'penalty': 'l2', 'solver': 'lbfgs'}
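
As a follow-up sketch, the best estimator from the search can be scored on the held-out validation set in the same way as the untuned model:

# sketch: evaluate the tuned logistic regression model on the validation set
tuned_pred = result.best_estimator_.predict(X_valid)
print('Tuned Logistic Regression Model')
print('Accuracy Score')
print(accuracy_score(y_valid, tuned_pred))
print('Classification Report \n',
      classification_report(y_valid, tuned_pred))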

Training, Testing, and Deploying a Model with Amazon SageMaker's Built-in XGBoost Model

In [58]:
# Define IAM role
role = get_execution_role()

# set the region of the instance
my_region = boto3.session.Session().region_name 

# this line automatically looks for the XGBoost image URI and 
# builds an XGBoost container.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", 
                                                  my_region, 
                                                  "latest")

print("Success - the MySageMakerInstance is in the " + my_region + \
      " region. You will use the " + xgboost_container + \
      " container for your SageMaker endpoint.")
Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.
In [59]:
train, test = np.split(df2.sample(frac=1, random_state=777), 
                                 [int(0.7 * len(df2))])
print(train.shape, test.shape)
(30500, 8) (13072, 8)

Transfer The Training Data to S3 Bucket

In [60]:
s3_client = boto3.client("s3")
BUCKET='waterteam1'
KEY='raw_files/train/train.csv'
response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

with io.StringIO() as csv_buffer:
    train.to_csv(csv_buffer, index=False, header=False)

    response = s3_client.put_object(
        Bucket=BUCKET, Key=KEY, Body=csv_buffer.getvalue()
    )
In [61]:
# input training parameters
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=\
        's3://{}/raw_files/train'.format(BUCKET), content_type='csv')

Setting Up the SageMaker Session and Specifying the Instance for the XGBoost Model

In [62]:
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(xgboost_container,role, 
                                    instance_count=1, 
                                    instance_type='ml.m5.large',
                                    output_path='s3://{}/output'.format(BUCKET),
                                    sagemaker_session=sess)
# parse in the hyperparameters
xgb.set_hyperparameters(max_depth=5,eta=0.2,gamma=4,min_child_weight=6,
                        subsample=0.8,silent=0,
                        objective='binary:logistic',num_round=100)

Train The Model

In [63]:
xgb.fit({'train': s3_input_train})
2022-04-10 22:17:12 Starting - Starting the training job...
2022-04-10 22:17:29 Starting - Preparing the instances for trainingProfilerReport-1649629032: InProgress
.........
2022-04-10 22:18:55 Downloading - Downloading input data......
2022-04-10 22:19:56 Training - Downloading the training image..Arguments: train
[2022-04-10:22:20:25:INFO] Running standalone xgboost training.
[2022-04-10:22:20:25:INFO] Path /opt/ml/input/data/validation does not exist!
[2022-04-10:22:20:25:INFO] File size need to be processed in the node: 1.09mb. Available memory size in the node: 294.12mb
[2022-04-10:22:20:25:INFO] Determined delimiter of CSV input is ','
[22:20:25] S3DistributionType set as FullyReplicated
[22:20:25] 30500x7 matrix with 213500 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,
[22:20:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[0]#011train-error:3.3e-05
[22:20:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[1]#011train-error:3.3e-05
[22:20:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[2]#011train-error:0
[22:20:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[3]#011train-error:0
[22:20:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[4]#011train-error:0
[22:20:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[5]#011train-error:0
[... boosting rounds 6 through 99 omitted for brevity: train-error remains 0 for every remaining round; from round 36 onward each pruning step reports 0 extra nodes (max_depth=0) ...]

2022-04-10 22:20:43 Uploading - Uploading generated training model
2022-04-10 22:20:43 Completed - Training job completed
Training seconds: 118
Billable seconds: 118

Deploying The Predictor

In [64]:
xgb_predictor = xgb.deploy(initial_instance_count=1,
                           instance_type='ml.m5.large')
-----!

Running Predictions

In [65]:
from sagemaker.serializers import CSVSerializer

# load the data into an array
test_array = test.drop(['oci_cat'], axis=1).values

# set the serializer type
xgb_predictor.serializer = CSVSerializer() 

# predict!
predictions = xgb_predictor.predict(test_array).decode('utf-8') 

# and turn the prediction into an array
predictions_array = np.fromstring(predictions[1:], sep=',') 
print(predictions_array.shape)
(13072,)

Evaluating The Model

In [66]:
cm = pd.crosstab(index=test['oci_cat'], 
                 columns=np.round(predictions_array), 
                 rownames=['Observed'], 
                 colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; 
fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "Poor Condition", 
                                    "Good Condition"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("Poor Condition", \
                                        tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Good Condition", \
                                        fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))
Overall Classification Rate: 100.0%

Predicted      Poor Condition Good Condition
Observed
Poor Condition 100% (6497)     0% (0)
Good Condition  0% (0)    100% (6575) 
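
For completeness, the scikit-learn metrics imported earlier can also be applied to the same predictions, as in this short sketch:

# sketch: precision, recall, and F1 on the SageMaker XGBoost predictions
y_true = test['oci_cat'].values
y_pred = np.round(predictions_array)
print('Precision:', precision_score(y_true, y_pred))
print('Recall:', recall_score(y_true, y_pred))
print('F1 Score:', f1_score(y_true, y_pred))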

Terminating the Endpoint To Save on Costs

In [67]:
# clean up by deleting the endpoint
xgb_predictor.delete_endpoint(delete_endpoint_config=True)

References

Amazon Web Services. (n.d.). Amazon Athena.
https://aws.amazon.com/athena/?whats-new-cards.sort-by=item.additionalFields.postDateTime&whats-new-cards.sort-order=desc

Amazon Web Services. (n.d.). Build, train, and deploy a machine learning model with Amazon SageMaker.
https://aws.amazon.com/getting-started/hands-on/build-train-deploy-machine-learning-model-sagemaker/

Fregly, C., & Barth, A. (2021). Data Science on AWS. O'Reilly.

Garrick, D. (2021, September 12). San Diego to spend $700K assessing street conditions to spend repair money wisely. The San Diego Union-Tribune.
https://www.sandiegouniontribune.com/news/politics/story/2021-09-12/san-diego-to-spend-700k-assessing-street-conditions-to-spend-repair-money-wisely

He, H., Bai, Y., Garcia, E. & Li, S. (2008). ADASYN: Adaptive synthetic sampling approach for imbalanced learning.
2008 IEEE International Joint Conference on Neural Networks (IEEE World Congress on Computational Intelligence),1322-1328.
https://ieeexplore.ieee.org/document/4633969

Naseriparsa, M. & Kashani, M.M.R. (2014). Combination of PCA with SMOTE Resampling to Boost the Prediction Rate in Lung Cancer Dataset.
International Journal of Computer Applications, 77(3), 33-38. https://doi.org/10.5120/13376-0987