AWS SageMaker Tutorial: Part 7

IMPORT DATASET AND LIBRARIES

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
# Load the three raw CSV files (features, sales, stores) into pandas DataFrames
features, sales, stores = (
    pd.read_csv(csv_name)
    for csv_name in ('Features_data_set.csv', 'sales_data_set.csv', 'stores_data_set.csv')
)

Shape DataFrame

# Helper used to pull the month number out of a date value
def get_month(datetime):
    """Return the month as an int, read from the text form of *datetime*.

    The value is converted with ``str()`` and the second '-'-separated
    field (the ``MM`` of ``YYYY-MM-DD...``) is parsed as an integer.
    """
    date_text = str(datetime)
    month_field = date_text.split('-', 2)[1]
    return int(month_field)

# Parse the 'Date' columns into datetimes so both frames merge on comparable keys
# NOTE(review): pd.to_datetime reads ambiguous strings such as 05/02/2010 month-first
# by default - if the raw CSV dates are day-first, pass dayfirst=True. TODO confirm.
features['Date'] = pd.to_datetime(features['Date'])
sales['Date'] = pd.to_datetime(sales['Date'])

# Merge the three separate CSV tables into a single DataFrame
df = pd.merge(sales, features, on=['Store', 'Date', 'IsHoliday'])
df = pd.merge(df, stores, on=['Store'], how='left')

# Retrieve the month from the date column and store it in its own "Month" column.
# The vectorised .dt accessor replaces a per-row string split.
df['Month'] = df['Date'].dt.month

# Fill NaN elements with zeros
df = df.fillna(0)

# Convert the IsHoliday boolean to numbers (False -> 0, True -> 1)
df['IsHoliday'] = df['IsHoliday'].astype(int)

## Move the target "Weekly_Sales" into the first column
# Remove the column into its own object, then re-insert it at position 0
first_column = df.pop('Weekly_Sales')
df.insert(0, 'Weekly_Sales', first_column)

# Drop the Date column
# It is no longer needed now that the month is isolated in its own column
df = df.drop(columns=['Date'])

# One-hot encode the categorical columns.
# dtype is set explicitly: newer pandas releases emit booleans by default, which
# would be written to the CSV files as True/False instead of 0/1.
df = pd.get_dummies(df, columns=['Type', 'Store', 'Dept'], drop_first=True, dtype='uint8')
df.sample(n=20)
Weekly_Sales IsHoliday Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI ... Dept_90 Dept_91 Dept_92 Dept_93 Dept_94 Dept_95 Dept_96 Dept_97 Dept_98 Dept_99
299268 3745.69 0 94.22 3.684 0.00 0.00 0.00 0.00 0.00 215.197852 ... 0 0 0 0 0 0 0 0 0 0
238515 3543.48 0 72.17 2.808 0.00 0.00 0.00 0.00 0.00 204.567546 ... 0 0 0 0 0 0 0 0 0 0
389906 124014.71 0 74.47 3.646 7791.47 306.70 31.44 4856.67 4087.29 197.832220 ... 0 0 1 0 0 0 0 0 0 0
139476 3718.19 0 65.83 2.942 0.00 0.00 0.00 0.00 0.00 132.473333 ... 0 0 0 0 0 0 0 0 0 0
419249 15534.95 0 41.55 3.816 22832.38 2515.25 4.00 13317.88 2560.48 190.171493 ... 0 0 0 0 0 0 0 0 0 0
234284 3289.22 0 31.92 3.737 755.78 4142.75 190.30 90.50 2594.66 136.959839 ... 0 0 0 0 0 0 0 0 0 0
351381 81.08 0 55.41 3.112 158.11 0.00 7.50 0.00 1316.88 218.054185 ... 0 0 0 0 0 0 0 0 0 0
231826 8931.06 0 73.12 4.069 0.00 0.00 0.00 0.00 0.00 134.855161 ... 0 0 0 0 0 0 0 0 0 0
414889 38683.93 0 30.54 3.109 0.00 0.00 0.00 0.00 0.00 182.551954 ... 0 0 0 0 0 0 0 0 0 0
215061 20421.97 0 50.75 3.991 21823.53 0.00 37.87 6586.49 1565.11 142.017793 ... 0 0 0 0 0 0 0 0 0 0
69496 49155.24 0 62.66 2.808 0.00 0.00 0.00 0.00 0.00 213.818636 ... 0 0 0 0 0 0 0 0 0 0
264394 46906.16 0 37.24 3.874 14655.15 10670.84 162.82 11286.99 1595.80 141.214036 ... 0 0 0 1 0 0 0 0 0 0
323355 6781.83 0 52.43 2.699 0.00 0.00 0.00 0.00 0.00 126.491290 ... 0 0 0 0 0 0 0 0 0 0
211688 7071.69 0 58.56 4.101 0.00 0.00 0.00 0.00 0.00 138.587106 ... 0 0 0 0 0 0 0 0 0 0
97064 8373.04 0 88.83 4.002 4407.90 0.00 7.20 3037.56 3717.52 130.790968 ... 0 0 0 0 0 0 0 0 0 0
85976 5302.91 0 70.94 3.688 4727.80 0.00 1.04 356.78 1077.81 225.478263 ... 0 0 0 0 0 0 0 0 0 0
99892 10980.97 0 87.70 2.619 0.00 0.00 0.00 0.00 0.00 214.889794 ... 0 0 0 0 0 0 0 0 0 0
250308 1437.29 0 16.70 3.215 0.00 0.00 0.00 0.00 0.00 132.951065 ... 0 0 0 0 0 0 0 0 0 0
391104 17084.47 0 54.34 2.962 0.00 0.00 0.00 0.00 0.00 126.442065 ... 0 0 0 0 0 0 0 1 0 0
42864 2907.10 0 37.74 2.983 0.00 0.00 0.00 0.00 0.00 212.008514 ... 0 0 0 0 0 0 0 0 0 0

20 rows × 139 columns

The DataFrame looks good at this point.

# Display the (rows, columns) count of the encoded DataFrame
df.shape
(421570, 139)

Shape Training and Validation Data

Split Data

# Split the data into training and hold-out sets (2% of the rows are held out)
from sklearn.model_selection import train_test_split
# A fixed random_state makes the split reproducible, so re-running this cell does not
# silently change which rows were used for training and which for evaluation.
training_data, testing_data = train_test_split(df, test_size=0.02, random_state=42)

Generate CSV Files

# Write the training split and the hold-out split to disk as CSV files,
# with neither a header row nor the index column.
for frame, csv_path in ((training_data, 'training.csv'), (testing_data, 'validation.csv')):
    frame.to_csv(csv_path, header=False, index=False)

Configure Sagemaker

# Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python
# Boto3 allows Python developer to write software that makes use of services like Amazon S3 and Amazon EC2

import sagemaker
import boto3
from sagemaker import Session

# Create the SageMaker session that is later handed to the estimator
sagemaker_session = sagemaker.Session()

# S3 bucket, folder prefix and object name used for this tutorial's files
bucket, prefix, key = (
    'tutorial-sagemaker-sales-xgboost',
    'XGBoost-Regressor',
    'XGBoost-Regressor',
)

# The role gives training and hosting access to the data.
# It is specified when the SageMaker instance is opened, under "Create an IAM role".
role = sagemaker.get_execution_role()

# Echo the configuration for a quick visual check
for setting in (bucket, prefix, key, role):
    print(setting)
tutorial-sagemaker-sales-xgboost
XGBoost-Regressor
XGBoost-Regressor
arn:aws:iam::483449698840:role/service-role/AmazonSageMaker-ExecutionRole-20211018T115875

Training Data Location

# Read the training CSV file and upload it to the S3 bucket
import os  # kept: the import is a module-level name other cells may rely on

# S3 object keys always use '/' as the separator, so the key is built explicitly
# rather than with os.path.join (whose separator depends on the operating system).
# The same key is reused for the upload and for the printed URI so they cannot drift apart.
train_object_key = '{}/train/{}'.format(prefix, key)

with open('training.csv', 'rb') as file:
    # Upload the data into the S3 bucket so it can be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(train_object_key).upload_fileobj(file)

# Print out the training data location in S3
s3_train_data = 's3://{}/{}'.format(bucket, train_object_key)
print('uploaded training data location: {}'.format(s3_train_data))
uploaded training data location: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/train/XGBoost-Regressor

Validation Data Location

# Read the validation CSV file and upload it to the S3 bucket

# S3 object keys always use '/' as the separator, so the key is built explicitly
# rather than with os.path.join (whose separator depends on the operating system).
# The same key is reused for the upload and for the printed URI so they cannot drift apart.
validation_object_key = '{}/validation/{}'.format(prefix, key)

with open('validation.csv', 'rb') as file:
    # Upload the data into the S3 bucket so it can be accessed later during training
    boto3.Session().resource('s3').Bucket(bucket).Object(validation_object_key).upload_fileobj(file)

# Print out the validation data location in S3
s3_validation_data = 's3://{}/{}'.format(bucket, validation_object_key)
print('uploaded validation data location: {}'.format(s3_validation_data))
uploaded validation data location: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/validation/XGBoost-Regressor

Output Placeholder

# S3 location (inside the tutorial bucket/prefix) where the training output is stored

output_location = f's3://{bucket}/{prefix}/output'
print(f'training artifacts will be uploaded to: {output_location}')
training artifacts will be uploaded to: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/output

Algorithm Container

# Look up the training container image of a SageMaker built-in algorithm.
# Only the algorithm name, the region and a version need to be supplied.
# NOTE(review): 'latest' is not a pinned version - consider requesting an explicit
# version for reproducible training; confirm the supported values in the SageMaker docs.

region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve(framework='xgboost', region=region, version='latest')
container
'811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'

Container and Algorithm Parameters

# Configure the training job: the instance type and count, the output path and the
# SageMaker session are all passed to the Estimator.

# Recall that XGBoost works by combining an ensemble of weak models to generate accurate/robust results.
# The weak models are randomized to avoid overfitting.

# num_round: the number of rounds to run the training.

# alpha: L1 regularization term on weights. Increasing this value makes models more conservative.

# colsample_bytree: fraction of features that will be used to train each tree.

# eta: step size shrinkage used in updates to prevent overfitting.
# After each boosting step, eta shrinks the feature weights to make the boosting process more conservative.

estimator_settings = {
    'image_uri': container,
    'role': role,
    'instance_count': 1,
    'instance_type': 'ml.m5.2xlarge',
    'output_path': output_location,
    'sagemaker_session': sagemaker_session,
    # reduce cost with spot instances
    'use_spot_instances': True,
    'max_run': 300,
    'max_wait': 600,
}

Xgboost_regressor = sagemaker.estimator.Estimator(**estimator_settings)

# The hyper-parameters below can be tuned to improve the performance of the model

xgb_hyperparameters = {
    'max_depth': 10,
    'objective': 'reg:linear',
    'colsample_bytree': 0.3,
    'alpha': 10,
    'eta': 0.1,
    'num_round': 100,
}
Xgboost_regressor.set_hyperparameters(**xgb_hyperparameters)

Data Channels

# Build the "train" and "validation" channels that feed data into the model.
# Both point at the CSV objects uploaded to S3 above.
# Source: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html

train_input, valid_input = (
    sagemaker.inputs.TrainingInput(s3_data=s3_uri, content_type='csv', s3_data_type='S3Prefix')
    for s3_uri in (s3_train_data, s3_validation_data)
)

data_channels = dict(train=train_input, validation=valid_input)

Train Model

# Start training with the 'train' and 'validation' channels defined in data_channels
Xgboost_regressor.fit(data_channels)
2021-11-04 01:25:49 Starting - Starting the training job...
2021-11-04 01:26:13 Starting - Launching requested ML instancesProfilerReport-1635989149: InProgress
......
2021-11-04 01:27:14 Starting - Preparing the instances for training.........
2021-11-04 01:28:46 Downloading - Downloading input data
2021-11-04 01:28:46 Training - Downloading the training image..Arguments: train
[2021-11-04:01:29:02:INFO] Running standalone xgboost training.
[2021-11-04:01:29:02:INFO] File size need to be processed in the node: 134.75mb. Available memory size in the node: 23771.49mb
[2021-11-04:01:29:02:INFO] Determined delimiter of CSV input is ','
[01:29:02] S3DistributionType set as FullyReplicated
[01:29:03] 413138x138 matrix with 57013044 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,
[2021-11-04:01:29:03:INFO] Determined delimiter of CSV input is ','
[01:29:03] S3DistributionType set as FullyReplicated
[01:29:03] 8432x138 matrix with 1163616 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=,
[01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 280 extra nodes, 0 pruned nodes, max_depth=10
[0]#011train-rmse:26438.7#011validation-rmse:26719.4
[01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 0 pruned nodes, max_depth=10
[1]#011train-rmse:25132#011validation-rmse:25360.7
[01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 134 extra nodes, 0 pruned nodes, max_depth=10
[2]#011train-rmse:24277.6#011validation-rmse:24491.7
[01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 516 extra nodes, 0 pruned nodes, max_depth=10
[3]#011train-rmse:23507.3#011validation-rmse:23752.3
[01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 376 extra nodes, 0 pruned nodes, max_depth=10
[4]#011train-rmse:22410.4#011validation-rmse:22620.7
[01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10
[5]#011train-rmse:21626.1#011validation-rmse:21807.8
[01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 280 extra nodes, 0 pruned nodes, max_depth=10
[6]#011train-rmse:21162.7#011validation-rmse:21349.2
[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 582 extra nodes, 0 pruned nodes, max_depth=10
[7]#011train-rmse:20449.9#011validation-rmse:20609.4

2021-11-04 01:29:14 Training - Training image download completed. Training in progress.[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 0 pruned nodes, max_depth=10
[8]#011train-rmse:20010.4#011validation-rmse:20186.7
[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 512 extra nodes, 0 pruned nodes, max_depth=10
[9]#011train-rmse:19158.7#011validation-rmse:19302.7
[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 264 extra nodes, 0 pruned nodes, max_depth=10
[10]#011train-rmse:18656.4#011validation-rmse:18803.7
[01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 568 extra nodes, 0 pruned nodes, max_depth=10
[11]#011train-rmse:18227.7#011validation-rmse:18348.3
[01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 756 extra nodes, 0 pruned nodes, max_depth=10
[12]#011train-rmse:17616.9#011validation-rmse:17700.8
[01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 360 extra nodes, 0 pruned nodes, max_depth=10
[13]#011train-rmse:17309.5#011validation-rmse:17375.6
[01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10
[14]#011train-rmse:16843.7#011validation-rmse:16882
[01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 368 extra nodes, 0 pruned nodes, max_depth=10
[15]#011train-rmse:16445#011validation-rmse:16438.4
[01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 322 extra nodes, 0 pruned nodes, max_depth=10
[16]#011train-rmse:16258.4#011validation-rmse:16249.4
[01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 362 extra nodes, 0 pruned nodes, max_depth=10
[17]#011train-rmse:15938.5#011validation-rmse:15885.8
[01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 270 extra nodes, 0 pruned nodes, max_depth=10
[18]#011train-rmse:15584.9#011validation-rmse:15513.5
[01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 534 extra nodes, 0 pruned nodes, max_depth=10
[19]#011train-rmse:15283.6#011validation-rmse:15172.6
[01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 424 extra nodes, 0 pruned nodes, max_depth=10
[20]#011train-rmse:15006.6#011validation-rmse:14858.5
[01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 778 extra nodes, 0 pruned nodes, max_depth=10
[21]#011train-rmse:14646.6#011validation-rmse:14502.8
[01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 282 extra nodes, 0 pruned nodes, max_depth=10
[22]#011train-rmse:14464.6#011validation-rmse:14300.9
[01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 256 extra nodes, 0 pruned nodes, max_depth=10
[23]#011train-rmse:14199.5#011validation-rmse:14023.8
[01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 688 extra nodes, 0 pruned nodes, max_depth=10
[24]#011train-rmse:13979.7#011validation-rmse:13792.4
[01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 828 extra nodes, 0 pruned nodes, max_depth=10
[25]#011train-rmse:13745#011validation-rmse:13539.5
[01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 232 extra nodes, 0 pruned nodes, max_depth=10
[26]#011train-rmse:13451.5#011validation-rmse:13231.4
[01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 518 extra nodes, 0 pruned nodes, max_depth=10
[27]#011train-rmse:13235#011validation-rmse:13032.9
[01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 316 extra nodes, 0 pruned nodes, max_depth=10
[28]#011train-rmse:13109#011validation-rmse:12883.8
[01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 170 extra nodes, 0 pruned nodes, max_depth=10
[29]#011train-rmse:12992.8#011validation-rmse:12781.5
[01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 242 extra nodes, 0 pruned nodes, max_depth=10
[30]#011train-rmse:12886.4#011validation-rmse:12669.8
[01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 790 extra nodes, 0 pruned nodes, max_depth=10
[31]#011train-rmse:12636.9#011validation-rmse:12410.4
[01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 328 extra nodes, 0 pruned nodes, max_depth=10
[32]#011train-rmse:12378.1#011validation-rmse:12136.2
[01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 306 extra nodes, 0 pruned nodes, max_depth=10
[33]#011train-rmse:12252.1#011validation-rmse:12005.4
[01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 556 extra nodes, 0 pruned nodes, max_depth=10
[34]#011train-rmse:12085#011validation-rmse:11850.3
[01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 922 extra nodes, 0 pruned nodes, max_depth=10
[35]#011train-rmse:11943.4#011validation-rmse:11701
[01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 942 extra nodes, 0 pruned nodes, max_depth=10
[36]#011train-rmse:11628.8#011validation-rmse:11417.1
[01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 998 extra nodes, 0 pruned nodes, max_depth=10
[37]#011train-rmse:11498.4#011validation-rmse:11306.6
[01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 510 extra nodes, 0 pruned nodes, max_depth=10
[38]#011train-rmse:11359.4#011validation-rmse:11172.8
[01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 630 extra nodes, 0 pruned nodes, max_depth=10
[39]#011train-rmse:11261.7#011validation-rmse:11084.4
[01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 464 extra nodes, 0 pruned nodes, max_depth=10
[40]#011train-rmse:11017.4#011validation-rmse:10820.3
[01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 284 extra nodes, 0 pruned nodes, max_depth=10
[41]#011train-rmse:10923.9#011validation-rmse:10712
[01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 824 extra nodes, 0 pruned nodes, max_depth=10
[42]#011train-rmse:10716.8#011validation-rmse:10471.9
[01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 760 extra nodes, 0 pruned nodes, max_depth=10
[43]#011train-rmse:10506#011validation-rmse:10261.4
[01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 342 extra nodes, 0 pruned nodes, max_depth=10
[44]#011train-rmse:10394.5#011validation-rmse:10153.8
[01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 438 extra nodes, 0 pruned nodes, max_depth=10
[45]#011train-rmse:10335.6#011validation-rmse:10093.1
[01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 438 extra nodes, 0 pruned nodes, max_depth=10
[46]#011train-rmse:10264.1#011validation-rmse:10010
[01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10
[47]#011train-rmse:10177#011validation-rmse:9920.74
[01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1074 extra nodes, 0 pruned nodes, max_depth=10
[48]#011train-rmse:10028.8#011validation-rmse:9776.71
[01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 452 extra nodes, 0 pruned nodes, max_depth=10
[49]#011train-rmse:9959.12#011validation-rmse:9699.38
[01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 758 extra nodes, 0 pruned nodes, max_depth=10
[50]#011train-rmse:9819.62#011validation-rmse:9583.81
[01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 654 extra nodes, 0 pruned nodes, max_depth=10
[51]#011train-rmse:9756.69#011validation-rmse:9518.23
[01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 860 extra nodes, 0 pruned nodes, max_depth=10
[52]#011train-rmse:9654.91#011validation-rmse:9430.58
[01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 820 extra nodes, 0 pruned nodes, max_depth=10
[53]#011train-rmse:9501.04#011validation-rmse:9297.99
[01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 290 extra nodes, 0 pruned nodes, max_depth=10
[54]#011train-rmse:9400.23#011validation-rmse:9201.69
[01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 536 extra nodes, 0 pruned nodes, max_depth=10
[55]#011train-rmse:9344.84#011validation-rmse:9149.57
[01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 380 extra nodes, 0 pruned nodes, max_depth=10
[56]#011train-rmse:9267.61#011validation-rmse:9069.42
[01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 380 extra nodes, 0 pruned nodes, max_depth=10
[57]#011train-rmse:9209.92#011validation-rmse:9007.38
[01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 988 extra nodes, 0 pruned nodes, max_depth=10
[58]#011train-rmse:9095.4#011validation-rmse:8891.88
[01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=10
[59]#011train-rmse:9018.6#011validation-rmse:8827.79
[01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 340 extra nodes, 0 pruned nodes, max_depth=10
[60]#011train-rmse:8969.99#011validation-rmse:8779.02
[01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 426 extra nodes, 0 pruned nodes, max_depth=10
[61]#011train-rmse:8913.37#011validation-rmse:8722.46
[01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1056 extra nodes, 0 pruned nodes, max_depth=10
[62]#011train-rmse:8856.65#011validation-rmse:8664.98
[01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 430 extra nodes, 0 pruned nodes, max_depth=10
[63]#011train-rmse:8806.69#011validation-rmse:8622.59
[01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 762 extra nodes, 0 pruned nodes, max_depth=10
[64]#011train-rmse:8758.35#011validation-rmse:8575.46
[01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 612 extra nodes, 0 pruned nodes, max_depth=10
[65]#011train-rmse:8698.18#011validation-rmse:8506.64
[01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 578 extra nodes, 0 pruned nodes, max_depth=10
[66]#011train-rmse:8668.53#011validation-rmse:8477.54
[01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 832 extra nodes, 0 pruned nodes, max_depth=10
[67]#011train-rmse:8636.11#011validation-rmse:8444.05
[01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 322 extra nodes, 0 pruned nodes, max_depth=10
[68]#011train-rmse:8612.05#011validation-rmse:8419.51
[01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 504 extra nodes, 0 pruned nodes, max_depth=10
[69]#011train-rmse:8527.51#011validation-rmse:8335.6
[01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 814 extra nodes, 0 pruned nodes, max_depth=10
[70]#011train-rmse:8477.03#011validation-rmse:8287.09
[01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 486 extra nodes, 0 pruned nodes, max_depth=10
[71]#011train-rmse:8429.81#011validation-rmse:8248.77
[01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 344 extra nodes, 0 pruned nodes, max_depth=10
[72]#011train-rmse:8406.38#011validation-rmse:8225.97
[01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 846 extra nodes, 0 pruned nodes, max_depth=10
[73]#011train-rmse:8350.88#011validation-rmse:8165.68
[01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 962 extra nodes, 0 pruned nodes, max_depth=10
[74]#011train-rmse:8265.71#011validation-rmse:8097.3
[01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 978 extra nodes, 0 pruned nodes, max_depth=10
[75]#011train-rmse:8189.77#011validation-rmse:8013.82
[01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10
[76]#011train-rmse:8160.4#011validation-rmse:7984.17
[01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 698 extra nodes, 0 pruned nodes, max_depth=10
[77]#011train-rmse:8088.24#011validation-rmse:7904.87
[01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 416 extra nodes, 0 pruned nodes, max_depth=10
[78]#011train-rmse:8065.81#011validation-rmse:7880.81
[01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1030 extra nodes, 0 pruned nodes, max_depth=10
[79]#011train-rmse:7985.32#011validation-rmse:7795.9
[01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 414 extra nodes, 0 pruned nodes, max_depth=10
[80]#011train-rmse:7959.09#011validation-rmse:7768.53
[01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 506 extra nodes, 0 pruned nodes, max_depth=10
[81]#011train-rmse:7899.51#011validation-rmse:7717.77
[01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 996 extra nodes, 0 pruned nodes, max_depth=10
[82]#011train-rmse:7842.48#011validation-rmse:7644.3
[01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 394 extra nodes, 0 pruned nodes, max_depth=10
[83]#011train-rmse:7812.29#011validation-rmse:7616.29
[01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 582 extra nodes, 0 pruned nodes, max_depth=10
[84]#011train-rmse:7775.24#011validation-rmse:7581.92
[01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 930 extra nodes, 0 pruned nodes, max_depth=10
[85]#011train-rmse:7732.53#011validation-rmse:7530.5
[01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 368 extra nodes, 0 pruned nodes, max_depth=10
[86]#011train-rmse:7707.12#011validation-rmse:7496.32
[01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 454 extra nodes, 0 pruned nodes, max_depth=10
[87]#011train-rmse:7684.77#011validation-rmse:7471.27
[01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 588 extra nodes, 0 pruned nodes, max_depth=10
[88]#011train-rmse:7642.79#011validation-rmse:7430.39
[01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 154 extra nodes, 0 pruned nodes, max_depth=10
[89]#011train-rmse:7614.04#011validation-rmse:7400.1
[01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 632 extra nodes, 0 pruned nodes, max_depth=10
[90]#011train-rmse:7537.41#011validation-rmse:7326.29
[01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 636 extra nodes, 0 pruned nodes, max_depth=10
[91]#011train-rmse:7516.38#011validation-rmse:7299.69
[01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 686 extra nodes, 0 pruned nodes, max_depth=10
[92]#011train-rmse:7456.1#011validation-rmse:7259.76
[01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 584 extra nodes, 0 pruned nodes, max_depth=10
[93]#011train-rmse:7435.81#011validation-rmse:7240.72
[01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 570 extra nodes, 0 pruned nodes, max_depth=10
[94]#011train-rmse:7409.44#011validation-rmse:7211.32
[01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 578 extra nodes, 0 pruned nodes, max_depth=10
[95]#011train-rmse:7375.82#011validation-rmse:7178.67
[01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 362 extra nodes, 0 pruned nodes, max_depth=10
[96]#011train-rmse:7354.69#011validation-rmse:7158.87
[01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 330 extra nodes, 0 pruned nodes, max_depth=10
[97]#011train-rmse:7340.66#011validation-rmse:7144.72
[01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 706 extra nodes, 0 pruned nodes, max_depth=10
[98]#011train-rmse:7301.74#011validation-rmse:7106.24
[01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 760 extra nodes, 0 pruned nodes, max_depth=10
[99]#011train-rmse:7278.08#011validation-rmse:7081.5

2021-11-04 01:29:54 Uploading - Uploading generated training model
2021-11-04 01:29:54 Completed - Training job completed
Training seconds: 75
Billable seconds: 34
Managed Spot Training savings: 54.7%

Deploy Model

# Deploy the trained model so it can perform inference.
# NOTE: this rebinds Xgboost_regressor from the estimator to the object returned by deploy();
# the following cells set .serializer and call .predict() on that object.
# NOTE(review): no endpoint clean-up is visible in this notebook - presumably the endpoint
# should be deleted once predictions are done; confirm.

Xgboost_regressor = Xgboost_regressor.deploy(initial_instance_count = 1, instance_type = 'ml.m5.2xlarge')
-----!

Set Serializer

# The content type describes the format of the data passed to the deployed model.
# Since the deployed model expects data in text/csv format, a CSV serializer is used.

# A serializer accepts a single argument, the input data, and returns a sequence of
# bytes in the specified content type.

# Reference: https://sagemaker.readthedocs.io/en/stable/predictors.html

from sagemaker.serializers import CSVSerializer

Xgboost_regressor.serializer = CSVSerializer()

Shape Testing Data

Split testing data for metrics later

# Preview the hold-out set; it still contains the Weekly_Sales target column
testing_data
Weekly_Sales IsHoliday Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI ... Dept_90 Dept_91 Dept_92 Dept_93 Dept_94 Dept_95 Dept_96 Dept_97 Dept_98 Dept_99
236081 19829.80 0 76.42 3.732 571.85 126.88 145.72 614.55 1867.65 138.233193 ... 0 0 0 0 0 0 0 0 0 0
188104 12212.20 0 55.43 2.899 0.00 0.00 0.00 0.00 0.00 203.730749 ... 0 0 0 0 0 0 0 0 0 0
353417 1962.50 0 75.98 3.721 241.47 0.00 0.00 0.00 2143.91 221.457860 ... 0 0 0 0 0 0 0 0 0 0
229856 82315.53 0 45.63 3.138 0.00 0.00 0.00 0.00 0.00 132.917200 ... 0 0 0 0 0 1 0 0 0 0
13179 48465.38 0 50.81 2.771 0.00 0.00 0.00 0.00 0.00 211.547030 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
127670 2870.09 0 67.96 3.821 10671.71 141.83 46.00 2465.37 12372.29 131.010333 ... 0 0 0 0 0 0 0 0 0 0
396908 115.00 0 80.06 4.277 609.75 0.00 1.10 43.67 726.34 130.959226 ... 0 0 0 0 0 0 0 0 0 0
251735 1285.74 0 59.85 3.924 0.00 0.00 0.00 0.00 0.00 134.942548 ... 0 0 0 0 0 0 0 0 0 0
346065 5009.87 0 85.89 3.571 442.05 0.00 0.00 0.00 1381.39 220.719961 ... 0 0 0 0 0 0 0 0 1 0
227008 130761.81 1 25.94 2.940 0.00 0.00 0.00 0.00 0.00 131.586613 ... 0 0 1 0 0 0 0 0 0 0

8432 rows × 139 columns

# Separate the hold-out set into the feature matrix (X_test) and the target (y_test)

target_column = 'Weekly_Sales'
X_test = testing_data.drop(columns=[target_column])
y_test = testing_data[target_column]
y_test.shape
(8432,)
# Display the feature matrix dimensions (target column already removed)
X_test.shape
(8432, 138)

Remove Target for Predictions Function

The predictor's .predict() method needs the testing data without the y value (target), so only the feature columns are sent to the endpoint.

# testing_data.pop('Weekly_Sales')
# testing_data
IsHoliday Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment ... Dept_90 Dept_91 Dept_92 Dept_93 Dept_94 Dept_95 Dept_96 Dept_97 Dept_98 Dept_99
373039 0 70.66 2.735 0.00 0.00 0.00 0.00 0.00 132.724839 5.326 ... 0 0 0 0 0 0 0 0 0 0
69323 0 60.18 2.719 0.00 0.00 0.00 0.00 0.00 214.164218 6.290 ... 0 0 0 0 0 0 0 0 0 0
157684 0 15.64 2.667 0.00 0.00 0.00 0.00 0.00 126.552286 6.548 ... 0 0 0 0 0 0 0 0 0 0
76599 0 62.18 3.891 6821.13 0.00 27.04 2101.86 3627.77 224.988362 5.679 ... 0 0 0 0 0 0 0 0 0 0
313527 1 72.56 3.596 7271.43 172.04 233.55 916.85 3089.99 198.095048 7.872 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
214614 0 46.06 3.867 8463.01 51.31 11.36 3719.23 4895.18 141.554780 7.503 ... 0 0 0 0 0 0 0 0 0 0
112581 0 71.81 4.031 0.00 0.00 0.00 0.00 0.00 129.049032 13.736 ... 0 0 0 0 0 0 0 0 0 0
224550 0 33.11 3.876 8575.57 1125.61 5.08 4365.63 2742.87 137.506690 4.261 ... 0 0 0 0 0 0 0 0 0 0
238214 0 69.31 2.899 0.00 0.00 0.00 0.00 0.00 204.140656 7.856 ... 0 0 0 0 0 0 0 0 0 0
261374 0 61.71 4.117 0.00 0.00 0.00 0.00 0.00 138.330312 7.725 ... 0 0 0 0 0 0 0 0 0 0

8432 rows × 138 columns

Convert Testing Data to float32

Testing data must be converted to float32 format before execution

# Turn the feature DataFrame into a plain NumPy matrix and cast it to 32-bit floats
testing_data_float32 = X_test.to_numpy().astype(np.float32)
testing_data_float32.shape
(8432, 138)

Make Predictions

# custom code to convert the values in bytes format to array
def bytes_2_array(x):
    """Convert a delimited payload of numbers into a float32 column vector.

    Parameters
    ----------
    x : bytes, bytearray or str
        Numbers separated by commas and/or whitespace (newlines included),
        for example ``b'1.5,2.5'``.

    Returns
    -------
    numpy.ndarray
        Array of dtype float32 with shape ``(n, 1)``; shape ``(0, 1)`` when
        the payload contains no values.

    Raises
    ------
    ValueError
        If any token cannot be parsed as a float.
    """
    # Decode the bytes properly instead of slicing the "b'...'" text produced by
    # str(bytes), which breaks on a trailing newline or on str input.
    if isinstance(x, (bytes, bytearray)):
        text = bytes(x).decode('utf-8')
    else:
        text = str(x)

    # Treating commas as whitespace lets str.split() handle comma- and
    # newline-delimited payloads alike, and drop empty tokens.
    values = [float(token) for token in text.replace(',', ' ').split()]

    # reshape the one-dimensional array to a two-dimensional (n, 1) array
    return np.array(values, dtype='float32').reshape(-1, 1)

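Sending a very large dataset to the endpoint in a single request can cause a crash, so large datasets should be predicted in batches (for example, 10,000 rows at a time). The test set here has only 8,432 rows, so it is sent in one call.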

# Make predictions: send the whole float32 feature matrix to the deployed model in one call

predictions_bytes = Xgboost_regressor.predict(testing_data_float32)
# Convert the returned bytes into an (n, 1) float32 array
predicted_values = bytes_2_array(predictions_bytes)
predicted_values.shape
(8432, 1)

Calculate Accuracy

Compute Regression Metrics

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# n = number of evaluated rows, k = number of predictor columns (used by adjusted R2)
n, k = X_test.shape

# Error metrics between the true weekly sales and the model's predictions;
# RMSE is derived from the MSE and kept to three decimals.
MSE = mean_squared_error(y_test, predicted_values)
RMSE = float(format(np.sqrt(MSE), '.3f'))
MAE = mean_absolute_error(y_test, predicted_values)

# Goodness of fit, plus the variant that penalises the number of predictors
r2 = r2_score(y_test, predicted_values)
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

# Label/value pairs are flattened into a single print call, one metric per line
metric_report = (
    ('RMSE =', RMSE),
    ('\nMSE =', MSE),
    ('\nMAE =', MAE),
    ('\nR2 =', r2),
    ('\nAdjusted R2 =', adj_r2),
)
print(*(piece for label_value in metric_report for piece in label_value))
RMSE = 7377.642
MSE = 54429599.67199714
MAE = 4311.153492210574
R2 = 0.903990280483614
Adjusted R2 = 0.9023926268850053