IMPORT DATASET AND LIBRARIES
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
# Load the three raw CSV files (features, sales, stores) into DataFrames
features, sales, stores = (
    pd.read_csv(csv_name)
    for csv_name in ('Features_data_set.csv', 'sales_data_set.csv', 'stores_data_set.csv')
)
Shape DataFrame
# function to return month
def get_month(datetime):
    """Return the month number of a date value as an ``int``.

    Date-like objects exposing a ``month`` attribute (for example the
    pandas timestamps produced by ``pd.to_datetime``) are read directly.
    Anything else is converted to text and the second '-'-separated field
    is parsed, so ISO-style strings such as '2010-02-05' keep working.

    Args:
        datetime: A date-like object or a 'YYYY-MM-DD...' string.

    Returns:
        The month as an integer.
    """
    month = getattr(datetime, 'month', None)
    if month is not None:
        return int(month)
    # Fallback for plain strings: '2010-02-05' -> ['2010', '02', '05'] -> 2
    return int(str(datetime).split('-')[1])
# Parse the 'Date' columns as datetimes so both frames merge on identical keys.
# NOTE(review): if the raw CSVs store dates day-first (dd/mm/yyyy), pass
# dayfirst=True to pd.to_datetime -- confirm against the files.
features['Date'] = pd.to_datetime(features['Date'])
sales['Date'] = pd.to_datetime(sales['Date'])

# Merge the three separate CSV files into one frame: sales and features are
# matched on store/date/holiday flag, then store metadata is attached per store.
df = pd.merge(sales, features, on=['Store', 'Date', 'IsHoliday'])
df = pd.merge(df, stores, on=['Store'], how='left')

# Retrieve the month from the date column and add it to its own "Month" column
df['Month'] = df['Date'].apply(get_month)

# Fill NaN elements with zeros
df = df.fillna(0)

# Convert the IsHoliday boolean to numbers (False -> 0, True -> 1) in one
# vectorized cast instead of a per-row lambda.
df['IsHoliday'] = df['IsHoliday'].astype(int)

## Move the target "Weekly_Sales" into the first column
# pop() removes the column and returns it; insert() puts it back at position 0
first_column = df.pop('Weekly_Sales')
df.insert(0, 'Weekly_Sales', first_column)

# Drop the Date column: it is no longer needed now that the month has been
# isolated into its own column.
df = df.drop(columns=['Date'])

# One-hot encode the categorical columns. The dtype is pinned to uint8 so the
# dummies are always written as 0/1; newer pandas versions default to bool
# dummies, which to_csv would serialize as True/False.
df = pd.get_dummies(df, columns=['Type', 'Store', 'Dept'], drop_first=True, dtype='uint8')
df.sample(n=20)
Weekly_Sales | IsHoliday | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | ... | Dept_90 | Dept_91 | Dept_92 | Dept_93 | Dept_94 | Dept_95 | Dept_96 | Dept_97 | Dept_98 | Dept_99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
299268 | 3745.69 | 0 | 94.22 | 3.684 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 215.197852 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
238515 | 3543.48 | 0 | 72.17 | 2.808 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 204.567546 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
389906 | 124014.71 | 0 | 74.47 | 3.646 | 7791.47 | 306.70 | 31.44 | 4856.67 | 4087.29 | 197.832220 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
139476 | 3718.19 | 0 | 65.83 | 2.942 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 132.473333 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
419249 | 15534.95 | 0 | 41.55 | 3.816 | 22832.38 | 2515.25 | 4.00 | 13317.88 | 2560.48 | 190.171493 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
234284 | 3289.22 | 0 | 31.92 | 3.737 | 755.78 | 4142.75 | 190.30 | 90.50 | 2594.66 | 136.959839 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
351381 | 81.08 | 0 | 55.41 | 3.112 | 158.11 | 0.00 | 7.50 | 0.00 | 1316.88 | 218.054185 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
231826 | 8931.06 | 0 | 73.12 | 4.069 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 134.855161 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
414889 | 38683.93 | 0 | 30.54 | 3.109 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 182.551954 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
215061 | 20421.97 | 0 | 50.75 | 3.991 | 21823.53 | 0.00 | 37.87 | 6586.49 | 1565.11 | 142.017793 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
69496 | 49155.24 | 0 | 62.66 | 2.808 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 213.818636 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
264394 | 46906.16 | 0 | 37.24 | 3.874 | 14655.15 | 10670.84 | 162.82 | 11286.99 | 1595.80 | 141.214036 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
323355 | 6781.83 | 0 | 52.43 | 2.699 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 126.491290 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
211688 | 7071.69 | 0 | 58.56 | 4.101 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 138.587106 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
97064 | 8373.04 | 0 | 88.83 | 4.002 | 4407.90 | 0.00 | 7.20 | 3037.56 | 3717.52 | 130.790968 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
85976 | 5302.91 | 0 | 70.94 | 3.688 | 4727.80 | 0.00 | 1.04 | 356.78 | 1077.81 | 225.478263 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
99892 | 10980.97 | 0 | 87.70 | 2.619 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 214.889794 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
250308 | 1437.29 | 0 | 16.70 | 3.215 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 132.951065 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
391104 | 17084.47 | 0 | 54.34 | 2.962 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 126.442065 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
42864 | 2907.10 | 0 | 37.74 | 2.983 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 212.008514 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20 rows × 139 columns
The DataFrame is looking good at this point
# Show the (rows, columns) dimensions of the prepared frame
df.shape
(421570, 139)
Shape Training and Validation Data
Split Data
# Split the data into train and test sets (2% of the rows are held out)
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split reproducible: without it every run
# shuffles differently, so the uploaded CSVs and the reported metrics change
# from run to run.
training_data, testing_data = train_test_split(df, test_size=0.02, random_state=42)
Generate CSV Files
# Write the training split and the held-out split to CSV files.
# Neither a header row nor the index is written; the target column
# (Weekly_Sales) is the first column of each file.
for frame, csv_name in ((training_data, 'training.csv'), (testing_data, 'validation.csv')):
    frame.to_csv(csv_name, header=False, index=False)
Configure Sagemaker
# Boto3 is the Amazon Web Services (AWS) SDK for Python; it lets Python code
# use services such as Amazon S3 and Amazon EC2.
import sagemaker
import boto3
from sagemaker import Session

# Create the SageMaker session used by the estimator below
sagemaker_session = sagemaker.Session()

# Bucket name, plus the folder prefix and object key (both share one value)
bucket = 'tutorial-sagemaker-sales-xgboost'
prefix = key = 'XGBoost-Regressor'

# The execution role gives training and hosting access to the data.
# It is chosen under "Create an IAM role" when the SageMaker instance is opened.
role = sagemaker.get_execution_role()

# Echo the configuration so it can be checked at a glance
for setting in (bucket, prefix, key, role):
    print(setting)
tutorial-sagemaker-sales-xgboost
XGBoost-Regressor
XGBoost-Regressor
arn:aws:iam::483449698840:role/service-role/AmazonSageMaker-ExecutionRole-20211018T115875
Training Data Location
# Read the training CSV file and upload it to the S3 bucket
import os

# S3 object keys always use '/' as the separator, so the key is built
# explicitly instead of with os.path.join (which follows the local OS
# separator). This also keeps the key consistent with the URI printed below.
train_object_key = '{}/train/{}'.format(prefix, key)
with open('training.csv', 'rb') as file:
    # Upload the data into the S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(train_object_key).upload_fileobj(file)

# Print out the training data location in S3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))
uploaded training data location: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/train/XGBoost-Regressor
Validation Data Location
# Read the validation CSV file and upload it to the S3 bucket.
# S3 object keys always use '/' as the separator, so the key is built
# explicitly instead of with os.path.join (which follows the local OS
# separator). This also keeps the key consistent with the URI printed below.
validation_object_key = '{}/validation/{}'.format(prefix, key)
with open('validation.csv', 'rb') as file:
    # Upload the data into the S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(validation_object_key).upload_fileobj(file)

# Print out the validation data location in S3
s3_validation_data = 's3://{}/{}/validation/{}'.format(bucket, prefix, key)
print('uploaded validation data location: {}'.format(s3_validation_data))
uploaded validation data location: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/validation/XGBoost-Regressor
Output Placeholder
# Build the S3 URI (under the same bucket and prefix) that is later passed to
# the estimator as its output_path, then print it for reference.
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))
training artifacts will be uploaded to: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/output
Algorithm Container
# Look up the training container image for a SageMaker built-in algorithm.
# Only the algorithm name, the region of the current boto3 session and a
# version tag are needed.
# NOTE(review): version='latest' does not pin a specific XGBoost release --
# confirm which release this tag resolves to before changing hyperparameters.
region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve('xgboost', region, version='latest')
container
'811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'
Container and Algorithm Parameters
# Build the Estimator: pass the container, the role, the number and type of
# training instances, the output path and the SageMaker session.
# Managed spot instances are requested to reduce cost, with max_run and
# max_wait limits on the job.
Xgboost_regressor = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=output_location,
    sagemaker_session=sagemaker_session,
    # reduce cost with spot instances
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
)

# Recall that XGBoost combines an ensemble of weak models to generate
# accurate/robust results. These hyper-parameters can be tuned to improve
# the performance of the model:
#   max_depth:        maximum depth of each tree.
#   objective:        learning objective (regression here).
#   colsample_bytree: fraction of features used to train each tree.
#   alpha:            L1 regularization term on weights; increasing this value
#                     makes models more conservative.
#   eta:              step size shrinkage used in updates to prevent
#                     overfitting; after each boosting step it shrinks the
#                     feature weights to make boosting more conservative.
#   num_round:        the number of rounds to run the training.
xgboost_hyperparameters = {
    'max_depth': 10,
    'objective': 'reg:linear',
    'colsample_bytree': 0.3,
    'alpha': 10,
    'eta': 0.1,
    'num_round': 100,
}
Xgboost_regressor.set_hyperparameters(**xgboost_hyperparameters)
Data Channels
# Create the "train" and "validation" channels that feed the model.
# Source: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
# Both channels read CSV content from an S3 prefix, so they share these settings.
csv_channel_settings = {'content_type': 'csv', 's3_data_type': 'S3Prefix'}
train_input = sagemaker.inputs.TrainingInput(s3_data=s3_train_data, **csv_channel_settings)
valid_input = sagemaker.inputs.TrainingInput(s3_data=s3_validation_data, **csv_channel_settings)
data_channels = dict(train=train_input, validation=valid_input)
Train Model
# Launch the training job, feeding it the "train" and "validation" channels
Xgboost_regressor.fit(data_channels)
2021-11-04 01:25:49 Starting - Starting the training job...
2021-11-04 01:26:13 Starting - Launching requested ML instancesProfilerReport-1635989149: InProgress
......
2021-11-04 01:27:14 Starting - Preparing the instances for training.........
2021-11-04 01:28:46 Downloading - Downloading input data
2021-11-04 01:28:46 Training - Downloading the training image..[34mArguments: train[0m
[34m[2021-11-04:01:29:02:INFO] Running standalone xgboost training.[0m
[34m[2021-11-04:01:29:02:INFO] File size need to be processed in the node: 134.75mb. Available memory size in the node: 23771.49mb[0m
[34m[2021-11-04:01:29:02:INFO] Determined delimiter of CSV input is ','[0m
[34m[01:29:02] S3DistributionType set as FullyReplicated[0m
[34m[01:29:03] 413138x138 matrix with 57013044 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-11-04:01:29:03:INFO] Determined delimiter of CSV input is ','[0m
[34m[01:29:03] S3DistributionType set as FullyReplicated[0m
[34m[01:29:03] 8432x138 matrix with 1163616 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=,[0m
[34m[01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 280 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[0]#011train-rmse:26438.7#011validation-rmse:26719.4[0m
[34m[01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[1]#011train-rmse:25132#011validation-rmse:25360.7[0m
[34m[01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 134 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[2]#011train-rmse:24277.6#011validation-rmse:24491.7[0m
[34m[01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 516 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[3]#011train-rmse:23507.3#011validation-rmse:23752.3[0m
[34m[01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 376 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[4]#011train-rmse:22410.4#011validation-rmse:22620.7[0m
[34m[01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[5]#011train-rmse:21626.1#011validation-rmse:21807.8[0m
[34m[01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 280 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[6]#011train-rmse:21162.7#011validation-rmse:21349.2[0m
[34m[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 582 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[7]#011train-rmse:20449.9#011validation-rmse:20609.4[0m
2021-11-04 01:29:14 Training - Training image download completed. Training in progress.[34m[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[8]#011train-rmse:20010.4#011validation-rmse:20186.7[0m
[34m[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 512 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[9]#011train-rmse:19158.7#011validation-rmse:19302.7[0m
[34m[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 264 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[10]#011train-rmse:18656.4#011validation-rmse:18803.7[0m
[34m[01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 568 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[11]#011train-rmse:18227.7#011validation-rmse:18348.3[0m
[34m[01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 756 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[12]#011train-rmse:17616.9#011validation-rmse:17700.8[0m
[34m[01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 360 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[13]#011train-rmse:17309.5#011validation-rmse:17375.6[0m
[34m[01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[14]#011train-rmse:16843.7#011validation-rmse:16882[0m
[34m[01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 368 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[15]#011train-rmse:16445#011validation-rmse:16438.4[0m
[34m[01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 322 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[16]#011train-rmse:16258.4#011validation-rmse:16249.4[0m
[34m[01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 362 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[17]#011train-rmse:15938.5#011validation-rmse:15885.8[0m
[34m[01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 270 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[18]#011train-rmse:15584.9#011validation-rmse:15513.5[0m
[34m[01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 534 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[19]#011train-rmse:15283.6#011validation-rmse:15172.6[0m
[34m[01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 424 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[20]#011train-rmse:15006.6#011validation-rmse:14858.5[0m
[34m[01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 778 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[21]#011train-rmse:14646.6#011validation-rmse:14502.8[0m
[34m[01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 282 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[22]#011train-rmse:14464.6#011validation-rmse:14300.9[0m
[34m[01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 256 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[23]#011train-rmse:14199.5#011validation-rmse:14023.8[0m
[34m[01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 688 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[24]#011train-rmse:13979.7#011validation-rmse:13792.4[0m
[34m[01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 828 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[25]#011train-rmse:13745#011validation-rmse:13539.5[0m
[34m[01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 232 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[26]#011train-rmse:13451.5#011validation-rmse:13231.4[0m
[34m[01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 518 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[27]#011train-rmse:13235#011validation-rmse:13032.9[0m
[34m[01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 316 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[28]#011train-rmse:13109#011validation-rmse:12883.8[0m
[34m[01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 170 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[29]#011train-rmse:12992.8#011validation-rmse:12781.5[0m
[34m[01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 242 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[30]#011train-rmse:12886.4#011validation-rmse:12669.8[0m
[34m[01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 790 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[31]#011train-rmse:12636.9#011validation-rmse:12410.4[0m
[34m[01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 328 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[32]#011train-rmse:12378.1#011validation-rmse:12136.2[0m
[34m[01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 306 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[33]#011train-rmse:12252.1#011validation-rmse:12005.4[0m
[34m[01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 556 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[34]#011train-rmse:12085#011validation-rmse:11850.3[0m
[34m[01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 922 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[35]#011train-rmse:11943.4#011validation-rmse:11701[0m
[34m[01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 942 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[36]#011train-rmse:11628.8#011validation-rmse:11417.1[0m
[34m[01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 998 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[37]#011train-rmse:11498.4#011validation-rmse:11306.6[0m
[34m[01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 510 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[38]#011train-rmse:11359.4#011validation-rmse:11172.8[0m
[34m[01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 630 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[39]#011train-rmse:11261.7#011validation-rmse:11084.4[0m
[34m[01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 464 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[40]#011train-rmse:11017.4#011validation-rmse:10820.3[0m
[34m[01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 284 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[41]#011train-rmse:10923.9#011validation-rmse:10712[0m
[34m[01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 824 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[42]#011train-rmse:10716.8#011validation-rmse:10471.9[0m
[34m[01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 760 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[43]#011train-rmse:10506#011validation-rmse:10261.4[0m
[34m[01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 342 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[44]#011train-rmse:10394.5#011validation-rmse:10153.8[0m
[34m[01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 438 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[45]#011train-rmse:10335.6#011validation-rmse:10093.1[0m
[34m[01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 438 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[46]#011train-rmse:10264.1#011validation-rmse:10010[0m
[34m[01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[47]#011train-rmse:10177#011validation-rmse:9920.74[0m
[34m[01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1074 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[48]#011train-rmse:10028.8#011validation-rmse:9776.71[0m
[34m[01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 452 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[49]#011train-rmse:9959.12#011validation-rmse:9699.38[0m
[34m[01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 758 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[50]#011train-rmse:9819.62#011validation-rmse:9583.81[0m
[34m[01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 654 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[51]#011train-rmse:9756.69#011validation-rmse:9518.23[0m
[34m[01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 860 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[52]#011train-rmse:9654.91#011validation-rmse:9430.58[0m
[34m[01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 820 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[53]#011train-rmse:9501.04#011validation-rmse:9297.99[0m
[34m[01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 290 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[54]#011train-rmse:9400.23#011validation-rmse:9201.69[0m
[34m[01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 536 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[55]#011train-rmse:9344.84#011validation-rmse:9149.57[0m
[34m[01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 380 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[56]#011train-rmse:9267.61#011validation-rmse:9069.42[0m
[34m[01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 380 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[57]#011train-rmse:9209.92#011validation-rmse:9007.38[0m
[34m[01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 988 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[58]#011train-rmse:9095.4#011validation-rmse:8891.88[0m
[34m[01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[59]#011train-rmse:9018.6#011validation-rmse:8827.79[0m
[34m[01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 340 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[60]#011train-rmse:8969.99#011validation-rmse:8779.02[0m
[34m[01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 426 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[61]#011train-rmse:8913.37#011validation-rmse:8722.46[0m
[34m[01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1056 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[62]#011train-rmse:8856.65#011validation-rmse:8664.98[0m
[34m[01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 430 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[63]#011train-rmse:8806.69#011validation-rmse:8622.59[0m
[34m[01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 762 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[64]#011train-rmse:8758.35#011validation-rmse:8575.46[0m
[34m[01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 612 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[65]#011train-rmse:8698.18#011validation-rmse:8506.64[0m
[34m[01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 578 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[66]#011train-rmse:8668.53#011validation-rmse:8477.54[0m
[34m[01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 832 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[67]#011train-rmse:8636.11#011validation-rmse:8444.05[0m
[34m[01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 322 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[68]#011train-rmse:8612.05#011validation-rmse:8419.51[0m
[34m[01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 504 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[69]#011train-rmse:8527.51#011validation-rmse:8335.6[0m
[34m[01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 814 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[70]#011train-rmse:8477.03#011validation-rmse:8287.09[0m
[34m[01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 486 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[71]#011train-rmse:8429.81#011validation-rmse:8248.77[0m
[34m[01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 344 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[72]#011train-rmse:8406.38#011validation-rmse:8225.97[0m
[34m[01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 846 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[73]#011train-rmse:8350.88#011validation-rmse:8165.68[0m
[34m[01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 962 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[74]#011train-rmse:8265.71#011validation-rmse:8097.3[0m
[34m[01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 978 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[75]#011train-rmse:8189.77#011validation-rmse:8013.82[0m
[34m[01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[76]#011train-rmse:8160.4#011validation-rmse:7984.17[0m
[34m[01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 698 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[77]#011train-rmse:8088.24#011validation-rmse:7904.87[0m
[34m[01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 416 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[78]#011train-rmse:8065.81#011validation-rmse:7880.81[0m
[34m[01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1030 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[79]#011train-rmse:7985.32#011validation-rmse:7795.9[0m
[34m[01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 414 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[80]#011train-rmse:7959.09#011validation-rmse:7768.53[0m
[34m[01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 506 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[81]#011train-rmse:7899.51#011validation-rmse:7717.77[0m
[34m[01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 996 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[82]#011train-rmse:7842.48#011validation-rmse:7644.3[0m
[34m[01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 394 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[83]#011train-rmse:7812.29#011validation-rmse:7616.29[0m
[34m[01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 582 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[84]#011train-rmse:7775.24#011validation-rmse:7581.92[0m
[34m[01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 930 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[85]#011train-rmse:7732.53#011validation-rmse:7530.5[0m
[34m[01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 368 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[86]#011train-rmse:7707.12#011validation-rmse:7496.32[0m
[34m[01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 454 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[87]#011train-rmse:7684.77#011validation-rmse:7471.27[0m
[34m[01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 588 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[88]#011train-rmse:7642.79#011validation-rmse:7430.39[0m
[34m[01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 154 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[89]#011train-rmse:7614.04#011validation-rmse:7400.1[0m
[34m[01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 632 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[90]#011train-rmse:7537.41#011validation-rmse:7326.29[0m
[34m[01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 636 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[91]#011train-rmse:7516.38#011validation-rmse:7299.69[0m
[34m[01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 686 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[92]#011train-rmse:7456.1#011validation-rmse:7259.76[0m
[34m[01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 584 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[93]#011train-rmse:7435.81#011validation-rmse:7240.72[0m
[34m[01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 570 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[94]#011train-rmse:7409.44#011validation-rmse:7211.32[0m
[34m[01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 578 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[95]#011train-rmse:7375.82#011validation-rmse:7178.67[0m
[34m[01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 362 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[96]#011train-rmse:7354.69#011validation-rmse:7158.87[0m
[34m[01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 330 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[97]#011train-rmse:7340.66#011validation-rmse:7144.72[0m
[34m[01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 706 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[98]#011train-rmse:7301.74#011validation-rmse:7106.24[0m
[34m[01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 760 extra nodes, 0 pruned nodes, max_depth=10[0m
[34m[99]#011train-rmse:7278.08#011validation-rmse:7081.5[0m
2021-11-04 01:29:54 Uploading - Uploading generated training model
2021-11-04 01:29:54 Completed - Training job completed
Training seconds: 75
Billable seconds: 34
Managed Spot Training savings: 54.7%
Deploy Model
# Deploy the trained model to one ml.m5.2xlarge instance to perform inference.
# Note that the name Xgboost_regressor is rebound here: from this line on it
# refers to the object returned by deploy(), not to the estimator.
# NOTE(review): no endpoint clean-up is visible in this part of the notebook --
# confirm the endpoint is deleted once it is no longer needed.
Xgboost_regressor = Xgboost_regressor.deploy(initial_instance_count = 1, instance_type = 'ml.m5.2xlarge')
-----!
Set Serializer
# The deployed model expects its input in text/csv format, so a CSV
# serializer is attached to the deployed predictor.
# A serializer accepts a single argument, the input data, and returns a
# sequence of bytes in the specified content type.
# Reference: https://sagemaker.readthedocs.io/en/stable/predictors.html
from sagemaker.serializers import CSVSerializer
Xgboost_regressor.serializer = CSVSerializer()
Shape Testing Data
Split testing data for metrics later
# Display the held-out split (target column still included)
testing_data
Weekly_Sales | IsHoliday | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | ... | Dept_90 | Dept_91 | Dept_92 | Dept_93 | Dept_94 | Dept_95 | Dept_96 | Dept_97 | Dept_98 | Dept_99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
236081 | 19829.80 | 0 | 76.42 | 3.732 | 571.85 | 126.88 | 145.72 | 614.55 | 1867.65 | 138.233193 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
188104 | 12212.20 | 0 | 55.43 | 2.899 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 203.730749 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
353417 | 1962.50 | 0 | 75.98 | 3.721 | 241.47 | 0.00 | 0.00 | 0.00 | 2143.91 | 221.457860 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
229856 | 82315.53 | 0 | 45.63 | 3.138 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 132.917200 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
13179 | 48465.38 | 0 | 50.81 | 2.771 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 211.547030 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
127670 | 2870.09 | 0 | 67.96 | 3.821 | 10671.71 | 141.83 | 46.00 | 2465.37 | 12372.29 | 131.010333 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
396908 | 115.00 | 0 | 80.06 | 4.277 | 609.75 | 0.00 | 1.10 | 43.67 | 726.34 | 130.959226 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
251735 | 1285.74 | 0 | 59.85 | 3.924 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 134.942548 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
346065 | 5009.87 | 0 | 85.89 | 3.571 | 442.05 | 0.00 | 0.00 | 0.00 | 1381.39 | 220.719961 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
227008 | 130761.81 | 1 | 25.94 | 2.940 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 131.586613 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8432 rows × 139 columns
# Separate the held-out split into features (X_test) and target (y_test)
target_column = 'Weekly_Sales'
X_test = testing_data.drop(columns=[target_column])
y_test = testing_data[target_column]
y_test.shape
(8432,)
# Show the (rows, columns) dimensions of the feature matrix
X_test.shape
(8432, 138)
Remove Target for Predictions Function
The `.predict()` method needs the testing data without the y value (target)
# testing_data.pop('Weekly_Sales')
# testing_data
IsHoliday | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | ... | Dept_90 | Dept_91 | Dept_92 | Dept_93 | Dept_94 | Dept_95 | Dept_96 | Dept_97 | Dept_98 | Dept_99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
373039 | 0 | 70.66 | 2.735 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 132.724839 | 5.326 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
69323 | 0 | 60.18 | 2.719 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 214.164218 | 6.290 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
157684 | 0 | 15.64 | 2.667 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 126.552286 | 6.548 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
76599 | 0 | 62.18 | 3.891 | 6821.13 | 0.00 | 27.04 | 2101.86 | 3627.77 | 224.988362 | 5.679 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
313527 | 1 | 72.56 | 3.596 | 7271.43 | 172.04 | 233.55 | 916.85 | 3089.99 | 198.095048 | 7.872 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
214614 | 0 | 46.06 | 3.867 | 8463.01 | 51.31 | 11.36 | 3719.23 | 4895.18 | 141.554780 | 7.503 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
112581 | 0 | 71.81 | 4.031 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 129.049032 | 13.736 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
224550 | 0 | 33.11 | 3.876 | 8575.57 | 1125.61 | 5.08 | 4365.63 | 2742.87 | 137.506690 | 4.261 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
238214 | 0 | 69.31 | 2.899 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 204.140656 | 7.856 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
261374 | 0 | 61.71 | 4.117 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 138.330312 | 7.725 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8432 rows × 138 columns
Convert Testing Data to float32
Testing data must be converted to float32 format before execution
# Materialize the feature frame as a NumPy array, then cast it to float32
raw_feature_matrix = np.array(X_test)
testing_data_float32 = raw_feature_matrix.astype(np.float32)
testing_data_float32.shape
(8432, 138)
Make Predictions
# custom code to convert the values in bytes format to array
def bytes_2_array(x):
    """Convert a delimited prediction payload into a float32 column vector.

    The payload is decoded as text rather than sliced out of its ``repr``,
    so a trailing newline, newline-separated values, or an empty response
    no longer break the parsing.

    Args:
        x: The raw predictions, normally ``bytes`` such as ``b'1.5,2.5'``.
            Values may be separated by commas and/or newlines. Non-bytes
            input is converted with ``str``.

    Returns:
        A NumPy array of dtype float32 with shape ``(n, 1)``, one row per
        value (``(0, 1)`` for an empty payload).

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    if isinstance(x, (bytes, bytearray)):
        text = bytes(x).decode('utf-8')
    else:
        text = str(x)
    # Treat newlines like commas so both delimiter styles are accepted,
    # and skip empty fields left by trailing delimiters.
    fields = text.replace('\r', '').replace('\n', ',').split(',')
    values = [float(field) for field in fields if field.strip()]
    # Convert the list into a float32 array
    parsed = np.array(values, dtype='float64').astype('float32')
    # Reshape the one-dimensional array into a two-dimensional column
    return parsed.reshape(-1, 1)
Sending a very large dataset to the endpoint in a single request would cause a crash, so predictions should be made on at most 10,000 rows at a time; the 8,432-row test set used here is sent in a single request.
# Send the float32 feature matrix to the deployed endpoint in one call
predictions_bytes = Xgboost_regressor.predict(testing_data_float32)
# Convert the returned payload into a column array of predictions
predicted_values = bytes_2_array(predictions_bytes)
predicted_values.shape
(8432, 1)
Calculate Accuracy
Split Testing Data
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# n = number of test rows, k = number of features; both feed the
# adjusted-R2 correction below.
n, k = X_test.shape

# Error metrics of the predictions against the held-out targets.
# RMSE is the square root of MSE, kept to three decimal places.
MSE = mean_squared_error(y_test, predicted_values)
RMSE = float(format(np.sqrt(MSE), '.3f'))
MAE = mean_absolute_error(y_test, predicted_values)

# R2 and its adjusted variant, which penalizes the number of features
r2 = r2_score(y_test, predicted_values)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)
RMSE = 7377.642
MSE = 54429599.67199714
MAE = 4311.153492210574
R2 = 0.903990280483614
Adjusted R2 = 0.9023926268850053