AWS SageMaker Tutorial: Part 7
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
# Load the three raw CSV exports (features, sales, stores) into DataFrames,
# reading them in the same order as the names they are bound to.
features, sales, stores = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Features_data_set.csv',
        'sales_data_set.csv',
        'stores_data_set.csv',
    )
)
Shape DataFrame
# function to return month
def get_month(datetime):
    """Return the month number of a date-like value as an ``int``.

    Parameters
    ----------
    datetime : object
        Either an object exposing a ``month`` attribute (for example a
        ``datetime.date``/``datetime.datetime`` or a pandas ``Timestamp``),
        or any value whose string form is ``YYYY-MM-DD...``.
        The parameter name is kept as-is so keyword callers keep working.

    Returns
    -------
    int
        The month component.

    Raises
    ------
    IndexError
        If the value has no ``month`` attribute and its string form
        contains no ``-`` separator.
    ValueError
        If the extracted month component cannot be converted to ``int``.
    """
    # Prefer the real attribute: it does not depend on how the value is
    # rendered as text.
    month = getattr(datetime, 'month', None)
    if month is not None:
        return int(month)
    # Fallback for plain strings: the month is the second '-'-separated field.
    return int(str(datetime).split('-')[1])
# Parse the 'Date' columns so that both frames merge on real datetimes.
# NOTE(review): if the raw CSV dates are day-first (DD/MM/YYYY), pass
# dayfirst=True or an explicit format= here -- confirm against the files.
features['Date'] = pd.to_datetime(features['Date'])
sales['Date'] = pd.to_datetime(sales['Date'])
# Merge the three separate csv files: sales + features on the shared keys,
# then attach the per-store attributes.
df = pd.merge(sales, features, on=['Store', 'Date', 'IsHoliday'])
df = pd.merge(df, stores, on=['Store'], how='left')
# Retrieve the month from the date column and add it to its own "Month" column
# (vectorised accessor instead of a per-row Python call).
df['Month'] = df['Date'].dt.month
# Fill up NaN elements with zeros
df = df.fillna(0)
# Convert the IsHoliday boolean to numbers: 0 where the value equals False,
# 1 otherwise.
df['IsHoliday'] = df['IsHoliday'].ne(False).astype(int)
## Move target "Weekly_Sales" into the first column
# remove the column into its own object
first_column = df.pop('Weekly_Sales')
# re-insert it at position 0 via insert(position, column_name, column)
df.insert(0, 'Weekly_Sales', first_column)
# Drop the Date column: it is no longer needed now that the month has been
# isolated into its own column.
df = df.drop(columns=['Date'])
# One-hot encode the categorical columns. dtype is pinned to uint8 so the
# dummies are always numeric 0/1; without it newer pandas releases emit bool
# columns, which would be written to CSV as True/False.
df = pd.get_dummies(
    df,
    columns=['Type', 'Store', 'Dept'],
    drop_first=True,
    dtype=np.uint8,
)
df.sample(n=20)
Weekly_Sales | IsHoliday | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | ... | Dept_90 | Dept_91 | Dept_92 | Dept_93 | Dept_94 | Dept_95 | Dept_96 | Dept_97 | Dept_98 | Dept_99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
299268 | 3745.69 | 0 | 94.22 | 3.684 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 215.197852 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
238515 | 3543.48 | 0 | 72.17 | 2.808 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 204.567546 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
389906 | 124014.71 | 0 | 74.47 | 3.646 | 7791.47 | 306.70 | 31.44 | 4856.67 | 4087.29 | 197.832220 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
139476 | 3718.19 | 0 | 65.83 | 2.942 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 132.473333 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
419249 | 15534.95 | 0 | 41.55 | 3.816 | 22832.38 | 2515.25 | 4.00 | 13317.88 | 2560.48 | 190.171493 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
234284 | 3289.22 | 0 | 31.92 | 3.737 | 755.78 | 4142.75 | 190.30 | 90.50 | 2594.66 | 136.959839 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
351381 | 81.08 | 0 | 55.41 | 3.112 | 158.11 | 0.00 | 7.50 | 0.00 | 1316.88 | 218.054185 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
231826 | 8931.06 | 0 | 73.12 | 4.069 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 134.855161 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
414889 | 38683.93 | 0 | 30.54 | 3.109 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 182.551954 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
215061 | 20421.97 | 0 | 50.75 | 3.991 | 21823.53 | 0.00 | 37.87 | 6586.49 | 1565.11 | 142.017793 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
69496 | 49155.24 | 0 | 62.66 | 2.808 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 213.818636 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
264394 | 46906.16 | 0 | 37.24 | 3.874 | 14655.15 | 10670.84 | 162.82 | 11286.99 | 1595.80 | 141.214036 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
323355 | 6781.83 | 0 | 52.43 | 2.699 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 126.491290 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
211688 | 7071.69 | 0 | 58.56 | 4.101 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 138.587106 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
97064 | 8373.04 | 0 | 88.83 | 4.002 | 4407.90 | 0.00 | 7.20 | 3037.56 | 3717.52 | 130.790968 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
85976 | 5302.91 | 0 | 70.94 | 3.688 | 4727.80 | 0.00 | 1.04 | 356.78 | 1077.81 | 225.478263 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
99892 | 10980.97 | 0 | 87.70 | 2.619 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 214.889794 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
250308 | 1437.29 | 0 | 16.70 | 3.215 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 132.951065 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
391104 | 17084.47 | 0 | 54.34 | 2.962 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 126.442065 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
42864 | 2907.10 | 0 | 37.74 | 2.983 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 212.008514 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20 rows × 139 columns
The DataFrame looks good at this point
# Inspect the (rows, columns) dimensions of the prepared frame
df.shape
(421570, 139)
Shape Training and Validation Data
Split Data
# Split the prepared frame into a training set and a 2% hold-out set
from sklearn.model_selection import train_test_split
# random_state pins the shuffle so the same rows land in each set on every
# run; without it the split (and every metric computed from it) changes
# between executions.
training_data, testing_data = train_test_split(
    df,
    test_size=0.02,
    random_state=42,
)
Generate CSV Files
# Persist both splits as CSV files, written without a header row and without
# the index column.
for split_frame, csv_name in (
    (training_data, 'training.csv'),
    (testing_data, 'validation.csv'),
):
    split_frame.to_csv(csv_name, header=False, index=False)
Configure SageMaker
# boto3 is the AWS SDK for Python; it lets Python code talk to services such
# as Amazon S3 and Amazon EC2.
import sagemaker
import boto3
from sagemaker import Session

# Create the SageMaker session
sagemaker_session = Session()

# Bucket, folder prefix and object name used for the S3 uploads
bucket, prefix, key = (
    'tutorial-sagemaker-sales-xgboost',
    'XGBoost-Regressor',
    'XGBoost-Regressor',
)

# The execution role gives training and hosting access to the data. It is the
# role chosen under "Create an IAM role" when the SageMaker instance was set up.
role = sagemaker.get_execution_role()

# Echo the configuration, one value per line
print(bucket, prefix, key, role, sep='\n')
tutorial-sagemaker-sales-xgboost XGBoost-Regressor XGBoost-Regressor arn:aws:iam::483449698840:role/service-role/AmazonSageMaker-ExecutionRole-20211018T115875
Training Data Location
# Upload the local training CSV to S3 and record its URI for the training job
import os  # retained so that any later cell relying on `os` keeps working

# Build the object key once, joined with '/': os.path.join uses the local
# platform's separator, which is not guaranteed to be '/', and building the
# key separately from the URI below lets the two drift apart.
train_object_key = '{}/train/{}'.format(prefix, key)

with open('training.csv', 'rb') as file:
    # Stream the file into the bucket so it can be read later for training
    s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
    s3_bucket.Object(train_object_key).upload_fileobj(file)

# Let's print out the training data location in s3
s3_train_data = 's3://{}/{}'.format(bucket, train_object_key)
print('uploaded training data location: {}'.format(s3_train_data))
uploaded training data location: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/train/XGBoost-Regressor
Validation Data Location
# Upload the local validation CSV to S3 and record its URI for the training job

# Build the object key once, joined with '/': os.path.join uses the local
# platform's separator, which is not guaranteed to be '/', and building the
# key separately from the URI below lets the two drift apart.
validation_object_key = '{}/validation/{}'.format(prefix, key)

with open('validation.csv', 'rb') as file:
    # Stream the file into the bucket so it can be read later for training
    s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
    s3_bucket.Object(validation_object_key).upload_fileobj(file)

# Let's print out the validation data location in s3
s3_validation_data = 's3://{}/{}'.format(bucket, validation_object_key)
print('uploaded validation data location: {}'.format(s3_validation_data))
uploaded validation data location: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/validation/XGBoost-Regressor
Output Placeholder
# Compose the S3 location (bucket/prefix/output) that is handed to the
# estimator as the destination for the training output, then report it.
output_location = 's3://{}/{}/output'.format(bucket, prefix)
artifact_notice = 'training artifacts will be uploaded to: {}'.format(output_location)
print(artifact_notice)
training artifacts will be uploaded to: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/output
Algorithm Container
# Look up the training container image of SageMaker's built-in XGBoost
# algorithm for the current region; only the algorithm name has to be given.
# NOTE(review): version='latest' -- confirm which XGBoost release this tag
# resolves to before relying on version-specific hyperparameters.
region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='latest',
)
container
'811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'
Container and Algorithm Parameters
# Build the estimator: the container image and role, plus the instance type
# and count used for training, the output path and the SageMaker session.
# More than one training instance can be requested via instance_count.
estimator_settings = dict(
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=output_location,
    sagemaker_session=sagemaker_session,
    # reduce cost with spot instances
    use_spot_instances=True,
    # limits for the job run time and for the total wait
    # (presumably in seconds -- confirm against the SDK docs)
    max_run=300,
    max_wait=600,
)
Xgboost_regressor = sagemaker.estimator.Estimator(
    container,
    role,
    **estimator_settings
)

# Recall that XGBoost combines an ensemble of weak models to produce
# accurate/robust results; the weak models are randomized to avoid overfitting.
# These hyper-parameters can be tuned to improve the model's performance:
#   num_round:        number of rounds to run the training.
#   alpha:            L1 regularization term on weights; increasing it makes
#                     the model more conservative.
#   colsample_bytree: fraction of features used to train each tree.
#   eta:              step size shrinkage applied after each boosting step to
#                     make the boosting process more conservative and prevent
#                     overfitting.
xgboost_hyperparameters = dict(
    max_depth=10,
    objective='reg:linear',
    colsample_bytree=0.3,
    alpha=10,
    eta=0.1,
    num_round=100,
)
Xgboost_regressor.set_hyperparameters(**xgboost_hyperparameters)
Data Channels
# Create the "train" and "validation" channels that feed the model; both
# point at CSV data addressed by S3 prefix.
# Source: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
channel_locations = {
    'train': s3_train_data,
    'validation': s3_validation_data,
}
data_channels = {
    channel_name: sagemaker.inputs.TrainingInput(
        s3_data=s3_uri,
        content_type='csv',
        s3_data_type='S3Prefix',
    )
    for channel_name, s3_uri in channel_locations.items()
}
# Keep the individual inputs available under their original names
train_input = data_channels['train']
valid_input = data_channels['validation']
Train Model
# Start the training job using the configured data channels
Xgboost_regressor.fit(data_channels)
2021-11-04 01:25:49 Starting - Starting the training job... 2021-11-04 01:26:13 Starting - Launching requested ML instancesProfilerReport-1635989149: InProgress ...... 2021-11-04 01:27:14 Starting - Preparing the instances for training......... 2021-11-04 01:28:46 Downloading - Downloading input data 2021-11-04 01:28:46 Training - Downloading the training image..[34mArguments: train[0m [34m[2021-11-04:01:29:02:INFO] Running standalone xgboost training.[0m [34m[2021-11-04:01:29:02:INFO] File size need to be processed in the node: 134.75mb. Available memory size in the node: 23771.49mb[0m [34m[2021-11-04:01:29:02:INFO] Determined delimiter of CSV input is ','[0m [34m[01:29:02] S3DistributionType set as FullyReplicated[0m [34m[01:29:03] 413138x138 matrix with 57013044 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m [34m[2021-11-04:01:29:03:INFO] Determined delimiter of CSV input is ','[0m [34m[01:29:03] S3DistributionType set as FullyReplicated[0m [34m[01:29:03] 8432x138 matrix with 1163616 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=,[0m [34m[01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 280 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[0]#011train-rmse:26438.7#011validation-rmse:26719.4[0m [34m[01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[1]#011train-rmse:25132#011validation-rmse:25360.7[0m [34m[01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 134 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[2]#011train-rmse:24277.6#011validation-rmse:24491.7[0m [34m[01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 516 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[3]#011train-rmse:23507.3#011validation-rmse:23752.3[0m [34m[01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 376 extra nodes, 0 pruned nodes, max_depth=10[0m 
[34m[4]#011train-rmse:22410.4#011validation-rmse:22620.7[0m [34m[01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[5]#011train-rmse:21626.1#011validation-rmse:21807.8[0m [34m[01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 280 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[6]#011train-rmse:21162.7#011validation-rmse:21349.2[0m [34m[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 582 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[7]#011train-rmse:20449.9#011validation-rmse:20609.4[0m
2021-11-04 01:29:14 Training - Training image download completed. Training in progress.[34m[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[8]#011train-rmse:20010.4#011validation-rmse:20186.7[0m [34m[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 512 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[9]#011train-rmse:19158.7#011validation-rmse:19302.7[0m [34m[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 264 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[10]#011train-rmse:18656.4#011validation-rmse:18803.7[0m [34m[01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 568 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[11]#011train-rmse:18227.7#011validation-rmse:18348.3[0m [34m[01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 756 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[12]#011train-rmse:17616.9#011validation-rmse:17700.8[0m [34m[01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 360 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[13]#011train-rmse:17309.5#011validation-rmse:17375.6[0m [34m[01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[14]#011train-rmse:16843.7#011validation-rmse:16882[0m [34m[01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 368 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[15]#011train-rmse:16445#011validation-rmse:16438.4[0m [34m[01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 322 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[16]#011train-rmse:16258.4#011validation-rmse:16249.4[0m [34m[01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 362 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[17]#011train-rmse:15938.5#011validation-rmse:15885.8[0m [34m[01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 270 extra nodes, 0 pruned 
nodes, max_depth=10[0m [34m[18]#011train-rmse:15584.9#011validation-rmse:15513.5[0m [34m[01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 534 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[19]#011train-rmse:15283.6#011validation-rmse:15172.6[0m [34m[01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 424 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[20]#011train-rmse:15006.6#011validation-rmse:14858.5[0m [34m[01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 778 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[21]#011train-rmse:14646.6#011validation-rmse:14502.8[0m [34m[01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 282 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[22]#011train-rmse:14464.6#011validation-rmse:14300.9[0m [34m[01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 256 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[23]#011train-rmse:14199.5#011validation-rmse:14023.8[0m [34m[01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 688 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[24]#011train-rmse:13979.7#011validation-rmse:13792.4[0m [34m[01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 828 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[25]#011train-rmse:13745#011validation-rmse:13539.5[0m [34m[01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 232 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[26]#011train-rmse:13451.5#011validation-rmse:13231.4[0m [34m[01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 518 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[27]#011train-rmse:13235#011validation-rmse:13032.9[0m [34m[01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 316 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[28]#011train-rmse:13109#011validation-rmse:12883.8[0m [34m[01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 170 extra nodes, 0 pruned 
nodes, max_depth=10[0m [34m[29]#011train-rmse:12992.8#011validation-rmse:12781.5[0m [34m[01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 242 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[30]#011train-rmse:12886.4#011validation-rmse:12669.8[0m [34m[01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 790 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[31]#011train-rmse:12636.9#011validation-rmse:12410.4[0m [34m[01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 328 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[32]#011train-rmse:12378.1#011validation-rmse:12136.2[0m [34m[01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 306 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[33]#011train-rmse:12252.1#011validation-rmse:12005.4[0m [34m[01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 556 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[34]#011train-rmse:12085#011validation-rmse:11850.3[0m [34m[01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 922 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[35]#011train-rmse:11943.4#011validation-rmse:11701[0m [34m[01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 942 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[36]#011train-rmse:11628.8#011validation-rmse:11417.1[0m [34m[01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 998 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[37]#011train-rmse:11498.4#011validation-rmse:11306.6[0m [34m[01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 510 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[38]#011train-rmse:11359.4#011validation-rmse:11172.8[0m [34m[01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 630 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[39]#011train-rmse:11261.7#011validation-rmse:11084.4[0m [34m[01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 464 extra nodes, 0 pruned 
nodes, max_depth=10[0m [34m[40]#011train-rmse:11017.4#011validation-rmse:10820.3[0m [34m[01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 284 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[41]#011train-rmse:10923.9#011validation-rmse:10712[0m [34m[01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 824 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[42]#011train-rmse:10716.8#011validation-rmse:10471.9[0m [34m[01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 760 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[43]#011train-rmse:10506#011validation-rmse:10261.4[0m [34m[01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 342 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[44]#011train-rmse:10394.5#011validation-rmse:10153.8[0m [34m[01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 438 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[45]#011train-rmse:10335.6#011validation-rmse:10093.1[0m [34m[01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 438 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[46]#011train-rmse:10264.1#011validation-rmse:10010[0m [34m[01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[47]#011train-rmse:10177#011validation-rmse:9920.74[0m [34m[01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1074 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[48]#011train-rmse:10028.8#011validation-rmse:9776.71[0m [34m[01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 452 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[49]#011train-rmse:9959.12#011validation-rmse:9699.38[0m [34m[01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 758 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[50]#011train-rmse:9819.62#011validation-rmse:9583.81[0m [34m[01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 654 extra nodes, 0 pruned 
nodes, max_depth=10[0m [34m[51]#011train-rmse:9756.69#011validation-rmse:9518.23[0m [34m[01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 860 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[52]#011train-rmse:9654.91#011validation-rmse:9430.58[0m [34m[01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 820 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[53]#011train-rmse:9501.04#011validation-rmse:9297.99[0m [34m[01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 290 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[54]#011train-rmse:9400.23#011validation-rmse:9201.69[0m [34m[01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 536 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[55]#011train-rmse:9344.84#011validation-rmse:9149.57[0m [34m[01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 380 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[56]#011train-rmse:9267.61#011validation-rmse:9069.42[0m [34m[01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 380 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[57]#011train-rmse:9209.92#011validation-rmse:9007.38[0m [34m[01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 988 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[58]#011train-rmse:9095.4#011validation-rmse:8891.88[0m [34m[01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[59]#011train-rmse:9018.6#011validation-rmse:8827.79[0m [34m[01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 340 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[60]#011train-rmse:8969.99#011validation-rmse:8779.02[0m [34m[01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 426 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[61]#011train-rmse:8913.37#011validation-rmse:8722.46[0m [34m[01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1056 extra nodes, 0 
pruned nodes, max_depth=10[0m [34m[62]#011train-rmse:8856.65#011validation-rmse:8664.98[0m [34m[01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 430 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[63]#011train-rmse:8806.69#011validation-rmse:8622.59[0m [34m[01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 762 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[64]#011train-rmse:8758.35#011validation-rmse:8575.46[0m [34m[01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 612 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[65]#011train-rmse:8698.18#011validation-rmse:8506.64[0m [34m[01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 578 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[66]#011train-rmse:8668.53#011validation-rmse:8477.54[0m [34m[01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 832 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[67]#011train-rmse:8636.11#011validation-rmse:8444.05[0m [34m[01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 322 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[68]#011train-rmse:8612.05#011validation-rmse:8419.51[0m [34m[01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 504 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[69]#011train-rmse:8527.51#011validation-rmse:8335.6[0m [34m[01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 814 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[70]#011train-rmse:8477.03#011validation-rmse:8287.09[0m [34m[01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 486 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[71]#011train-rmse:8429.81#011validation-rmse:8248.77[0m [34m[01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 344 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[72]#011train-rmse:8406.38#011validation-rmse:8225.97[0m [34m[01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 846 extra nodes, 
0 pruned nodes, max_depth=10[0m [34m[73]#011train-rmse:8350.88#011validation-rmse:8165.68[0m [34m[01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 962 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[74]#011train-rmse:8265.71#011validation-rmse:8097.3[0m [34m[01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 978 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[75]#011train-rmse:8189.77#011validation-rmse:8013.82[0m [34m[01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[76]#011train-rmse:8160.4#011validation-rmse:7984.17[0m [34m[01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 698 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[77]#011train-rmse:8088.24#011validation-rmse:7904.87[0m [34m[01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 416 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[78]#011train-rmse:8065.81#011validation-rmse:7880.81[0m [34m[01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1030 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[79]#011train-rmse:7985.32#011validation-rmse:7795.9[0m [34m[01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 414 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[80]#011train-rmse:7959.09#011validation-rmse:7768.53[0m [34m[01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 506 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[81]#011train-rmse:7899.51#011validation-rmse:7717.77[0m [34m[01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 996 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[82]#011train-rmse:7842.48#011validation-rmse:7644.3[0m [34m[01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 394 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[83]#011train-rmse:7812.29#011validation-rmse:7616.29[0m [34m[01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 582 extra nodes, 
0 pruned nodes, max_depth=10[0m [34m[84]#011train-rmse:7775.24#011validation-rmse:7581.92[0m [34m[01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 930 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[85]#011train-rmse:7732.53#011validation-rmse:7530.5[0m [34m[01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 368 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[86]#011train-rmse:7707.12#011validation-rmse:7496.32[0m [34m[01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 454 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[87]#011train-rmse:7684.77#011validation-rmse:7471.27[0m [34m[01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 588 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[88]#011train-rmse:7642.79#011validation-rmse:7430.39[0m [34m[01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 154 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[89]#011train-rmse:7614.04#011validation-rmse:7400.1[0m [34m[01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 632 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[90]#011train-rmse:7537.41#011validation-rmse:7326.29[0m [34m[01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 636 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[91]#011train-rmse:7516.38#011validation-rmse:7299.69[0m [34m[01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 686 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[92]#011train-rmse:7456.1#011validation-rmse:7259.76[0m [34m[01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 584 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[93]#011train-rmse:7435.81#011validation-rmse:7240.72[0m [34m[01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 570 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[94]#011train-rmse:7409.44#011validation-rmse:7211.32[0m [34m[01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 578 extra nodes, 
0 pruned nodes, max_depth=10[0m [34m[95]#011train-rmse:7375.82#011validation-rmse:7178.67[0m [34m[01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 362 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[96]#011train-rmse:7354.69#011validation-rmse:7158.87[0m [34m[01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 330 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[97]#011train-rmse:7340.66#011validation-rmse:7144.72[0m [34m[01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 706 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[98]#011train-rmse:7301.74#011validation-rmse:7106.24[0m [34m[01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 760 extra nodes, 0 pruned nodes, max_depth=10[0m [34m[99]#011train-rmse:7278.08#011validation-rmse:7081.5[0m
2021-11-04 01:29:54 Uploading - Uploading generated training model 2021-11-04 01:29:54 Completed - Training job completed Training seconds: 75 Billable seconds: 34 Managed Spot Training savings: 54.7%
Deploy Model
# Deploy the model to perform inference (one ml.m5.2xlarge instance).
# Note: the value returned by deploy() is assigned back to the same name, so
# from this point on Xgboost_regressor no longer refers to the estimator.
Xgboost_regressor = Xgboost_regressor.deploy(initial_instance_count = 1, instance_type = 'ml.m5.2xlarge')
-----!
Set Serializer
# The deployed model expects its input data in text/csv format, so a CSV
# serializer is attached to fix the content type of the data passed to it.
# A serializer accepts a single argument, the input data, and returns a
# sequence of bytes in the specified content type.
# Reference: https://sagemaker.readthedocs.io/en/stable/predictors.html
from sagemaker.serializers import CSVSerializer
Xgboost_regressor.serializer = CSVSerializer()
Shape Testing Data
Split testing data for metrics later
testing_data
Weekly_Sales | IsHoliday | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | ... | Dept_90 | Dept_91 | Dept_92 | Dept_93 | Dept_94 | Dept_95 | Dept_96 | Dept_97 | Dept_98 | Dept_99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
236081 | 19829.80 | 0 | 76.42 | 3.732 | 571.85 | 126.88 | 145.72 | 614.55 | 1867.65 | 138.233193 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
188104 | 12212.20 | 0 | 55.43 | 2.899 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 203.730749 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
353417 | 1962.50 | 0 | 75.98 | 3.721 | 241.47 | 0.00 | 0.00 | 0.00 | 2143.91 | 221.457860 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
229856 | 82315.53 | 0 | 45.63 | 3.138 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 132.917200 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
13179 | 48465.38 | 0 | 50.81 | 2.771 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 211.547030 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
127670 | 2870.09 | 0 | 67.96 | 3.821 | 10671.71 | 141.83 | 46.00 | 2465.37 | 12372.29 | 131.010333 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
396908 | 115.00 | 0 | 80.06 | 4.277 | 609.75 | 0.00 | 1.10 | 43.67 | 726.34 | 130.959226 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
251735 | 1285.74 | 0 | 59.85 | 3.924 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 134.942548 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
346065 | 5009.87 | 0 | 85.89 | 3.571 | 442.05 | 0.00 | 0.00 | 0.00 | 1381.39 | 220.719961 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
227008 | 130761.81 | 1 | 25.94 | 2.940 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 131.586613 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8432 rows × 139 columns
# Separate the test set into the feature matrix (X_test) and the
# 'Weekly_Sales' target column (y_test); testing_data itself is not modified.
X_test = testing_data.drop("Weekly_Sales", axis = 1)
y_test = testing_data['Weekly_Sales']
y_test.shape
(8432,)
X_test.shape
(8432, 138)
Remove Target for Predictions Function
The `.predict()` method needs the testing data without the y value (target).
# testing_data.pop('Weekly_Sales')
# testing_data
IsHoliday | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | ... | Dept_90 | Dept_91 | Dept_92 | Dept_93 | Dept_94 | Dept_95 | Dept_96 | Dept_97 | Dept_98 | Dept_99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
373039 | 0 | 70.66 | 2.735 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 132.724839 | 5.326 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
69323 | 0 | 60.18 | 2.719 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 214.164218 | 6.290 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
157684 | 0 | 15.64 | 2.667 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 126.552286 | 6.548 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
76599 | 0 | 62.18 | 3.891 | 6821.13 | 0.00 | 27.04 | 2101.86 | 3627.77 | 224.988362 | 5.679 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
313527 | 1 | 72.56 | 3.596 | 7271.43 | 172.04 | 233.55 | 916.85 | 3089.99 | 198.095048 | 7.872 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
214614 | 0 | 46.06 | 3.867 | 8463.01 | 51.31 | 11.36 | 3719.23 | 4895.18 | 141.554780 | 7.503 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
112581 | 0 | 71.81 | 4.031 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 129.049032 | 13.736 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
224550 | 0 | 33.11 | 3.876 | 8575.57 | 1125.61 | 5.08 | 4365.63 | 2742.87 | 137.506690 | 4.261 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
238214 | 0 | 69.31 | 2.899 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 204.140656 | 7.856 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
261374 | 0 | 61.71 | 4.117 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 138.330312 | 7.725 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8432 rows × 138 columns
Convert Testing Data to float32
Testing data must be converted to float32 format before execution
# Turn the feature table into a plain NumPy matrix and downcast it to float32;
# this is the array that is later handed to .predict().
testing_data_float32 = np.array(X_test).astype(np.float32)
testing_data_float32.shape
(8432, 138)
Make Predictions
# custom code to convert the values in bytes format to array
def bytes_2_array(x):
    """Convert a delimited prediction payload into a float32 column vector.

    The payload is decoded as text and split on commas and/or whitespace
    (including newlines), so a trailing newline or a newline-delimited
    response parses the same way as a comma-delimited one.

    Args:
        x: Raw prediction response, e.g. ``b'1.5,2.5,3.0'``. ``bytes`` and
            ``bytearray`` are decoded as UTF-8; any other object is
            converted with ``str()``.

    Returns:
        A ``numpy.ndarray`` of dtype ``float32`` with shape ``(n, 1)``,
        where ``n`` is the number of values in the payload. An empty
        payload yields an array of shape ``(0, 1)``.

    Raises:
        ValueError: If the payload cannot be decoded or a token is not a
            valid float.
    """
    # Decode real bytes instead of slicing the "b'...'" repr produced by
    # str(bytes); the repr approach breaks on escapes such as "\n".
    if isinstance(x, (bytes, bytearray)):
        text = x.decode('utf-8')
    else:
        text = str(x)

    # Treat commas like whitespace so one split() handles both delimiters
    # and silently drops empty tokens (leading/trailing separators).
    tokens = text.replace(',', ' ').split()
    numbers = [float(token) for token in tokens]

    # Build as float64 first, then downcast, mirroring the original
    # list -> array -> float32 conversion path.
    values = np.array(numbers, dtype='float64').astype('float32')

    # reshape one-dimensional array to a two-dimensional column vector
    return values.reshape(-1, 1)
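Attempting to run predictions on the entire dataset in a single request would cause a crash because the dataset is so large, so we predict on no more than 10,000 rows at a time. The testing set has only 8,432 rows, so it fits in a single call.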
# Make predictions: pass the float32 test matrix to .predict() on the deployed
# object. The result is treated as a bytes payload by the next step.
predictions_bytes = Xgboost_regressor.predict(testing_data_float32)
# Convert the bytes payload to a float32 array with one column (one row per prediction)
predicted_values = bytes_2_array(predictions_bytes)
predicted_values.shape
(8432, 1)
Calculate Accuracy
Split Testing Data
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# Sample count (n) and number of predictors (k) feed the adjusted R2 formula.
n = len(X_test)
k = X_test.shape[1]

# Error metrics comparing the true weekly sales against the predictions.
MSE = mean_squared_error(y_test, predicted_values)
MAE = mean_absolute_error(y_test, predicted_values)
# RMSE is the square root of the MSE, kept to three decimal places.
RMSE = float(format(np.sqrt(MSE), '.3f'))

# Goodness of fit; the adjusted variant corrects R2 for the number of predictors.
r2 = r2_score(y_test, predicted_values)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)
RMSE = 7377.642 MSE = 54429599.67199714 MAE = 4311.153492210574 R2 = 0.903990280483614 Adjusted R2 = 0.9023926268850053
Comments
Recent Work
Basalt
basalt.softwareFree desktop AI Chat client, designed for developers and businesses. Unlocks advanced model settings only available in the API. Includes quality of life features like custom syntax highlighting.
BidBear
bidbear.ioBidbear is a report automation tool. It downloads Amazon Seller and Advertising reports, daily, to a private database. It then merges and formats the data into beautiful, on demand, exportable performance reports.