
AWS SageMaker Tutorial: Part 7

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile

# import the csv files using pandas
features = pd.read_csv('Features_data_set.csv')
sales = pd.read_csv('sales_data_set.csv')
stores = pd.read_csv('stores_data_set.csv')

Shape Dataframe

# function to return the month from a datetime value
def get_month(datetime):
    return int(str(datetime).split('-')[1])

# Change the datatype of the 'Date' column to datetime
features['Date'] = pd.to_datetime(features['Date'])
sales['Date'] = pd.to_datetime(sales['Date'])

# merge the three separate csv files
df = pd.merge(sales, features, on = ['Store','Date','IsHoliday'])
df = pd.merge(df, stores, on = ['Store'], how = 'left')

# retrieve the month from the Date column and add it to its own "Month" column
df['Month'] = df['Date'].apply(get_month)
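# note: an equivalent, more idiomatic one-liner (a sketch, not used in this
# tutorial) would be df['Month'] = df['Date'].dt.month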

# Fill up NaN elements with zeros
df = df.fillna(0)

# Convert IsHoliday boolean to numbers
df['IsHoliday'] = df['IsHoliday'].apply(lambda element : 0 if element == False else 1)

## Move target "Weekly_Sales" into first column
# SageMaker's built-in XGBoost expects the target in the first column of a
# headerless CSV, so we remove the column into its own object
first_column = df.pop('Weekly_Sales')

# insert the column using the insert(position, column_name, column) function
df.insert(0, 'Weekly_Sales', first_column)

# drop the Date column
# We no longer need the Date column, as we have isolated the month into its own column
df = df.drop(columns = ['Date'])

# get dummies
df = pd.get_dummies(df, columns = ['Type', 'Store', 'Dept'], drop_first = True)
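# note: drop_first = True drops one indicator per categorical level, so the
# remaining dummy columns are not perfectly collinear with each other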
df.sample(n=20)
(output: a random sample of 20 rows × 139 columns — Weekly_Sales, IsHoliday, Temperature, Fuel_Price, MarkDown1–MarkDown5, CPI, ... plus the one-hot encoded Type, Store, and Dept columns such as Dept_90–Dept_99)

The dataframe is looking good at this point

df.shape

(421570, 139)

Shape Training and Validation Data

Split Data​

# splitting the data into train and test sets
from sklearn.model_selection import train_test_split
training_data, testing_data = train_test_split(df, test_size = 0.02)
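Note that train_test_split shuffles randomly on each run; a minimal tweak (a sketch, with an arbitrary seed value) would make the split reproducible:

# pass random_state to get the same split every run (the seed 42 is arbitrary)
training_data, testing_data = train_test_split(df, test_size = 0.02, random_state = 42)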

Generate CSV Files​

# save training_data and testing_data as csv files, with no header row or index column
training_data.to_csv('training.csv', header = False, index = False)
testing_data.to_csv('validation.csv', header = False, index = False)
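A quick sanity check (a sketch, not part of the original tutorial) to confirm the files are headerless with the target in the first column:

# read back the first rows; column 0 should hold Weekly_Sales values, not names
check = pd.read_csv('training.csv', header = None, nrows = 3)
print(check.iloc[:, 0])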

Configure SageMaker

# Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python
# Boto3 allows Python developers to write software that makes use of services like Amazon S3 and Amazon EC2

import sagemaker
import boto3

# create sagemaker session
sagemaker_session = sagemaker.Session()

# specify bucket and folder
bucket = 'tutorial-sagemaker-sales-xgboost'
prefix = 'XGBoost-Regressor'
key = 'XGBoost-Regressor'

# Roles give training and hosting access to the data
# This is specified when opening the SageMaker instance in "Create an IAM role"
role = sagemaker.get_execution_role()

print(bucket)
print(prefix)
print(key)
print(role)

tutorial-sagemaker-sales-xgboost
XGBoost-Regressor
XGBoost-Regressor
arn:aws:iam::483449698840:role/service-role/AmazonSageMaker-ExecutionRole-20211018T115875

Training Data Location​

# read the data from the csv file and then upload it to the s3 bucket
import os
with open('training.csv','rb') as file:
    # The following code uploads the data into the S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(file)

# Let's print out the training data location in s3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/train/XGBoost-Regressor

Validation Data Location​

# read the data from the csv file and then upload it to the s3 bucket

with open('validation.csv','rb') as file:
    # The following code uploads the data into the S3 bucket to be accessed later for validation
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation', key)).upload_fileobj(file)
# Let's print out the validation data location in s3
s3_validation_data = 's3://{}/{}/validation/{}'.format(bucket, prefix, key)
print('uploaded validation data location: {}'.format(s3_validation_data))

uploaded validation data location: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/validation/XGBoost-Regressor

Output Placeholder​

# create a placeholder in the S3 bucket where the training output (model artifacts) will be stored

output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

training artifacts will be uploaded to: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/output

Algorithm Container​

# This code retrieves the training container for SageMaker's built-in algorithms
# all we have to do is specify the name of the algorithm we want to use

region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve('xgboost', region, version='latest')
container

'811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'

Container and Algorithm Parameters

# Specify the type and number of instances we would like to use for training,
# along with the output path and SageMaker session, in the Estimator.

# Recall that XGBoost works by combining an ensemble of weak models to generate accurate/robust results.
# The weak models are randomized to avoid overfitting

# num_round: the number of boosting rounds to run during training.

# alpha: L1 regularization term on weights. Increasing this value makes models more conservative.

# colsample_bytree: fraction of features that will be used to train each tree.

# eta: step size shrinkage used in updates to prevent overfitting.
# After each boosting step, eta shrinks the feature weights to make the boosting process more conservative.

Xgboost_regressor = sagemaker.estimator.Estimator(container,
                                                  role,
                                                  instance_count = 1,
                                                  instance_type = 'ml.m5.2xlarge',
                                                  output_path = output_location,
                                                  sagemaker_session = sagemaker_session,
                                                  # reduce cost with spot instances
                                                  # (max_wait must be >= max_run)
                                                  use_spot_instances = True,
                                                  max_run = 300,
                                                  max_wait = 600)

# We can tune the hyperparameters to improve the performance of the model

Xgboost_regressor.set_hyperparameters(max_depth = 10,
                                      objective = 'reg:linear',
                                      colsample_bytree = 0.3,
                                      alpha = 10,
                                      eta = 0.1,
                                      num_round = 100)

Data Channels​

# Creating "train", "validation" channels to feed in the model
# Source: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html

train_input = sagemaker.inputs.TrainingInput(s3_data = s3_train_data, content_type='csv',s3_data_type = 'S3Prefix')
valid_input = sagemaker.inputs.TrainingInput(s3_data = s3_validation_data, content_type='csv',s3_data_type = 'S3Prefix')


data_channels = {'train': train_input,'validation': valid_input}

Train Model

Xgboost_regressor.fit(data_channels)

2021-11-04 01:25:49 Starting - Starting the training job... 2021-11-04 01:26:13 Starting - Launching requested ML instancesProfilerReport-1635989149: InProgress ...... 2021-11-04 01:27:14 Starting - Preparing the instances for training......... 2021-11-04 01:28:46 Downloading - Downloading input data 2021-11-04 01:28:46 Training - Downloading the training image..Arguments: train [2021-11-04:01:29:02:INFO] Running standalone xgboost training. [2021-11-04:01:29:02:INFO] File size need to be processed in the node: 134.75mb. Available memory size in the node: 23771.49mb [2021-11-04:01:29:02:INFO] Determined delimiter of CSV input is ',' [01:29:02] S3DistributionType set as FullyReplicated [01:29:03] 413138x138 matrix with 57013044 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=, [2021-11-04:01:29:03:INFO] Determined delimiter of CSV input is ',' [01:29:03] S3DistributionType set as FullyReplicated [01:29:03] 8432x138 matrix with 1163616 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=, [01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 280 extra nodes, 0 pruned nodes, max_depth=10 [0]#011train-rmse:26438.7#011validation-rmse:26719.4 [01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 0 pruned nodes, max_depth=10 [1]#011train-rmse:25132#011validation-rmse:25360.7 [01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 134 extra nodes, 0 pruned nodes, max_depth=10 [2]#011train-rmse:24277.6#011validation-rmse:24491.7 [01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 516 extra nodes, 0 pruned nodes, max_depth=10 [3]#011train-rmse:23507.3#011validation-rmse:23752.3 [01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 376 extra nodes, 0 pruned nodes, max_depth=10 [4]#011train-rmse:22410.4#011validation-rmse:22620.7 [01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10 [5]#011train-rmse:21626.1#011validation-rmse:21807.8 [01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 280 extra nodes, 0 pruned nodes, max_depth=10 [6]#011train-rmse:21162.7#011validation-rmse:21349.2 [01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 582 extra nodes, 0 pruned nodes, max_depth=10 [7]#011train-rmse:20449.9#011validation-rmse:20609.4

2021-11-04 01:29:14 Training - Training image download completed. Training in progress.[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 0 pruned nodes, max_depth=10 [8]#011train-rmse:20010.4#011validation-rmse:20186.7 [01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 512 extra nodes, 0 pruned nodes, max_depth=10 [9]#011train-rmse:19158.7#011validation-rmse:19302.7 [01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 264 extra nodes, 0 pruned nodes, max_depth=10 [10]#011train-rmse:18656.4#011validation-rmse:18803.7 [01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 568 extra nodes, 0 pruned nodes, max_depth=10 [11]#011train-rmse:18227.7#011validation-rmse:18348.3 [01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 756 extra nodes, 0 pruned nodes, max_depth=10 [12]#011train-rmse:17616.9#011validation-rmse:17700.8 [01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 360 extra nodes, 0 pruned nodes, max_depth=10 [13]#011train-rmse:17309.5#011validation-rmse:17375.6 [01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10 [14]#011train-rmse:16843.7#011validation-rmse:16882 [01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 368 extra nodes, 0 pruned nodes, max_depth=10 [15]#011train-rmse:16445#011validation-rmse:16438.4 [01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 322 extra nodes, 0 pruned nodes, max_depth=10 [16]#011train-rmse:16258.4#011validation-rmse:16249.4 [01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 362 extra nodes, 0 pruned nodes, max_depth=10 [17]#011train-rmse:15938.5#011validation-rmse:15885.8 [01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 270 extra nodes, 0 pruned nodes, max_depth=10 [18]#011train-rmse:15584.9#011validation-rmse:15513.5 [01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 534 extra nodes, 0 pruned nodes, max_depth=10 [19]#011train-rmse:15283.6#011validation-rmse:15172.6 [01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 424 extra nodes, 0 pruned nodes, max_depth=10 [20]#011train-rmse:15006.6#011validation-rmse:14858.5 [01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 778 extra nodes, 0 pruned nodes, max_depth=10 [21]#011train-rmse:14646.6#011validation-rmse:14502.8 [01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 282 extra nodes, 0 pruned nodes, max_depth=10 [22]#011train-rmse:14464.6#011validation-rmse:14300.9 [01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 256 extra nodes, 0 pruned nodes, max_depth=10 [23]#011train-rmse:14199.5#011validation-rmse:14023.8 [01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 688 extra nodes, 0 pruned nodes, max_depth=10 [24]#011train-rmse:13979.7#011validation-rmse:13792.4 [01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 828 extra nodes, 0 pruned nodes, max_depth=10 [25]#011train-rmse:13745#011validation-rmse:13539.5 [01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 232 extra nodes, 0 pruned nodes, max_depth=10 [26]#011train-rmse:13451.5#011validation-rmse:13231.4 [01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 518 extra nodes, 0 pruned nodes, max_depth=10 [27]#011train-rmse:13235#011validation-rmse:13032.9 [01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 316 extra nodes, 0 pruned nodes, max_depth=10 
[28]#011train-rmse:13109#011validation-rmse:12883.8 [01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 170 extra nodes, 0 pruned nodes, max_depth=10 [29]#011train-rmse:12992.8#011validation-rmse:12781.5 [01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 242 extra nodes, 0 pruned nodes, max_depth=10 [30]#011train-rmse:12886.4#011validation-rmse:12669.8 [01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 790 extra nodes, 0 pruned nodes, max_depth=10 [31]#011train-rmse:12636.9#011validation-rmse:12410.4 [01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 328 extra nodes, 0 pruned nodes, max_depth=10 [32]#011train-rmse:12378.1#011validation-rmse:12136.2 [01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 306 extra nodes, 0 pruned nodes, max_depth=10 [33]#011train-rmse:12252.1#011validation-rmse:12005.4 [01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 556 extra nodes, 0 pruned nodes, max_depth=10 [34]#011train-rmse:12085#011validation-rmse:11850.3 [01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 922 extra nodes, 0 pruned nodes, max_depth=10 [35]#011train-rmse:11943.4#011validation-rmse:11701 [01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 942 extra nodes, 0 pruned nodes, max_depth=10 [36]#011train-rmse:11628.8#011validation-rmse:11417.1 [01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 998 extra nodes, 0 pruned nodes, max_depth=10 [37]#011train-rmse:11498.4#011validation-rmse:11306.6 [01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 510 extra nodes, 0 pruned nodes, max_depth=10 [38]#011train-rmse:11359.4#011validation-rmse:11172.8 [01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 630 extra nodes, 0 pruned nodes, max_depth=10 [39]#011train-rmse:11261.7#011validation-rmse:11084.4 [01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 464 extra nodes, 0 pruned nodes, max_depth=10 [40]#011train-rmse:11017.4#011validation-rmse:10820.3 [01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 284 extra nodes, 0 pruned nodes, max_depth=10 [41]#011train-rmse:10923.9#011validation-rmse:10712 [01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 824 extra nodes, 0 pruned nodes, max_depth=10 [42]#011train-rmse:10716.8#011validation-rmse:10471.9 [01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 760 extra nodes, 0 pruned nodes, max_depth=10 [43]#011train-rmse:10506#011validation-rmse:10261.4 [01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 342 extra nodes, 0 pruned nodes, max_depth=10 [44]#011train-rmse:10394.5#011validation-rmse:10153.8 [01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 438 extra nodes, 0 pruned nodes, max_depth=10 [45]#011train-rmse:10335.6#011validation-rmse:10093.1 [01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 438 extra nodes, 0 pruned nodes, max_depth=10 [46]#011train-rmse:10264.1#011validation-rmse:10010 [01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10 [47]#011train-rmse:10177#011validation-rmse:9920.74 [01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1074 extra nodes, 0 pruned nodes, max_depth=10 [48]#011train-rmse:10028.8#011validation-rmse:9776.71 [01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 452 extra nodes, 0 pruned nodes, max_depth=10 
[49]#011train-rmse:9959.12#011validation-rmse:9699.38 [01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 758 extra nodes, 0 pruned nodes, max_depth=10 [50]#011train-rmse:9819.62#011validation-rmse:9583.81 [01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 654 extra nodes, 0 pruned nodes, max_depth=10 [51]#011train-rmse:9756.69#011validation-rmse:9518.23 [01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 860 extra nodes, 0 pruned nodes, max_depth=10 [52]#011train-rmse:9654.91#011validation-rmse:9430.58 [01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 820 extra nodes, 0 pruned nodes, max_depth=10 [53]#011train-rmse:9501.04#011validation-rmse:9297.99 [01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 290 extra nodes, 0 pruned nodes, max_depth=10 [54]#011train-rmse:9400.23#011validation-rmse:9201.69 [01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 536 extra nodes, 0 pruned nodes, max_depth=10 [55]#011train-rmse:9344.84#011validation-rmse:9149.57 [01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 380 extra nodes, 0 pruned nodes, max_depth=10 [56]#011train-rmse:9267.61#011validation-rmse:9069.42 [01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 380 extra nodes, 0 pruned nodes, max_depth=10 [57]#011train-rmse:9209.92#011validation-rmse:9007.38 [01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 988 extra nodes, 0 pruned nodes, max_depth=10 [58]#011train-rmse:9095.4#011validation-rmse:8891.88 [01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=10 [59]#011train-rmse:9018.6#011validation-rmse:8827.79 [01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 340 extra nodes, 0 pruned nodes, max_depth=10 [60]#011train-rmse:8969.99#011validation-rmse:8779.02 [01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 426 extra nodes, 0 pruned nodes, max_depth=10 [61]#011train-rmse:8913.37#011validation-rmse:8722.46 [01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1056 extra nodes, 0 pruned nodes, max_depth=10 [62]#011train-rmse:8856.65#011validation-rmse:8664.98 [01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 430 extra nodes, 0 pruned nodes, max_depth=10 [63]#011train-rmse:8806.69#011validation-rmse:8622.59 [01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 762 extra nodes, 0 pruned nodes, max_depth=10 [64]#011train-rmse:8758.35#011validation-rmse:8575.46 [01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 612 extra nodes, 0 pruned nodes, max_depth=10 [65]#011train-rmse:8698.18#011validation-rmse:8506.64 [01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 578 extra nodes, 0 pruned nodes, max_depth=10 [66]#011train-rmse:8668.53#011validation-rmse:8477.54 [01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 832 extra nodes, 0 pruned nodes, max_depth=10 [67]#011train-rmse:8636.11#011validation-rmse:8444.05 [01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 322 extra nodes, 0 pruned nodes, max_depth=10 [68]#011train-rmse:8612.05#011validation-rmse:8419.51 [01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 504 extra nodes, 0 pruned nodes, max_depth=10 [69]#011train-rmse:8527.51#011validation-rmse:8335.6 [01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 814 extra nodes, 0 pruned nodes, max_depth=10 
[70]#011train-rmse:8477.03#011validation-rmse:8287.09 [01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 486 extra nodes, 0 pruned nodes, max_depth=10 [71]#011train-rmse:8429.81#011validation-rmse:8248.77 [01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 344 extra nodes, 0 pruned nodes, max_depth=10 [72]#011train-rmse:8406.38#011validation-rmse:8225.97 [01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 846 extra nodes, 0 pruned nodes, max_depth=10 [73]#011train-rmse:8350.88#011validation-rmse:8165.68 [01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 962 extra nodes, 0 pruned nodes, max_depth=10 [74]#011train-rmse:8265.71#011validation-rmse:8097.3 [01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 978 extra nodes, 0 pruned nodes, max_depth=10 [75]#011train-rmse:8189.77#011validation-rmse:8013.82 [01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10 [76]#011train-rmse:8160.4#011validation-rmse:7984.17 [01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 698 extra nodes, 0 pruned nodes, max_depth=10 [77]#011train-rmse:8088.24#011validation-rmse:7904.87 [01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 416 extra nodes, 0 pruned nodes, max_depth=10 [78]#011train-rmse:8065.81#011validation-rmse:7880.81 [01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1030 extra nodes, 0 pruned nodes, max_depth=10 [79]#011train-rmse:7985.32#011validation-rmse:7795.9 [01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 414 extra nodes, 0 pruned nodes, max_depth=10 [80]#011train-rmse:7959.09#011validation-rmse:7768.53 [01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 506 extra nodes, 0 pruned nodes, max_depth=10 [81]#011train-rmse:7899.51#011validation-rmse:7717.77 [01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 996 extra nodes, 0 pruned nodes, max_depth=10 [82]#011train-rmse:7842.48#011validation-rmse:7644.3 [01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 394 extra nodes, 0 pruned nodes, max_depth=10 [83]#011train-rmse:7812.29#011validation-rmse:7616.29 [01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 582 extra nodes, 0 pruned nodes, max_depth=10 [84]#011train-rmse:7775.24#011validation-rmse:7581.92 [01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 930 extra nodes, 0 pruned nodes, max_depth=10 [85]#011train-rmse:7732.53#011validation-rmse:7530.5 [01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 368 extra nodes, 0 pruned nodes, max_depth=10 [86]#011train-rmse:7707.12#011validation-rmse:7496.32 [01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 454 extra nodes, 0 pruned nodes, max_depth=10 [87]#011train-rmse:7684.77#011validation-rmse:7471.27 [01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 588 extra nodes, 0 pruned nodes, max_depth=10 [88]#011train-rmse:7642.79#011validation-rmse:7430.39 [01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 154 extra nodes, 0 pruned nodes, max_depth=10 [89]#011train-rmse:7614.04#011validation-rmse:7400.1 [01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 632 extra nodes, 0 pruned nodes, max_depth=10 [90]#011train-rmse:7537.41#011validation-rmse:7326.29 [01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 636 extra nodes, 0 pruned nodes, max_depth=10 
[91]#011train-rmse:7516.38#011validation-rmse:7299.69 [01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 686 extra nodes, 0 pruned nodes, max_depth=10 [92]#011train-rmse:7456.1#011validation-rmse:7259.76 [01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 584 extra nodes, 0 pruned nodes, max_depth=10 [93]#011train-rmse:7435.81#011validation-rmse:7240.72 [01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 570 extra nodes, 0 pruned nodes, max_depth=10 [94]#011train-rmse:7409.44#011validation-rmse:7211.32 [01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 578 extra nodes, 0 pruned nodes, max_depth=10 [95]#011train-rmse:7375.82#011validation-rmse:7178.67 [01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 362 extra nodes, 0 pruned nodes, max_depth=10 [96]#011train-rmse:7354.69#011validation-rmse:7158.87 [01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 330 extra nodes, 0 pruned nodes, max_depth=10 [97]#011train-rmse:7340.66#011validation-rmse:7144.72 [01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 706 extra nodes, 0 pruned nodes, max_depth=10 [98]#011train-rmse:7301.74#011validation-rmse:7106.24 [01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 760 extra nodes, 0 pruned nodes, max_depth=10 [99]#011train-rmse:7278.08#011validation-rmse:7081.5

2021-11-04 01:29:54 Uploading - Uploading generated training model 2021-11-04 01:29:54 Completed - Training job completed Training seconds: 75 Billable seconds: 34 Managed Spot Training savings: 54.7%

Deploy Model

# Deploy the model to perform inference

Xgboost_regressor = Xgboost_regressor.deploy(initial_instance_count = 1, instance_type = 'ml.m5.2xlarge')

-----!

Set Serializer

# Content type overrides the format of the data that will be passed to the deployed model. Since the
# deployed model expects data in text/csv format, we specify this as the content type.

# The serializer accepts a single argument, the input data, and returns a sequence of bytes in the
# specified content type

# Reference: https://sagemaker.readthedocs.io/en/stable/predictors.html

from sagemaker.serializers import CSVSerializer

Xgboost_regressor.serializer = CSVSerializer()
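For illustration, a minimal sketch of what the serializer produces (the example values are made up; the behavior follows from the CSV content type):

# CSVSerializer turns arrays/lists into comma-separated text, one row per line:
# CSVSerializer().serialize([[1, 2], [3, 4]])  ->  '1,2\n3,4'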

Shape Testing Data

Split testing data for metrics later​

testing_data
(output: testing_data — 8432 rows × 139 columns, same column layout as df above, with Weekly_Sales still in the first column)

# split testing_data into features (X_test) and target (y_test)

y_test = testing_data['Weekly_Sales']
X_test = testing_data.drop(columns = ["Weekly_Sales"])
y_test.shape

(8432,)

X_test.shape

(8432, 138)

Remove Target for Predictions Function​

The .predict() method needs the testing data without the y value (target). We already created X_test above without the target, so the pop() shown here is left commented out:

# testing_data.pop('Weekly_Sales')
# testing_data
(output: the feature matrix without Weekly_Sales — 8432 rows × 138 columns)

Convert Testing Data to float32​

The testing data must be converted to float32 format before it is sent to the endpoint

testing_data_float32 = np.array(X_test).astype('float32')
testing_data_float32.shape

(8432, 138)

Make Predictions

# custom code to convert the prediction returned as bytes into a numpy array
def bytes_2_array(x):

    # cast the entire prediction to a string and split it on ','
    l = str(x).split(',')

    # the first element contains unwanted characters (b') so we remove them
    l[0] = l[0][2:]
    # same thing as above: remove the unwanted last character (')
    l[-1] = l[-1][:-1]

    # iterate through the list of strings, converting each to a float
    for i in range(len(l)):
        l[i] = float(l[i])

    # convert the list into a numpy array
    l = np.array(l).astype('float32')

    # reshape the one-dimensional array to a two-dimensional array
    return l.reshape(-1,1)
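As an aside, the SageMaker SDK also ships a CSV deserializer that can replace this manual parsing; a sketch (not used in this tutorial):

# from sagemaker.deserializers import CSVDeserializer
# Xgboost_regressor.deserializer = CSVDeserializer()
# predict() would then return parsed values instead of raw bytes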

Attempting to send a very large dataset in a single request can crash the endpoint, so for bigger datasets we would predict in batches of roughly 10,000 rows at a time. Our 8,432-row test set fits in a single request (see the batching sketch after the prediction below).

# make predictions against the deployed endpoint

predictions_bytes = Xgboost_regressor.predict(testing_data_float32)
# convert bytes to array
predicted_values = bytes_2_array(predictions_bytes)
predicted_values.shape

(8432, 1)
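For a test set too large for one request, a minimal batching sketch (the 10,000-row batch size is the figure mentioned above, not a verified endpoint limit):

# split the feature matrix into chunks of at most ~10,000 rows and predict each
n_batches = max(1, int(np.ceil(len(testing_data_float32) / 10000)))
batches = np.array_split(testing_data_float32, n_batches)
predicted_values = np.concatenate([bytes_2_array(Xgboost_regressor.predict(b)) for b in batches])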

Calculate Accuracy

Compute Metrics​

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
k = X_test.shape[1]
n = len(X_test)
RMSE = float(format(np.sqrt(mean_squared_error(y_test, predicted_values)),'.3f'))
MSE = mean_squared_error(y_test, predicted_values)
MAE = mean_absolute_error(y_test, predicted_values)
r2 = r2_score(y_test, predicted_values)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)

RMSE = 7377.642
MSE = 54429599.67199714
MAE = 4311.153492210574
R2 = 0.903990280483614
Adjusted R2 = 0.9023926268850053
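For reference, adjusted R² corrects R² for the number of predictors: adjusted R² = 1 − (1 − R²)(n − 1)/(n − k − 1), with n = 8,432 test rows and k = 138 features here, which is why it lands slightly below the raw R².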
