
AWS SageMaker Tutorial: Part 7

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile

# import the csv files using pandas
features = pd.read_csv('Features_data_set.csv')
sales = pd.read_csv('sales_data_set.csv')
stores = pd.read_csv('stores_data_set.csv')

Shape DataFrame

# function to return the month from a datetime value
def get_month(datetime):
    return int(str(datetime).split('-')[1])

# Change the datatype of the 'Date' column to datetime
features['Date'] = pd.to_datetime(features['Date'])
sales['Date'] = pd.to_datetime(sales['Date'])

# merge the three separate csv files
df = pd.merge(sales, features, on = ['Store', 'Date', 'IsHoliday'])
df = pd.merge(df, stores, on = ['Store'], how = 'left')

# retrieve the month from the Date column and add it to its own "Month" column
df['Month'] = df['Date'].apply(get_month)
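
As an aside, since 'Date' is already a datetime column, pandas can extract the month without any string parsing; a one-line equivalent using the .dt accessor:

# equivalent, vectorized month extraction via the datetime accessor
df['Month'] = df['Date'].dt.month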

# Fill up NaN elements with zeros
df = df.fillna(0)

# Convert the IsHoliday boolean to integers (0 = not a holiday week, 1 = holiday week)
df['IsHoliday'] = df['IsHoliday'].astype(int)

## Move the target "Weekly_Sales" into the first column
# remove the column into its own object
first_column = df.pop('Weekly_Sales')

# insert the column using the insert(position, column_name, column) function
df.insert(0, 'Weekly_Sales', first_column)

# drop the Date column
# We no longer need it, as we have isolated the month into its own column
df = df.drop(columns = ['Date'])

# get dummies
df = pd.get_dummies(df, columns = ['Type', 'Store', 'Dept'], drop_first = True)
df.sample(n=20)
(Output: a random sample of 20 rows of the encoded DataFrame. Columns include Weekly_Sales, IsHoliday, Temperature, Fuel_Price, MarkDown1-5, CPI, and the one-hot encoded Type, Store, and Dept indicator columns.)

20 rows × 139 columns
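
To see what the drop_first option above does, here is a toy sketch (hypothetical data, not part of the tutorial): each categorical column is expanded into indicator columns, and the first category is dropped because it is implied when all the other indicators are zero.

toy = pd.DataFrame({'Type': ['A', 'B', 'C', 'A']})
# produces Type_B and Type_C only; 'A' is the implied baseline
print(pd.get_dummies(toy, columns = ['Type'], drop_first = True))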

The DataFrame is looking good at this point

df.shape

(421570, 139)

Shape Training and Validation Data

Split Data

# splitting the data into train and test sets
from sklearn.model_selection import train_test_split
training_data, testing_data = train_test_split(df, test_size = 0.02)
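
Note that train_test_split shuffles randomly, so each run produces a different split. If you want a reproducible split, you can pass a fixed seed (the value 42 below is just an arbitrary example):

# reproducible variant of the same split
training_data, testing_data = train_test_split(df, test_size = 0.02, random_state = 42)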

Generate CSV Files

# save train_data and validation_data as csv files.
# SageMaker's built-in XGBoost expects CSV input with no header row
# and the target in the first column, hence header = False and the
# reordering we did earlier
training_data.to_csv('training.csv', header = False, index = False)
testing_data.to_csv('validation.csv', header = False, index = False)

Configure SageMaker

# Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python
# Boto3 allows Python developers to write software that makes use of services like Amazon S3 and Amazon EC2

import sagemaker
import boto3

# create a sagemaker session
sagemaker_session = sagemaker.Session()

# specify bucket and folder
bucket = 'tutorial-sagemaker-sales-xgboost'
prefix = 'XGBoost-Regressor'
key = 'XGBoost-Regressor'

# Roles give learning and hosting access to the data
# This is specified when opening the SageMaker instance in "Create an IAM role"
role = sagemaker.get_execution_role()

print(bucket)
print(prefix)
print(key)
print(role)

tutorial-sagemaker-sales-xgboost
XGBoost-Regressor
XGBoost-Regressor
arn:aws:iam::483449698840:role/service-role/AmazonSageMaker-ExecutionRole-20211018T115875

Training Data Location

# read the data from the csv file and then upload it to the s3 bucket
import os
with open('training.csv', 'rb') as file:
    # The following code uploads the data into the S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(file)

# Let's print out the training data location in s3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/train/XGBoost-Regressor
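
As a side note, the SageMaker session we created earlier can do the same upload in a single call; a minimal sketch, assuming the same bucket and prefix (upload_data keeps the local file name as the object key, so the resulting URI differs slightly from the one above):

# uploads training.csv and returns its S3 URI
s3_train_data = sagemaker_session.upload_data(path = 'training.csv',
                                              bucket = bucket,
                                              key_prefix = '{}/train'.format(prefix))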

Validation Data Location

# read the data from the csv file and then upload it to the s3 bucket

with open('validation.csv', 'rb') as file:
    # The following code uploads the data into the S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation', key)).upload_fileobj(file)

# Let's print out the validation data location in s3
s3_validation_data = 's3://{}/{}/validation/{}'.format(bucket, prefix, key)
print('uploaded validation data location: {}'.format(s3_validation_data))

uploaded validation data location: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/validation/XGBoost-Regressor

Output Placeholder

# create an output location in the S3 bucket to store the training output

output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

training artifacts will be uploaded to: s3://tutorial-sagemaker-sales-xgboost/XGBoost-Regressor/output

Algorithm Container

# This code is used to get the training container of sagemaker built-in algorithms
# all we have to do is specify the name of the algorithm that we want to use

region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve('xgboost', region, version='latest')
container

'811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'
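
A quick caveat: 'latest' is a legacy tag that resolves to an older XGBoost container; with current SDK versions you would normally pin an explicit release instead (the '1.7-1' tag below is one example of a valid version string):

# same retrieval with an explicitly pinned container version
container = sagemaker.image_uris.retrieve('xgboost', region, version = '1.7-1')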

Container and Algorithm Parameters

# Specify the type of instance that we would like to use for training,
# the output path and the sagemaker session for the Estimator.
# We can also specify how many instances we would like to use for training

# Recall that XGBoost works by combining an ensemble of weak models to generate accurate/robust results.
# The weak models are randomized to avoid overfitting

# num_round: the number of rounds to run the training.

# alpha: L1 regularization term on weights. Increasing this value makes models more conservative.

# colsample_bytree: fraction of features that will be used to train each tree.

# eta: step size shrinkage used in updates to prevent overfitting.
# After each boosting step, the eta parameter shrinks the feature weights to make the boosting process more conservative.


Xgboost_regressor = sagemaker.estimator.Estimator(container,
                                                  role,
                                                  instance_count = 1,
                                                  instance_type = 'ml.m5.2xlarge',
                                                  output_path = output_location,
                                                  sagemaker_session = sagemaker_session,
                                                  # reduce cost with spot instances
                                                  use_spot_instances = True,
                                                  max_run = 300,
                                                  max_wait = 600
                                                  )

# We can tune the hyper-parameters to improve the performance of the model

Xgboost_regressor.set_hyperparameters(max_depth = 10,
                                      objective = 'reg:linear',
                                      colsample_bytree = 0.3,
                                      alpha = 10,
                                      eta = 0.1,
                                      num_round = 100
                                      )
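
One thing to be aware of: 'reg:linear' is the legacy name for the squared-error regression objective, and it works on the 'latest' container used here; on XGBoost 1.x+ containers it is deprecated in favor of 'reg:squarederror', so the equivalent call there would be:

# same hyper-parameters with the renamed objective for newer containers
Xgboost_regressor.set_hyperparameters(max_depth = 10,
                                      objective = 'reg:squarederror',
                                      colsample_bytree = 0.3,
                                      alpha = 10,
                                      eta = 0.1,
                                      num_round = 100
                                      )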

Data Channels

# Creating "train", "validation" channels to feed in the model
# Source: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html

train_input = sagemaker.inputs.TrainingInput(s3_data = s3_train_data, content_type='csv',s3_data_type = 'S3Prefix')
valid_input = sagemaker.inputs.TrainingInput(s3_data = s3_validation_data, content_type='csv',s3_data_type = 'S3Prefix')


data_channels = {'train': train_input,'validation': valid_input}

Train Model

Xgboost_regressor.fit(data_channels)

2021-11-04 01:25:49 Starting - Starting the training job... 2021-11-04 01:26:13 Starting - Launching requested ML instancesProfilerReport-1635989149: InProgress ...... 2021-11-04 01:27:14 Starting - Preparing the instances for training......... 2021-11-04 01:28:46 Downloading - Downloading input data 2021-11-04 01:28:46 Training - Downloading the training image..Arguments: train [2021-11-04:01:29:02:INFO] Running standalone xgboost training. [2021-11-04:01:29:02:INFO] File size need to be processed in the node: 134.75mb. Available memory size in the node: 23771.49mb [2021-11-04:01:29:02:INFO] Determined delimiter of CSV input is ',' [01:29:02] S3DistributionType set as FullyReplicated [01:29:03] 413138x138 matrix with 57013044 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=, [2021-11-04:01:29:03:INFO] Determined delimiter of CSV input is ',' [01:29:03] S3DistributionType set as FullyReplicated [01:29:03] 8432x138 matrix with 1163616 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=, [01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 280 extra nodes, 0 pruned nodes, max_depth=10 [0]#011train-rmse:26438.7#011validation-rmse:26719.4 [01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 0 pruned nodes, max_depth=10 [1]#011train-rmse:25132#011validation-rmse:25360.7 [01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 134 extra nodes, 0 pruned nodes, max_depth=10 [2]#011train-rmse:24277.6#011validation-rmse:24491.7 [01:29:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 516 extra nodes, 0 pruned nodes, max_depth=10 [3]#011train-rmse:23507.3#011validation-rmse:23752.3 [01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 376 extra nodes, 0 pruned nodes, max_depth=10 [4]#011train-rmse:22410.4#011validation-rmse:22620.7 [01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10 [5]#011train-rmse:21626.1#011validation-rmse:21807.8 [01:29:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 280 extra nodes, 0 pruned nodes, max_depth=10 [6]#011train-rmse:21162.7#011validation-rmse:21349.2 [01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 582 extra nodes, 0 pruned nodes, max_depth=10 [7]#011train-rmse:20449.9#011validation-rmse:20609.4

2021-11-04 01:29:14 Training - Training image download completed. Training in progress.[01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 0 pruned nodes, max_depth=10 [8]#011train-rmse:20010.4#011validation-rmse:20186.7 [01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 512 extra nodes, 0 pruned nodes, max_depth=10 [9]#011train-rmse:19158.7#011validation-rmse:19302.7 [01:29:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 264 extra nodes, 0 pruned nodes, max_depth=10 [10]#011train-rmse:18656.4#011validation-rmse:18803.7 [01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 568 extra nodes, 0 pruned nodes, max_depth=10 [11]#011train-rmse:18227.7#011validation-rmse:18348.3 [01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 756 extra nodes, 0 pruned nodes, max_depth=10 [12]#011train-rmse:17616.9#011validation-rmse:17700.8 [01:29:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 360 extra nodes, 0 pruned nodes, max_depth=10 [13]#011train-rmse:17309.5#011validation-rmse:17375.6 [01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10 [14]#011train-rmse:16843.7#011validation-rmse:16882 [01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 368 extra nodes, 0 pruned nodes, max_depth=10 [15]#011train-rmse:16445#011validation-rmse:16438.4 [01:29:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 322 extra nodes, 0 pruned nodes, max_depth=10 [16]#011train-rmse:16258.4#011validation-rmse:16249.4 [01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 362 extra nodes, 0 pruned nodes, max_depth=10 [17]#011train-rmse:15938.5#011validation-rmse:15885.8 [01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 270 extra nodes, 0 pruned nodes, max_depth=10 [18]#011train-rmse:15584.9#011validation-rmse:15513.5 [01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 534 extra nodes, 0 pruned nodes, max_depth=10 [19]#011train-rmse:15283.6#011validation-rmse:15172.6 [01:29:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 424 extra nodes, 0 pruned nodes, max_depth=10 [20]#011train-rmse:15006.6#011validation-rmse:14858.5 [01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 778 extra nodes, 0 pruned nodes, max_depth=10 [21]#011train-rmse:14646.6#011validation-rmse:14502.8 [01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 282 extra nodes, 0 pruned nodes, max_depth=10 [22]#011train-rmse:14464.6#011validation-rmse:14300.9 [01:29:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 256 extra nodes, 0 pruned nodes, max_depth=10 [23]#011train-rmse:14199.5#011validation-rmse:14023.8 [01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 688 extra nodes, 0 pruned nodes, max_depth=10 [24]#011train-rmse:13979.7#011validation-rmse:13792.4 [01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 828 extra nodes, 0 pruned nodes, max_depth=10 [25]#011train-rmse:13745#011validation-rmse:13539.5 [01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 232 extra nodes, 0 pruned nodes, max_depth=10 [26]#011train-rmse:13451.5#011validation-rmse:13231.4 [01:29:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 518 extra nodes, 0 pruned nodes, max_depth=10 [27]#011train-rmse:13235#011validation-rmse:13032.9 [01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 316 extra nodes, 0 pruned nodes, max_depth=10 
[28]#011train-rmse:13109#011validation-rmse:12883.8 [01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 170 extra nodes, 0 pruned nodes, max_depth=10 [29]#011train-rmse:12992.8#011validation-rmse:12781.5 [01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 242 extra nodes, 0 pruned nodes, max_depth=10 [30]#011train-rmse:12886.4#011validation-rmse:12669.8 [01:29:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 790 extra nodes, 0 pruned nodes, max_depth=10 [31]#011train-rmse:12636.9#011validation-rmse:12410.4 [01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 328 extra nodes, 0 pruned nodes, max_depth=10 [32]#011train-rmse:12378.1#011validation-rmse:12136.2 [01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 306 extra nodes, 0 pruned nodes, max_depth=10 [33]#011train-rmse:12252.1#011validation-rmse:12005.4 [01:29:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 556 extra nodes, 0 pruned nodes, max_depth=10 [34]#011train-rmse:12085#011validation-rmse:11850.3 [01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 922 extra nodes, 0 pruned nodes, max_depth=10 [35]#011train-rmse:11943.4#011validation-rmse:11701 [01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 942 extra nodes, 0 pruned nodes, max_depth=10 [36]#011train-rmse:11628.8#011validation-rmse:11417.1 [01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 998 extra nodes, 0 pruned nodes, max_depth=10 [37]#011train-rmse:11498.4#011validation-rmse:11306.6 [01:29:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 510 extra nodes, 0 pruned nodes, max_depth=10 [38]#011train-rmse:11359.4#011validation-rmse:11172.8 [01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 630 extra nodes, 0 pruned nodes, max_depth=10 [39]#011train-rmse:11261.7#011validation-rmse:11084.4 [01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 464 extra nodes, 0 pruned nodes, max_depth=10 [40]#011train-rmse:11017.4#011validation-rmse:10820.3 [01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 284 extra nodes, 0 pruned nodes, max_depth=10 [41]#011train-rmse:10923.9#011validation-rmse:10712 [01:29:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 824 extra nodes, 0 pruned nodes, max_depth=10 [42]#011train-rmse:10716.8#011validation-rmse:10471.9 [01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 760 extra nodes, 0 pruned nodes, max_depth=10 [43]#011train-rmse:10506#011validation-rmse:10261.4 [01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 342 extra nodes, 0 pruned nodes, max_depth=10 [44]#011train-rmse:10394.5#011validation-rmse:10153.8 [01:29:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 438 extra nodes, 0 pruned nodes, max_depth=10 [45]#011train-rmse:10335.6#011validation-rmse:10093.1 [01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 438 extra nodes, 0 pruned nodes, max_depth=10 [46]#011train-rmse:10264.1#011validation-rmse:10010 [01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10 [47]#011train-rmse:10177#011validation-rmse:9920.74 [01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1074 extra nodes, 0 pruned nodes, max_depth=10 [48]#011train-rmse:10028.8#011validation-rmse:9776.71 [01:29:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 452 extra nodes, 0 pruned nodes, max_depth=10 
[49]#011train-rmse:9959.12#011validation-rmse:9699.38 [01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 758 extra nodes, 0 pruned nodes, max_depth=10 [50]#011train-rmse:9819.62#011validation-rmse:9583.81 [01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 654 extra nodes, 0 pruned nodes, max_depth=10 [51]#011train-rmse:9756.69#011validation-rmse:9518.23 [01:29:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 860 extra nodes, 0 pruned nodes, max_depth=10 [52]#011train-rmse:9654.91#011validation-rmse:9430.58 [01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 820 extra nodes, 0 pruned nodes, max_depth=10 [53]#011train-rmse:9501.04#011validation-rmse:9297.99 [01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 290 extra nodes, 0 pruned nodes, max_depth=10 [54]#011train-rmse:9400.23#011validation-rmse:9201.69 [01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 536 extra nodes, 0 pruned nodes, max_depth=10 [55]#011train-rmse:9344.84#011validation-rmse:9149.57 [01:29:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 380 extra nodes, 0 pruned nodes, max_depth=10 [56]#011train-rmse:9267.61#011validation-rmse:9069.42 [01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 380 extra nodes, 0 pruned nodes, max_depth=10 [57]#011train-rmse:9209.92#011validation-rmse:9007.38 [01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 988 extra nodes, 0 pruned nodes, max_depth=10 [58]#011train-rmse:9095.4#011validation-rmse:8891.88 [01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=10 [59]#011train-rmse:9018.6#011validation-rmse:8827.79 [01:29:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 340 extra nodes, 0 pruned nodes, max_depth=10 [60]#011train-rmse:8969.99#011validation-rmse:8779.02 [01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 426 extra nodes, 0 pruned nodes, max_depth=10 [61]#011train-rmse:8913.37#011validation-rmse:8722.46 [01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1056 extra nodes, 0 pruned nodes, max_depth=10 [62]#011train-rmse:8856.65#011validation-rmse:8664.98 [01:29:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 430 extra nodes, 0 pruned nodes, max_depth=10 [63]#011train-rmse:8806.69#011validation-rmse:8622.59 [01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 762 extra nodes, 0 pruned nodes, max_depth=10 [64]#011train-rmse:8758.35#011validation-rmse:8575.46 [01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 612 extra nodes, 0 pruned nodes, max_depth=10 [65]#011train-rmse:8698.18#011validation-rmse:8506.64 [01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 578 extra nodes, 0 pruned nodes, max_depth=10 [66]#011train-rmse:8668.53#011validation-rmse:8477.54 [01:29:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 832 extra nodes, 0 pruned nodes, max_depth=10 [67]#011train-rmse:8636.11#011validation-rmse:8444.05 [01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 322 extra nodes, 0 pruned nodes, max_depth=10 [68]#011train-rmse:8612.05#011validation-rmse:8419.51 [01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 504 extra nodes, 0 pruned nodes, max_depth=10 [69]#011train-rmse:8527.51#011validation-rmse:8335.6 [01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 814 extra nodes, 0 pruned nodes, max_depth=10 
[70]#011train-rmse:8477.03#011validation-rmse:8287.09 [01:29:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 486 extra nodes, 0 pruned nodes, max_depth=10 [71]#011train-rmse:8429.81#011validation-rmse:8248.77 [01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 344 extra nodes, 0 pruned nodes, max_depth=10 [72]#011train-rmse:8406.38#011validation-rmse:8225.97 [01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 846 extra nodes, 0 pruned nodes, max_depth=10 [73]#011train-rmse:8350.88#011validation-rmse:8165.68 [01:29:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 962 extra nodes, 0 pruned nodes, max_depth=10 [74]#011train-rmse:8265.71#011validation-rmse:8097.3 [01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 978 extra nodes, 0 pruned nodes, max_depth=10 [75]#011train-rmse:8189.77#011validation-rmse:8013.82 [01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 294 extra nodes, 0 pruned nodes, max_depth=10 [76]#011train-rmse:8160.4#011validation-rmse:7984.17 [01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 698 extra nodes, 0 pruned nodes, max_depth=10 [77]#011train-rmse:8088.24#011validation-rmse:7904.87 [01:29:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 416 extra nodes, 0 pruned nodes, max_depth=10 [78]#011train-rmse:8065.81#011validation-rmse:7880.81 [01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1030 extra nodes, 0 pruned nodes, max_depth=10 [79]#011train-rmse:7985.32#011validation-rmse:7795.9 [01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 414 extra nodes, 0 pruned nodes, max_depth=10 [80]#011train-rmse:7959.09#011validation-rmse:7768.53 [01:29:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 506 extra nodes, 0 pruned nodes, max_depth=10 [81]#011train-rmse:7899.51#011validation-rmse:7717.77 [01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 996 extra nodes, 0 pruned nodes, max_depth=10 [82]#011train-rmse:7842.48#011validation-rmse:7644.3 [01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 394 extra nodes, 0 pruned nodes, max_depth=10 [83]#011train-rmse:7812.29#011validation-rmse:7616.29 [01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 582 extra nodes, 0 pruned nodes, max_depth=10 [84]#011train-rmse:7775.24#011validation-rmse:7581.92 [01:29:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 930 extra nodes, 0 pruned nodes, max_depth=10 [85]#011train-rmse:7732.53#011validation-rmse:7530.5 [01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 368 extra nodes, 0 pruned nodes, max_depth=10 [86]#011train-rmse:7707.12#011validation-rmse:7496.32 [01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 454 extra nodes, 0 pruned nodes, max_depth=10 [87]#011train-rmse:7684.77#011validation-rmse:7471.27 [01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 588 extra nodes, 0 pruned nodes, max_depth=10 [88]#011train-rmse:7642.79#011validation-rmse:7430.39 [01:29:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 154 extra nodes, 0 pruned nodes, max_depth=10 [89]#011train-rmse:7614.04#011validation-rmse:7400.1 [01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 632 extra nodes, 0 pruned nodes, max_depth=10 [90]#011train-rmse:7537.41#011validation-rmse:7326.29 [01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 636 extra nodes, 0 pruned nodes, max_depth=10 
[91]#011train-rmse:7516.38#011validation-rmse:7299.69 [01:29:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 686 extra nodes, 0 pruned nodes, max_depth=10 [92]#011train-rmse:7456.1#011validation-rmse:7259.76 [01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 584 extra nodes, 0 pruned nodes, max_depth=10 [93]#011train-rmse:7435.81#011validation-rmse:7240.72 [01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 570 extra nodes, 0 pruned nodes, max_depth=10 [94]#011train-rmse:7409.44#011validation-rmse:7211.32 [01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 578 extra nodes, 0 pruned nodes, max_depth=10 [95]#011train-rmse:7375.82#011validation-rmse:7178.67 [01:29:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 362 extra nodes, 0 pruned nodes, max_depth=10 [96]#011train-rmse:7354.69#011validation-rmse:7158.87 [01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 330 extra nodes, 0 pruned nodes, max_depth=10 [97]#011train-rmse:7340.66#011validation-rmse:7144.72 [01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 706 extra nodes, 0 pruned nodes, max_depth=10 [98]#011train-rmse:7301.74#011validation-rmse:7106.24 [01:29:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 760 extra nodes, 0 pruned nodes, max_depth=10 [99]#011train-rmse:7278.08#011validation-rmse:7081.5

2021-11-04 01:29:54 Uploading - Uploading generated training model 2021-11-04 01:29:54 Completed - Training job completed Training seconds: 75 Billable seconds: 34 Managed Spot Training savings: 54.7%

Deploy Model

# Deploy the model to perform inference
# deploy() returns a Predictor object that we can use to query the endpoint

Xgboost_regressor = Xgboost_regressor.deploy(initial_instance_count = 1, instance_type = 'ml.m5.2xlarge')

-----!

Set Serializer

# The content type tells the endpoint the format of the data that will be passed to the deployed model;
# since the deployed model expects data in text/csv format, we specify this as the content type.

# The serializer accepts a single argument, the input data, and returns a sequence of bytes in the
# specified content type

# Reference: https://sagemaker.readthedocs.io/en/stable/predictors.html

from sagemaker.serializers import CSVSerializer

Xgboost_regressor.serializer = CSVSerializer()

Shape Testing Data

Split testing data for metrics later

testing_data
(Output: the testing_data DataFrame, with the same Weekly_Sales, IsHoliday, Temperature, Fuel_Price, MarkDown1-5, CPI and one-hot indicator columns as above.)

8432 rows × 139 columns

# split testing_data into features (X_test) and target (y_test)

y_test = testing_data['Weekly_Sales']
X_test = testing_data.drop(columns = ["Weekly_Sales"])
y_test.shape

(8432,)

X_test.shape

(8432, 138)

Remove Target for Predictions Function

The .predict() method needs the testing data without the y value (target)

# testing_data.pop('Weekly_Sales')
# testing_data
(Output: the feature matrix without the Weekly_Sales target column.)

8432 rows × 138 columns

Convert Testing Data to float32

The testing data must be converted to float32 format before being sent to the endpoint

testing_data_float32 = np.array(X_test).astype('float32')
testing_data_float32.shape

(8432, 138)

Make Predictions

# custom code to convert the returned bytes into a numpy array
def bytes_2_array(x):

    # cast the entire prediction to a string and split it on ','
    l = str(x).split(',')

    # the first element contains unwanted characters (b and ') so we remove them
    l[0] = l[0][2:]
    # same thing as above: remove the unwanted last character (')
    l[-1] = l[-1][:-1]

    # iterate through the list of strings, converting them to floats
    for i in range(len(l)):
        l[i] = float(l[i])

    # convert the list into an array
    l = np.array(l).astype('float32')

    # reshape the one-dimensional array into a two-dimensional array
    return l.reshape(-1, 1)
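
Since the response is just a comma-separated byte string, a more compact equivalent (a sketch, assuming a UTF-8 encoded CSV response body) is to decode it and let numpy do the casting:

def bytes_2_array(x):
    # decode bytes to str, split on commas, cast to a float32 column vector
    return np.array(x.decode('utf-8').split(','), dtype = 'float32').reshape(-1, 1)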

Sending a very large dataset in a single request can crash the endpoint, since request payloads are limited, so for bigger datasets you would predict in batches of, say, 10,000 rows at a time; our 8,432-row test set fits in one call

# making prediction

predictions_bytes = Xgboost_regressor.predict(testing_data_float32)
# convert bytes to array
predicted_values = bytes_2_array(predictions_bytes)
predicted_values.shape

(8432, 1)
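
For reference, a hypothetical batching loop for a larger test set might look like this (batch_size is an arbitrary illustration value):

batch_size = 10000
batches = []
for start in range(0, len(testing_data_float32), batch_size):
    chunk = testing_data_float32[start:start + batch_size]
    batches.append(bytes_2_array(Xgboost_regressor.predict(chunk)))
predicted_values = np.vstack(batches)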

Calculate Accuracy

Compute Metrics

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

k = X_test.shape[1]   # number of features
n = len(X_test)       # number of samples
MSE = mean_squared_error(y_test, predicted_values)
RMSE = round(np.sqrt(MSE), 3)
MAE = mean_absolute_error(y_test, predicted_values)
r2 = r2_score(y_test, predicted_values)
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

print('RMSE =', RMSE, '\nMSE =', MSE, '\nMAE =', MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)

RMSE = 7377.642
MSE = 54429599.67199714
MAE = 4311.153492210574
R2 = 0.903990280483614
Adjusted R2 = 0.9023926268850053
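
For context, adjusted R² corrects R² for the number of predictors: adj_R² = 1 − (1 − R²)(n − 1)/(n − k − 1), where n is the number of samples and k the number of features, so it only improves when a new feature adds more explanatory power than chance would. An R² above 0.90 on the held-out data means the model explains roughly 90% of the variance in weekly sales.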
