import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import datetime as dt
import warnings
warnings.filterwarnings("ignore")
The dataset contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retailer. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.
# loading the dataset
df_ecommerce = pd.read_csv("data.csv", encoding= 'unicode_escape')
# first 5 rows of the dataset
df_ecommerce.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
# dimensions of the data
df_ecommerce.shape
(541909, 8)
There are 541909 rows and 8 columns in our data.
# information about the dataset
df_ecommerce.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   InvoiceNo    541909 non-null  object
 1   StockCode    541909 non-null  object
 2   Description  540455 non-null  object
 3   Quantity     541909 non-null  int64
 4   InvoiceDate  541909 non-null  object
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB
# columns in the dataset
df_ecommerce.columns
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
'UnitPrice', 'CustomerID', 'Country'],
dtype='object')
Since the "Quantity" column signifies quantity of orders, it is safe to assume that the quantity can never be negative. We shall now look into different values of the "Quantity" column
df_ecommerce[(df_ecommerce.Quantity <0)].count()['Quantity']
10624
df_ecommerce['Quantity'].unique()
array([ 6, 8, 2, 32, 3, 4, 24, 12,
48, 18, 20, 36, 80, 64, 10, 120,
96, 23, 5, 1, -1, 50, 40, 100,
192, 432, 144, 288, -12, -24, 16, 9,
128, 25, 30, 28, 7, 56, 72, 200,
600, 480, -6, 14, -2, 11, 33, 13,
-4, -5, -7, -3, 70, 252, 60, 216,
384, -10, 27, 15, 22, 19, 17, 21,
34, 47, 108, 52, -9360, -38, 75, 270,
42, 240, 90, 320, 1824, 204, 69, -36,
-192, -144, 160, 2880, 1400, 39, -48, -50,
26, 1440, 31, 82, 78, 97, 98, 35,
57, -20, 110, -22, -30, -70, -130, -80,
-120, -40, -25, -14, -15, -69, -140, -320,
-8, 720, 156, 324, 38, 37, 49, 95,
-9, -11, 29, 41, -72, -35, -21, -43,
-19, -18, -44, 402, 378, 150, 300, 54,
104, 67, 258, 66, 44, 55, 46, 99,
61, 408, 972, 208, 1008, 1000, -77, 1488,
250, 1394, 400, -223, -150, -13, -33, -723,
-177, 79, 84, -32, -100, -28, 272, -145,
-47, -96, 113, 45, 106, 68, 267, 115,
65, 1728, -60, -16, 53, -240, 76, 460,
71, 43, 213, 58, 576, 2400, 500, 180,
-300, -500, -23, 752, 960, 1296, 210, 172,
215, 129, 138, 116, 135, 197, -106, -54,
-17, -939, 147, 168, 256, -201, -53, -29,
-2600, -990, -290, -45, 860, 1010, 1356, 1284,
186, 114, 360, 1930, 2000, 3114, 1300, 670,
111, 211, 59, -310, -61, -41, 176, 648,
62, 74215, -74215, -64, -84, 89, -1400, 73,
-57, 112, 456, -59, -31, 5568, 2560, 136,
900, -600, -42, -94, -207, -52, 130, -206,
2592, 420, 800, 101, 1200, 864, -217, 94,
-1430, 1287, -162, -230, -173, -390, -234, 504,
123, 118, -76, -200, 1056, 1500, 280, 407,
141, 124, -99, 51, -92, -741, 3906, -400,
-114, 102, 1152, -88, 198, 117, 86, -720,
125, -86, -391, -87, -278, 140, 228, -154,
-3000, 81, -675, -210, -345, -975, -1200, -1121,
-27, -541, -1277, -197, 3000, -1100, -63, -5368,
219, -259, 232, -82, -178, 170, -110, 768,
88, 700, -102, 2160, -323, -450, -232, -83,
-153, -524, -2472, -49, -105, 342, 304, 167,
-34, 640, 175, 220, 74, 93, 164, 163,
63, 637, 122, 158, 165, 350, -46, -75,
-690, -39, -66, 83, 312, -180, 392, -624,
-194, -26, -62, -37, -91, -139, -158, -1479,
-55, -576, -750, 330, 151, -93, -432, -58,
-1092, -670, -1300, -1930, -2000, -3114, 462, -65,
-68, 492, -620, 273, -90, -170, -1512, -51,
85, -56, -160, -360, 105, -960, -2376, 1350,
428, -1350, 336, -786, -590, -168, -101, -71,
132, -413, -664, 227, 2700, 222, 246, 906,
756, 888, 552, -97, 224, 738, 608, -212,
4300, 146, 143, 276, -125, -116, -108, 4000,
-304, -272, -1206, -95, 1600, 323, -161, -472,
-618, -204, 1515, -1515, -9058, -9600, 660, -420,
-126, -220, -271, -1440, 264, 188, 588, 612,
152, -324, -480, -164, -78, -118, 430, -1681,
87, 155, 701, 828, 540, 696, 560, -250,
-408, -179, 121, -124, 512, -251, -3100, 3100,
-169, 126, 291, 3186, -2834, -109, 109, -121,
-530, -227, 261, -346, 352, 142, 107, -188,
-1060, -342, -288, 348, 1900, 157, -343, -455,
425, 968, 684, 824, -828, -701, 196, 248,
410, 236, 230, -156, -553, 145, 448, 245,
-252, -334, -318, -113, -115, 171, -242, 840,
-967, -203, -3167, -443, -1897, 225, -434, 750,
-682, -484, 682, 344, -635, -117, -3667, 450,
310, 494, -384, 92, 1788, -138, 624, 744,
416, 496, 396, 306, 1878, 1944, 666, 708,
1428, 852, 1412, 528, -756, -752, -152, -85,
-312, -79, -147, -67, -131, 183, -209, -186,
-231, -129, -458, -275, -2880, 1540, 672, -800,
-430, -380, -74, -840, -1296, -365, -104, -270,
-73, -306, 91, 255, 468, -468, -111, -184,
-103, -335, 4800, -112, -1000, 912, 1992, 184,
148, -657, -1671, -1158, -2618, -2003, -674, -4830,
-905, -1128, 832, 992, 630, 1020, 2100, 162,
-1560, -1284, -81, 314, 370, 131, 133, 484,
149, 153, 257, 139, 137, 628, 179, 1820,
478, 335, 253, 242, 375, -276, -256, 281,
193, 181, 404, 244, 207, 199, 1130, 326,
654, 688, 268, 249, -864, 234, -1510, -550,
-244, -132, -327, -313, 279, -398, -280, 374,
-504, -696, -149, -224, -428, 212, -267, -175,
12540, 760, -98, 774, -151, 2040, -1131, -135,
1120, 77, -900, 177, 490, 388, 620, 1404,
-155, -355, -337, -1050, -338, 205, -235, 698,
80995, -80995], dtype=int64)
We notice above that the "Quantity" column has 10624 rows with values less than 0. We shall now drop these rows and retain only the records where "Quantity" is non-negative.
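In this dataset, negative quantities typically correspond to cancellations or returns; a common convention (an assumption here, not stated in the dataset description) is that cancelled invoices have an InvoiceNo starting with 'C'. A quick sketch to check that before dropping the rows:
# Sketch (assumption): cancelled invoices are prefixed with 'C', so most negative-quantity
# rows should carry such invoice numbers
negative_rows = df_ecommerce[df_ecommerce.Quantity < 0]
print(negative_rows['InvoiceNo'].astype(str).str.startswith('C').mean())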
df_ecommerce = df_ecommerce[(df_ecommerce.Quantity >=0)]
df_ecommerce
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 541904 | 581587 | 22613 | PACK OF 20 SPACEBOY NAPKINS | 12 | 12/9/2011 12:50 | 0.85 | 12680.0 | France |
| 541905 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6 | 12/9/2011 12:50 | 2.10 | 12680.0 | France |
| 541906 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4 | 12/9/2011 12:50 | 4.15 | 12680.0 | France |
| 541907 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4 | 12/9/2011 12:50 | 4.15 | 12680.0 | France |
| 541908 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3 | 12/9/2011 12:50 | 4.95 | 12680.0 | France |
531285 rows × 8 columns
df_ecommerce[(df_ecommerce.UnitPrice < 0)]
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 299983 | A563186 | B | Adjust bad debt | 1 | 8/12/2011 14:51 | -11062.06 | NaN | United Kingdom |
| 299984 | A563187 | B | Adjust bad debt | 1 | 8/12/2011 14:52 | -11062.06 | NaN | United Kingdom |
Similarly, we notice that the "UnitPrice" column has values less than 0 for 2 records. Since a price cannot be negative, we shall drop these records.
df_ecommerce.drop(df_ecommerce[df_ecommerce['UnitPrice']<0].index, inplace=True)
df_ecommerce.isnull().sum()
InvoiceNo           0
StockCode           0
Description       592
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     133359
Country             0
dtype: int64
We notice that the two columns "Description" and "CustomerID" have NULL values.
df_ecommerce = df_ecommerce[df_ecommerce['CustomerID'].notna()]
df_ecommerce
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 541904 | 581587 | 22613 | PACK OF 20 SPACEBOY NAPKINS | 12 | 12/9/2011 12:50 | 0.85 | 12680.0 | France |
| 541905 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6 | 12/9/2011 12:50 | 2.10 | 12680.0 | France |
| 541906 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4 | 12/9/2011 12:50 | 4.15 | 12680.0 | France |
| 541907 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4 | 12/9/2011 12:50 | 4.15 | 12680.0 | France |
| 541908 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3 | 12/9/2011 12:50 | 4.95 | 12680.0 | France |
397924 rows × 8 columns
df_ecommerce.isnull().sum()
InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64
We notice that just by dropping the rows where "CustomerID" is NULL, the NULL values in the "Description" column have also been taken care of.
df_ecommerce.duplicated().sum()
5192
There are 5192 duplicate rows in the dataset; we shall now remove them.
# removing duplicates
df_ecommerce.drop_duplicates(inplace=True)
df_ecommerce.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
df_ecommerce.shape
(392732, 8)
After data preprocessing, we have 392732 rows and 8 columns in our dataset.
We have columns for "Quantity" and "UnitPrice". We shall now create a new "Sales" column, which is simply the product of "Quantity" and "UnitPrice".
df_ecommerce.insert(8,"Sales", df_ecommerce['Quantity']*df_ecommerce['UnitPrice'])
We notice that the InvoiceDate is of the 'object' datatype. We shall convert it into a datetime object so that it is easier to perform further manipulations
# convert the "InvoiceDate" into a datetime object so that it is easier to perform further manipulations
df_ecommerce['InvoiceDate'] = pd.to_datetime(df_ecommerce['InvoiceDate'])
df_ecommerce.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 392732 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   InvoiceNo    392732 non-null  object
 1   StockCode    392732 non-null  object
 2   Description  392732 non-null  object
 3   Quantity     392732 non-null  int64
 4   InvoiceDate  392732 non-null  datetime64[ns]
 5   UnitPrice    392732 non-null  float64
 6   CustomerID   392732 non-null  float64
 7   Country      392732 non-null  object
 8   Sales        392732 non-null  float64
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 30.0+ MB
The "InvoiceDate" column is now of the datetime datatype. We shall now create columns for the date, year, month, week of the year, day of the week, and hour of the day.
# Creating a date column
df_ecommerce['Date'] = df_ecommerce['InvoiceDate'].dt.date
# Creating a year column
df_ecommerce['Year'] = df_ecommerce['InvoiceDate'].dt.year
# Creating a month column
df_ecommerce['Month'] = df_ecommerce['InvoiceDate'].dt.month
# Creating a week of the year column
df_ecommerce['WeekOfYear'] = df_ecommerce['InvoiceDate'].dt.isocalendar().week
# Creating a day of the week column
df_ecommerce['DayOfWeek'] = [d.day_name() for d in df_ecommerce['InvoiceDate']]
# Ordering the day of the week column
df_ecommerce['DayOfWeek'] = pd.Categorical(df_ecommerce['DayOfWeek'],
categories= ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday'],
ordered = True)
# Creating the time of the day column
df_ecommerce['Time'] = [d.time() for d in df_ecommerce['InvoiceDate']]
# Splitting the the time of the day column into different sessions in a day
df_ecommerce=df_ecommerce.assign(Session=pd.cut(df_ecommerce.InvoiceDate.dt.hour,
[0,6,12,18,23],
labels=['Night','Morning','Afternoon','Evening'],
include_lowest=True))
# Creating the hour of the day column
df_ecommerce['Hour'] = df_ecommerce.InvoiceDate.dt.hour
df_ecommerce.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Sales | Date | Year | Month | WeekOfYear | DayOfWeek | Time | Session | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 15.30 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 22.00 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
df_ecommerce.isnull().sum()
InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
Sales          0
Date           0
Year           0
Month          0
WeekOfYear     0
DayOfWeek      0
Time           0
Session        0
Hour           0
dtype: int64
df_ecommerce.columns
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
'UnitPrice', 'CustomerID', 'Country', 'Sales', 'Date', 'Year', 'Month',
'WeekOfYear', 'DayOfWeek', 'Time', 'Session', 'Hour'],
dtype='object')
df_ecommerce.shape
(392732, 17)
After the data manipulation and feature engineering, our data now has 392732 rows and 17 columns.
df_ecommerce.reset_index(inplace=True)
df_ecommerce.drop(axis=1, columns=['index'], inplace=True)
df_ecommerce.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Sales | Date | Year | Month | WeekOfYear | DayOfWeek | Time | Session | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 15.30 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 22.00 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
df_ecommerce.shape
(392732, 17)
After feature engineering, we have 392732 rows and 17 columns in our dataset.
# plotting the count of orders for each country
ax = df_ecommerce.groupby(['Country']).count()['InvoiceNo'].plot(kind = 'bar', figsize = (20,10),
title = 'Count of transactions for each country')
ax.set_xlabel('Country')
ax.set_ylabel('Count');
df_ecommerce['Country'].value_counts()
United Kingdom          349227
Germany                   9027
France                    8327
EIRE                      7228
Spain                     2480
Netherlands               2363
Belgium                   2031
Switzerland               1842
Portugal                  1453
Australia                 1184
Norway                    1072
Italy                      758
Channel Islands            747
Finland                    685
Cyprus                     603
Sweden                     450
Austria                    398
Denmark                    380
Poland                     330
Japan                      321
Israel                     245
Unspecified                241
Singapore                  222
Iceland                    182
USA                        179
Canada                     151
Greece                     145
Malta                      112
United Arab Emirates        68
European Community          60
RSA                         58
Lebanon                     45
Lithuania                   35
Brazil                      32
Czech Republic              25
Bahrain                     17
Saudi Arabia                 9
Name: Country, dtype: int64
From the above chart, we can see that most of the transactions (349227, or almost 89% of all transactions) come from customers in the United Kingdom.
The dataset description mentions that the data "is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers." So it makes sense that most of the customers are from the UK.
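As a quick sanity check of that share (a small sketch; the exact figure depends on the cleaning steps above):
# Sketch: share of transactions whose Country is the United Kingdom
uk_share = (df_ecommerce['Country'] == 'United Kingdom').mean()
print("Share of UK transactions: {:.1%}".format(uk_share))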
# plotting the Count of orders on each day of the year
ax = df_ecommerce.groupby(['Date']).count()['InvoiceNo'].plot(kind = 'line', figsize = (20,5),
title = 'Total Count of transactions on each day of the year')
ax.set_xlabel('Day of the year')
ax.set_ylabel('Count of transactions');
# plotting the quantity of orders on each day of the year
ax = df_ecommerce.groupby(['Date']).sum()['Quantity'].plot(kind = 'line', figsize = (20,5),
title = 'Total quantity of orders on each day of the year')
ax.set_xlabel('Day of the year')
ax.set_ylabel('Quantity');
# plotting the sales on each day of the year
ax = df_ecommerce.groupby(['Date']).sum()['Sales'].plot(kind = 'line', figsize = (20,5),
title = 'Total sales on each day of the year')
ax.set_xlabel('Day of the year')
ax.set_ylabel('Sales');
From the above time series chart, we can see that there is an increasing trend in the count of orders over the year.
There is also a seasonal pattern in both the quantity of orders placed and the corresponding sales. However, we do notice two pronounced peaks, one near the beginning of the year and one at the end, and we do not have enough evidence from the data to determine whether these are genuine data points or outliers. The description of the data says that "The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.", so it is plausible that a wholesale dealer bought items in bulk on those particular days.
Similarly, since we do not know for sure whether the peak at the end of the year is an outlier, we shall keep it in our analysis for now. With more information about the data and some domain knowledge, we could make a more educated decision about whether or not to include the data point in our analysis.
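To inspect those peaks more closely, we can list the days with the largest total sales (a small sketch):
# Sketch: the five days with the highest total sales, to see where the spikes occur
daily_sales = df_ecommerce.groupby('Date')['Sales'].sum()
print(daily_sales.sort_values(ascending=False).head(5))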
# plotting the quantity of orders in each year
ax = df_ecommerce.groupby(['Year']).sum()['Quantity'].plot(kind = 'bar', figsize = (10,5),
title = 'Quantity of orders on each year')
ax.set_xlabel('Year')
ax.set_ylabel('Quantity');
# plotting the count of orders on each month of the year
ax = df_ecommerce.groupby(['Month']).sum()['Quantity'].plot(kind = 'bar', figsize = (15,5),
title = 'Quantity of orders on each month of the year')
ax.set_xlabel('Month')
ax.set_ylabel('Quantity');
# plotting the sales on each month of the year
ax = df_ecommerce.groupby(['Month']).sum()['Sales'].plot(kind = 'bar', figsize = (15,5),
title = 'Sales on each month of the year')
ax.set_xlabel('Month')
ax.set_ylabel('Sales');
Since the data spans 01/12/2010 to 09/12/2011, it is logical that the number of orders is far higher in 2011 than in 2010, which covers only one month. From the above two charts, we notice that the quantity of orders and the sales follow a similar pattern, except for September and December, where relatively fewer orders produced comparatively more sales. We may infer that higher-priced goods sold more during these months than in other months.
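One way to probe that inference is to compare the quantity-weighted average unit price by month (a small sketch):
# Sketch: quantity-weighted average unit price per month (total sales / total quantity);
# noticeably higher values in September and December would support the inference above
monthly_totals = df_ecommerce.groupby('Month')[['Sales', 'Quantity']].sum()
print((monthly_totals['Sales'] / monthly_totals['Quantity']).round(2))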
# plotting the sales on each week of the year
ax = df_ecommerce.groupby(['WeekOfYear']).sum()['Sales'].plot(kind = 'bar', figsize = (15,5),
title = 'Sales on each week of the year')
ax.set_xlabel('Week of the year')
ax.set_ylabel('Sales');
We notice that sales were at their peak during the 49th week of the year. This is consistent with the peak we saw towards the end of the year in the trend charts above.
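We can confirm the peak week directly (a small sketch):
# Sketch: week of the year with the highest total sales
weekly_sales = df_ecommerce.groupby('WeekOfYear')['Sales'].sum()
print("Peak week:", weekly_sales.idxmax(), "with total sales of", round(weekly_sales.max(), 2))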
# plotting the sales on each day of the week
print(df_ecommerce.groupby(['DayOfWeek']).sum()['Sales'])
ax = df_ecommerce.groupby(['DayOfWeek']).sum()['Sales'].plot(kind = 'bar', figsize = (15,5),
title = 'Sales on each day of the week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Sales');
DayOfWeek Monday 1363604.401 Tuesday 1697733.801 Wednesday 1584283.830 Thursday 1973015.730 Friday 1483080.811 Saturday 0.000 Sunday 785490.321 Name: Sales, dtype: float64
# plotting the count of orders on each hour of the day
ax = df_ecommerce.groupby(['Hour']).count()['InvoiceNo'].plot(kind = 'bar', figsize = (15,5),
title = 'Count of orders on each hour of the day')
ax.set_xlabel('Hour of the day')
ax.set_ylabel('Count');
# plotting the quantity of orders on each hour of each day of the week
ax = df_ecommerce.groupby(['DayOfWeek','Hour']).count()['Quantity'].plot(kind = 'bar', figsize = (20,5),
title = 'Quantity of orders on each hour of each day of the week')
ax.set_xlabel('Hour/Day of the Week')
ax.set_ylabel('Quantity');
Sales peak on Thursdays, followed by Tuesdays. Mondays, Wednesdays, and Fridays have similar sales amounts, Sundays register the lowest sales, and there are no sales at all on Saturdays, so we may assume that the retail store does not operate on Saturdays. Within each day, the count of orders peaks around the afternoon and tapers at the start and end of the day; customer ordering activity is highest from around 10 AM to 4 PM.
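To verify the assumption about Saturdays, we can count the transactions recorded on each day of the week (a small sketch):
# Sketch: number of transactions per day of the week; a count of zero for Saturday
# would support the assumption that the store does not operate on Saturdays
print(df_ecommerce.groupby('DayOfWeek')['InvoiceNo'].count())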
# plotting the count of orders on each session of the day
ax = df_ecommerce.groupby(['Session']).count()['InvoiceNo'].plot(kind = 'bar', figsize = (15,5),
title = 'Count of orders on each session of the day')
ax.set_xlabel('Session of the day')
ax.set_ylabel('Count');
Most orders are placed during the mornings and afternoons.
# plotting the Top 20 orders based on the description
ax = df_ecommerce.groupby(['Description']).count()['InvoiceNo'].sort_values(ascending = False).head(20).plot(kind = 'bar',
figsize = (15,5),
title = 'Top 20 orders based on the description')
ax.set_xlabel('Description')
ax.set_ylabel('Count');
The item "WHITE HANGING HEART T-LIGHT HOLDER" appears to be the most frequently ordered item in the retail store.
FB Prophet is a procedure for forecasting time series data based on a Generalized Additive Model (GAM), where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects.
The systematic component of an additive model is the arithmetic sum of the individual effects of the predictors. Prophet combines seasonality, trend, and holidays.
y(t) = g(t) + s(t) + h(t) + ε(t)
where
g(t) = trend component (the trend function, which models non-periodic changes in the value of the time series)
s(t) = seasonal component (represents periodic changes, e.g., weekly and yearly seasonality)
h(t) = holiday component (represents the effects of holidays)
ε(t) = remainder component (represents any idiosyncratic changes or errors)
# import some modules that we need to initialize our environment
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics
from fbprophet.plot import plot_cross_validation_metric
df_ecommerce.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Sales | Date | Year | Month | WeekOfYear | DayOfWeek | Time | Session | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 15.30 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 22.00 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
ts = df_ecommerce[['Date','Sales']]
ts = ts.groupby(['Date'], as_index = False).agg(Sales = ('Sales', 'sum'))
ts
| Date | Sales | |
|---|---|---|
| 0 | 2010-12-01 | 46192.49 |
| 1 | 2010-12-02 | 47197.57 |
| 2 | 2010-12-03 | 23876.63 |
| 3 | 2010-12-05 | 31361.28 |
| 4 | 2010-12-06 | 31009.33 |
| ... | ... | ... |
| 300 | 2011-12-05 | 58081.09 |
| 301 | 2011-12-06 | 45989.66 |
| 302 | 2011-12-07 | 69230.60 |
| 303 | 2011-12-08 | 50395.96 |
| 304 | 2011-12-09 | 184329.66 |
305 rows × 2 columns
For running the FB Prophet model, it is mandatory to have the 'Date' column renamed to 'ds' and the 'Sales' column renamed to 'y'.
ts.columns = ['ds','y']
ts.columns
Index(['ds', 'y'], dtype='object')
print("The mean value of sales is {}".format(ts['y'].mean()))
print("The std value of sales is {}".format(ts['y'].std()))
The mean value of sales is 29138.389816393497
The std value of sales is 17834.947343909098
The time series ranges from 2010-12-01 to 2011-12-09; it has 305 observations, a mean of 29138.39, and a standard deviation of 17834.95.
ts.head()
| ds | y | |
|---|---|---|
| 0 | 2010-12-01 | 46192.49 |
| 1 | 2010-12-02 | 47197.57 |
| 2 | 2010-12-03 | 23876.63 |
| 3 | 2010-12-05 | 31361.28 |
| 4 | 2010-12-06 | 31009.33 |
ts.tail()
| ds | y | |
|---|---|---|
| 300 | 2011-12-05 | 58081.09 |
| 301 | 2011-12-06 | 45989.66 |
| 302 | 2011-12-07 | 69230.60 |
| 303 | 2011-12-08 | 50395.96 |
| 304 | 2011-12-09 | 184329.66 |
ts['ds']=pd.to_datetime(ts.ds)
ts.plot(x='ds',y='y', figsize = (20,5));
We can notice that there is one huge spike on the last day of the dataset, corresponding to the date '2011-12-09', with the value 184329.66. Now let us see how the time series chart looks without that value.
ts1 = ts.loc[:303,['ds','y']]
ts1
| ds | y | |
|---|---|---|
| 0 | 2010-12-01 | 46192.49 |
| 1 | 2010-12-02 | 47197.57 |
| 2 | 2010-12-03 | 23876.63 |
| 3 | 2010-12-05 | 31361.28 |
| 4 | 2010-12-06 | 31009.33 |
| ... | ... | ... |
| 299 | 2011-12-04 | 20232.00 |
| 300 | 2011-12-05 | 58081.09 |
| 301 | 2011-12-06 | 45989.66 |
| 302 | 2011-12-07 | 69230.60 |
| 303 | 2011-12-08 | 50395.96 |
304 rows × 2 columns
ts1['ds']=pd.to_datetime(ts.ds)
ts1.plot(x='ds',y='y', figsize = (20,5));
We notice that if we remove just that one data point, the time series chart looks much better. Although we do not know enough about the data to be certain, for now let us treat this point as an outlier and remove it from our data.
ts = ts.loc[:303,['ds','y']]
ts
| ds | y | |
|---|---|---|
| 0 | 2010-12-01 | 46192.49 |
| 1 | 2010-12-02 | 47197.57 |
| 2 | 2010-12-03 | 23876.63 |
| 3 | 2010-12-05 | 31361.28 |
| 4 | 2010-12-06 | 31009.33 |
| ... | ... | ... |
| 299 | 2011-12-04 | 20232.00 |
| 300 | 2011-12-05 | 58081.09 |
| 301 | 2011-12-06 | 45989.66 |
| 302 | 2011-12-07 | 69230.60 |
| 303 | 2011-12-08 | 50395.96 |
304 rows × 2 columns
print("The mean value of sales is {}".format(ts['y'].mean()))
print("The std value of sales is {}".format(ts['y'].std()))
The mean value of sales is 28627.892217105316
The std value of sales is 15472.15003619462
The time series now ranges from 2010-12-01 to 2011-12-08; it has 304 observations, a mean of 28627.89, and a standard deviation of 15472.15.
We can use statsmodels to perform a decomposition of this time series. The decomposition of time series is a statistical task that deconstructs a time series into several components, each representing one of the underlying categories of patterns. With statsmodels we will be able to see the trend, seasonal, and residual components of our data.
from statsmodels.tsa.seasonal import seasonal_decompose
decompose = seasonal_decompose(ts['y'],model='additive', period=12)
fig = decompose.plot()
fig.set_size_inches(15,10);
Above, we can see that the data is somewhat non-stationary, as there is a slightly increasing trend towards the end, and it is clearly seasonal, given the regular fluctuations at fixed time intervals. By stationarity, we mean that the mean, variance, and covariance should be constant if we divide the data into segments over time; seasonality means the data follows the same pattern when segregated by time intervals. In general, when working with time series data like this, we would need to make it stationary and non-seasonal. However, FB Prophet can handle this type of data, especially the seasonal component, so we need not worry about that preprocessing step.
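If we wanted to check stationarity more formally, an Augmented Dickey-Fuller test from statsmodels is one common option (a small sketch; not required for Prophet):
# Sketch: Augmented Dickey-Fuller test on the daily sales series
# (a p-value below 0.05 would suggest the series is stationary)
from statsmodels.tsa.stattools import adfuller
adf_stat, p_value = adfuller(ts['y'])[:2]
print("ADF statistic: {:.3f}, p-value: {:.3f}".format(adf_stat, p_value))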
There are two types of seasonalities in any data.
One is additive, where the seasonal effect is added to the trend. This type of data tends to show a linear trend.
The other is multiplicative, where the seasonal effect scales with the level of the series, like a compounding percentage growth. This type of data tends to show an exponential trend.
Since our data tends to show a linear trend, we use the FB Prophet model with additive seasonality, meaning the seasonal effect is added to the trend to produce the forecast.
len(ts)
304
Since our dataset has values for 304 days, we shall use approximately the first 80% of the data (244 data points) as training data and the remaining 20% (60 data points) as test data.
train=ts[:244]
test=ts[244:]
train.head()
| ds | y | |
|---|---|---|
| 0 | 2010-12-01 | 46192.49 |
| 1 | 2010-12-02 | 47197.57 |
| 2 | 2010-12-03 | 23876.63 |
| 3 | 2010-12-05 | 31361.28 |
| 4 | 2010-12-06 | 31009.33 |
test.head()
| ds | y | |
|---|---|---|
| 244 | 2011-09-30 | 40023.44 |
| 245 | 2011-10-02 | 11479.98 |
| 246 | 2011-10-03 | 62143.07 |
| 247 | 2011-10-04 | 38532.09 |
| 248 | 2011-10-05 | 73975.57 |
# a function to fit the FB Prophet model to the train and test data
def fit_prophet(dtf_train, dtf_test, lst_exog=None, model=None,
freq="D", figsize=(15,10)):
## train
model.fit(dtf_train)
## test
dtf_prophet = model.make_future_dataframe(periods=len(dtf_test),
freq=freq, include_history=True)
dtf_prophet = model.predict(dtf_prophet)
fig1 = model.plot(dtf_prophet) # Plot the fit to past data and future forecast.
fig2 = model.plot_components(dtf_prophet) # Plot breakdown of components.
plt.show()
dtf_train = dtf_train.merge(dtf_prophet[["ds","yhat"]],
how="left").rename(columns={'yhat':'model',
'y':'ts'}).set_index("ds")
dtf_test = dtf_test.merge(dtf_prophet[["ds","yhat"]],
how="left").rename(columns={'yhat':'forecast',
'y':'ts'}).set_index("ds")
## evaluate
dtf = dtf_train.append(dtf_test)
dtf = utils_evaluate_forecast(dtf, figsize=figsize,
title="Prophet")
return dtf, model
# function to evaluate the forecast
def utils_evaluate_forecast(dtf, title, plot=True, figsize=(20,13)):
try:
## residuals
dtf["residuals"] = dtf["ts"] - dtf["model"]
dtf["error"] = dtf["ts"] - dtf["forecast"]
dtf["error_pct"] = dtf["error"] / dtf["ts"]
## kpi
residuals_mean = dtf["residuals"].mean()
residuals_std = dtf["residuals"].std()
error_mean = dtf["error"].mean()
error_std = dtf["error"].std()
mae = dtf["error"].apply(lambda x: np.abs(x)).mean()
mape = dtf["error_pct"].apply(lambda x: np.abs(x)).mean()
mse = dtf["error"].apply(lambda x: x**2).mean()
rmse = np.sqrt(mse) #root mean squared error
## intervals
dtf["conf_int_low"] = dtf["forecast"] - 1.96*residuals_std
dtf["conf_int_up"] = dtf["forecast"] + 1.96*residuals_std
dtf["pred_int_low"] = dtf["forecast"] - 1.96*error_std
dtf["pred_int_up"] = dtf["forecast"] + 1.96*error_std
## plot
if plot==True:
fig = plt.figure(figsize=figsize)
fig.suptitle(title, fontsize=20)
ax1 = fig.add_subplot(2,2, 1)
ax2 = fig.add_subplot(2,2, 2, sharey=ax1)
ax3 = fig.add_subplot(2,2, 3)
ax4 = fig.add_subplot(2,2, 4)
### training
dtf[pd.notnull(dtf["model"])][["ts","model"]].plot(color=["black","green"], title="Model", grid=True, ax=ax1)
ax1.set(xlabel=None)
### test
dtf[pd.isnull(dtf["model"])][["ts","forecast"]].plot(color=["black","red"], title="Forecast", grid=True, ax=ax2)
ax2.fill_between(x=dtf.index, y1=dtf['pred_int_low'], y2=dtf['pred_int_up'], color='b', alpha=0.2)
ax2.fill_between(x=dtf.index, y1=dtf['conf_int_low'], y2=dtf['conf_int_up'], color='b', alpha=0.3)
ax2.set(xlabel=None)
### residuals
dtf[["residuals","error"]].plot(ax=ax3, color=["green","red"], title="Residuals", grid=True)
ax3.set(xlabel=None)
### residuals distribution
dtf[["residuals","error"]].plot(ax=ax4, color=["green","red"], kind='kde', title="Residuals Distribution", grid=True)
ax4.set(ylabel=None)
plt.show()
print("Training --> Residuals mean:", np.round(residuals_mean), " | std:", np.round(residuals_std))
print("Test --> Error mean:", np.round(error_mean), " | std:", np.round(error_std),
" | mae:",np.round(mae), " | mape:",np.round(mape*100), "% | mse:",np.round(mse), " | rmse:",np.round(rmse))
return dtf[["ts","model","residuals","conf_int_low","conf_int_up",
"forecast","error","pred_int_low","pred_int_up"]]
except Exception as e:
print("--- got error ---")
print(e)
model = Prophet(growth="linear",
seasonality_mode="additive",
yearly_seasonality=True,
weekly_seasonality=True,
daily_seasonality=True);
dtf, model = fit_prophet(train, test, model=model, freq="D")
Training --> Residuals mean: -1.0 | std: 10186.0
Test --> Error mean: 18793.0 | std: 13278.0 | mae: 19395.0 | mape: 44.0 % | mse: 526025768.0 | rmse: 22935.0
In the first plot above, the black dots represent the actual measurements, the blue line displays Prophet's forecast, and the light blue band indicates the uncertainty intervals. We also notice that the yearly seasonality is being overfitted, with a lot of fluctuation points.
Mean Absolute Error (MAE) : The Mean Absolute Error is the average of the absolute difference between the actual and predicted values
Mean Absolute Percentage Error (MAPE) : The Mean Absolute Percentage Error is the average of the absolute percentage errors
Mean Squared Error (MSE) : Mean Squared Error represents the average of the squared difference between the original and predicted values
Root Mean Squared Error (RMSE) : Root Mean Squared Error is the square root of Mean Squared error. It measures the standard deviation of residuals
We shall use MAPE to evaluate our model. The MAPE for our base model is 44%, which indicates that, averaged over all predicted points, we are off by about 44% from the true value.
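For reference, these four metrics reduce to a few lines of NumPy; the sketch below uses small placeholder arrays (y_true and y_pred are illustrative, not taken from the model):
# Sketch: computing MAE, MAPE, MSE, and RMSE from placeholder arrays
y_true = np.array([100.0, 200.0, 300.0])   # illustrative actual values
y_pred = np.array([110.0, 190.0, 330.0])   # illustrative forecasts
mae = np.mean(np.abs(y_true - y_pred))
mape = np.mean(np.abs((y_true - y_pred) / y_true))
mse = np.mean((y_true - y_pred) ** 2)
rmse = np.sqrt(mse)
print(mae, mape, mse, rmse)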
growth: This parameter can be set to either "linear" or "logistic". Looking at the trend line, we notice a trend that keeps growing with no real saturation in sight, so we set this parameter to "linear". If the trend showed signs of saturating, we would set it to "logistic" instead.
seasonality_mode: This parameter can be set to either "additive" or "multiplicative" and indicates how the seasonality components are combined with the predictions. We use "additive", which is also Prophet's default, since our seasonality is roughly constant in magnitude over the entire period.
n_changepoints: Changepoints are the points in your data where there are sudden and abrupt changes in the trend. This parameter determines how many change points we would like to have.
changepoint_prior_scale: The changepoint_prior_scale indicates how flexible the changepoints are allowed to be, i.e., how closely the trend is allowed to fit the data. Larger values allow the trend to flex more at the changepoints, while smaller values keep it more rigid. I found 0.095 to result in the lowest MAPE (see the cross-validation sketch after these parameter descriptions for one way to search such values).
holidays_prior_scale: This parameter determines how much of an effect holidays should have on the predictions. The default value is 10; however, increasing it to 20 resulted in a lower MAPE. This makes sense because we are dealing with retail data, and holidays can have a noticeable effect on sales.
add_seasonality: Prophet offers a lot of control over seasonality: yearly, weekly, and daily seasonality and their granularity. I set yearly_seasonality, weekly_seasonality, and daily_seasonality to False and then added my own seasonalities using the add_seasonality method, which gives more power and control. For example, for the "monthly" seasonality, the period is set to 30.5, which tells the model that what happened at a certain point is likely to happen again roughly 30.5 days later; the periods for the other seasonalities are set similarly. The advantage of this method is that we can introduce seasonalities based on domain knowledge, such as a quarterly seasonality, that are not available by default: setting the period parameter to 365/4 would add a quarterly seasonality to the model. The other parameter we can tweak is the number of Fourier components (fourier_order), which controls how closely the seasonal curve can follow the data (higher orders allow more wiggles). I tried values ranging from 10 to 25 and retained the ones that gave the lowest MAPE. The prior_scale parameter controls how flexible each seasonality is allowed to be; I got the lowest MAPE with a prior_scale of 0.00001 for the yearly seasonality.
add_country_holidays: FB Prophet gives us an option to include country-specific holidays. Since we are dealing with a UK-based, registered, non-store online retailer, I chose to include the UK holidays.
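Since cross_validation and performance_metrics were imported from fbprophet.diagnostics earlier, one way to arrive at values such as changepoint_prior_scale = 0.095 is a small grid search using Prophet's built-in cross-validation (a sketch; the candidate values and the initial/period/horizon windows are illustrative assumptions, and this can take a while to run):
# Sketch: grid search over changepoint_prior_scale using Prophet's cross-validation utilities.
# The candidate values and the 180/30/60-day windows are illustrative choices.
for cps in [0.01, 0.05, 0.095, 0.5]:
    m = Prophet(growth="linear", seasonality_mode="additive",
                changepoint_prior_scale=cps).fit(train)
    df_cv = cross_validation(m, initial='180 days', period='30 days', horizon='60 days')
    print(cps, performance_metrics(df_cv)['mape'].mean())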
I incorporated the hyperparameters in the model below.
model = Prophet(growth="linear",
seasonality_mode="additive",
n_changepoints=25,
changepoint_prior_scale = 0.095,
holidays_prior_scale=20,
yearly_seasonality=False,
weekly_seasonality=False,
daily_seasonality=False).add_seasonality(
name='monthly',
period=30.5,
fourier_order=10,
prior_scale=50).add_seasonality(
name='daily',
period=1,
fourier_order=5,
prior_scale=25).add_seasonality(
name='weekly',
period=7,
fourier_order=20,
prior_scale=25).add_seasonality(
name='yearly',
period=365.5,
fourier_order=1,
prior_scale=0.00001)
model.add_country_holidays(country_name='UK');
dtf, model = fit_prophet(train, test, model=model, freq="D")
Training --> Residuals mean: 14.0 | std: 11055.0
Test --> Error mean: 7347.0 | std: 12506.0 | mae: 10953.0 | mape: 26.0 % | mse: 207326231.0 | rmse: 14399.0
We notice that the trend line is increasing over the year.
The weekly peaks occur on Thursdays and Tuesdays.
The sales seem to peak during the later part of the year, around November, and dip during the middle of the year, around May.
After all the hyperparameter tuning, we notice that,
The Mean Absolute Error (MAE) is reduced to 10953.0 from 19395.0 in the base model.
The Mean Absolute Percentage Error (MAPE) is reduced to 26% from 44% in the base model.
The Mean Squared Error (MSE) is reduced to 207326231.0 from 526025768.0 in the base model.
The Root Mean Squared Error (RMSE) is reduced to 14399.0 from 22935.0 in the base model.
The reduction in MAPE from 44% to 26% indicates that, averaged over all predicted sales values for the retail store, we are now off by about 26% from the true value.
The reduction in RMSE means that the model's predicted daily sales are now typically about 14399.0 pounds sterling off.
With the help of specific domain knowledge, we may be able to improve these scores.
from fbprophet.plot import add_changepoints_to_plot
forecast = model.predict(model.make_future_dataframe(periods=len(test), freq="D"))
fig = model.plot(forecast)
a = add_changepoints_to_plot(fig.gca(), model, forecast)
model.changepoints
8      2010-12-10
16     2010-12-20
23     2011-01-07
31     2011-01-17
39     2011-01-26
47     2011-02-04
54     2011-02-13
62     2011-02-22
70     2011-03-03
78     2011-03-13
85     2011-03-21
93     2011-03-30
101    2011-04-08
109    2011-04-18
116    2011-05-01
124    2011-05-11
132    2011-05-20
140    2011-05-31
147    2011-06-08
155    2011-06-17
163    2011-06-27
171    2011-07-06
178    2011-07-14
186    2011-07-24
194    2011-08-02
Name: ds, dtype: datetime64[ns]
In the above chart, the changepoints (red dotted lines) denote the points where the trend changes (either increases or decreases). Based on the dates (shown above) on which the trend changes, we can expect an increase or a decrease in sales around the same dates in future years. This would help us in running appropriate marketing campaigns and in planning inventory and logistics activities. With the help of a domain knowledge expert, we could fine-tune the changepoints to reflect specific marketing strategy roll-outs and promotional deals.
model.train_holiday_names
0                                    New Year's Day
1                       New Year Holiday [Scotland]
2            New Year Holiday [Scotland] (Observed)
3              St. Patrick's Day [Northern Ireland]
4            Battle of the Boyne [Northern Ireland]
5                    Summer Bank Holiday [Scotland]
6                        St. Andrew's Day [Scotland]
7                                     Christmas Day
8                          Christmas Day (Observed)
9                                       Good Friday
10     Easter Monday [England/Wales/Northern Ireland]
11                                          May Day
12                              Spring Bank Holiday
13    Late Summer Bank Holiday [England/Wales/Northe...
14                                       Boxing Day
15                            Boxing Day (Observed)
16                        New Year's Day (Observed)
17                 Wedding of William and Catherine
dtype: object
The above holidays, which are specific to the United Kingdom, were added to the model. However, with the help of domain knowledge experts, if the company wants to include other events such as "Thanksgiving" or the "Superbowl", we can provide them along with the built-in holidays to improve our forecast.
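As a sketch of how that could be done, extra events can be passed to Prophet through its holidays argument as a DataFrame with 'holiday' and 'ds' columns (the event name and dates below are illustrative placeholders):
# Sketch: supplying custom events to Prophet via the 'holidays' argument
# (the event name and dates are illustrative placeholders)
custom_events = pd.DataFrame({
    'holiday': 'promotion_day',
    'ds': pd.to_datetime(['2010-12-17', '2011-11-25']),
    'lower_window': 0,
    'upper_window': 1,
})
model_with_events = Prophet(holidays=custom_events, holidays_prior_scale=20)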
import pmdarima
import statsmodels.tsa.api as smt
# the data needs only the 'Date' and the 'Sales' columns
ts = df_ecommerce[['Date','Sales']]
ts = ts.groupby(['Date'], as_index = False).agg(Sales = ('Sales', 'sum'))
ts
| Date | Sales | |
|---|---|---|
| 0 | 2010-12-01 | 46192.49 |
| 1 | 2010-12-02 | 47197.57 |
| 2 | 2010-12-03 | 23876.63 |
| 3 | 2010-12-05 | 31361.28 |
| 4 | 2010-12-06 | 31009.33 |
| ... | ... | ... |
| 300 | 2011-12-05 | 58081.09 |
| 301 | 2011-12-06 | 45989.66 |
| 302 | 2011-12-07 | 69230.60 |
| 303 | 2011-12-08 | 50395.96 |
| 304 | 2011-12-09 | 184329.66 |
305 rows × 2 columns
# finding the optimal parameters for the ARIMA model
best_model = pmdarima.auto_arima(ts['Sales'],
seasonal=True,stationary=False,
m=7, information_criterion='aic',
max_p=10, max_d=3, max_q=10,
max_P=10, max_D=3, max_Q=10,
error_action='ignore')
print("best model --> (p, d, q):", best_model.order, " and (P, D, Q, s):", best_model.seasonal_order)
best model --> (p, d, q): (0, 1, 2) and (P, D, Q, s): (0, 0, 2, 7)
# function to fit the SARIMAX model
def fit_sarimax(ts_train, ts_test, order=(1,0,1),
seasonal_order=(0,0,0,0), exog_train=None,
exog_test=None, figsize=(15,10)):
## train
model = smt.SARIMAX(ts_train, order=order,
seasonal_order=seasonal_order,
exog=exog_train, enforce_stationarity=False,
enforce_invertibility=False).fit()
dtf_train = ts_train.to_frame(name="ts")
dtf_train["model"] = model.fittedvalues
## test
dtf_test = ts_test.to_frame(name="ts")
dtf_test["forecast"] = model.predict(start=len(ts_train),
end=len(ts_train)+len(ts_test)-1,
exog=exog_test)
## evaluate
dtf = dtf_train.append(dtf_test)
dtf = utils_evaluate_forecast(dtf, figsize=figsize, title=None)
return dtf, model
# building the model using the optimal 'order' and 'seasonal order' values we found above
dtf, model = fit_sarimax(train['y'], test['y'], order=(0,1,2),
seasonal_order=(0,0,2,7))
Training --> Residuals mean: -1079.0 | std: 14413.0
Test --> Error mean: 9861.0 | std: 23436.0 | mae: 15243.0 | mape: 35.0 % | mse: 637476831.0 | rmse: 25248.0
We find that the MAPE of the SARIMAX model is 35%, which means that its predictions are off by about 35% from the true value on average.
We used the sales data to build an FB Prophet time series model and a SARIMAX model to forecast future sales, and explored the hyperparameters we can tune to get the most out of the historic sales data. We found that our FB Prophet model performs better than the SARIMAX model (26% vs 35% MAPE).
In conclusion, by coupling our analysis with the insights from some domain knowledge experts, we would be able to make the best out of our data.
References: I referred to various articles on customer segmentation from websites such as Medium, Analytics Vidhya, and Stack Overflow, and used the FB Prophet documentation for the time series forecasting.
I enjoyed working on this case study. Thank you very much for the opportunity!