import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import datetime as dt
import warnings
warnings.filterwarnings("ignore")
The dataset contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retailer. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.
# loading the dataset
df_ecommerce = pd.read_csv("data.csv", encoding= 'unicode_escape')
# first 5 rows of the dataset
df_ecommerce.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
# dimensions of the data
df_ecommerce.shape
(541909, 8)
There are 541909 rows and 8 columns in our data.
# information about the dataset
df_ecommerce.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   InvoiceNo    541909 non-null  object
 1   StockCode    541909 non-null  object
 2   Description  540455 non-null  object
 3   Quantity     541909 non-null  int64
 4   InvoiceDate  541909 non-null  object
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB
# columns in the dataset
df_ecommerce.columns
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
'UnitPrice', 'CustomerID', 'Country'],
dtype='object')
Since the "Quantity" column signifies quantity of orders, it is safe to assume that the quantity can never be negative. We shall now look into different values of the "Quantity" column
df_ecommerce[(df_ecommerce.Quantity <0)].count()['Quantity']
10624
df_ecommerce['Quantity'].unique()
array([ 6, 8, 2, 32, 3, 4, 24, 12,
48, 18, 20, 36, 80, 64, 10, 120,
96, 23, 5, 1, -1, 50, 40, 100,
192, 432, 144, 288, -12, -24, 16, 9,
128, 25, 30, 28, 7, 56, 72, 200,
600, 480, -6, 14, -2, 11, 33, 13,
-4, -5, -7, -3, 70, 252, 60, 216,
384, -10, 27, 15, 22, 19, 17, 21,
34, 47, 108, 52, -9360, -38, 75, 270,
42, 240, 90, 320, 1824, 204, 69, -36,
-192, -144, 160, 2880, 1400, 39, -48, -50,
26, 1440, 31, 82, 78, 97, 98, 35,
57, -20, 110, -22, -30, -70, -130, -80,
-120, -40, -25, -14, -15, -69, -140, -320,
-8, 720, 156, 324, 38, 37, 49, 95,
-9, -11, 29, 41, -72, -35, -21, -43,
-19, -18, -44, 402, 378, 150, 300, 54,
104, 67, 258, 66, 44, 55, 46, 99,
61, 408, 972, 208, 1008, 1000, -77, 1488,
250, 1394, 400, -223, -150, -13, -33, -723,
-177, 79, 84, -32, -100, -28, 272, -145,
-47, -96, 113, 45, 106, 68, 267, 115,
65, 1728, -60, -16, 53, -240, 76, 460,
71, 43, 213, 58, 576, 2400, 500, 180,
-300, -500, -23, 752, 960, 1296, 210, 172,
215, 129, 138, 116, 135, 197, -106, -54,
-17, -939, 147, 168, 256, -201, -53, -29,
-2600, -990, -290, -45, 860, 1010, 1356, 1284,
186, 114, 360, 1930, 2000, 3114, 1300, 670,
111, 211, 59, -310, -61, -41, 176, 648,
62, 74215, -74215, -64, -84, 89, -1400, 73,
-57, 112, 456, -59, -31, 5568, 2560, 136,
900, -600, -42, -94, -207, -52, 130, -206,
2592, 420, 800, 101, 1200, 864, -217, 94,
-1430, 1287, -162, -230, -173, -390, -234, 504,
123, 118, -76, -200, 1056, 1500, 280, 407,
141, 124, -99, 51, -92, -741, 3906, -400,
-114, 102, 1152, -88, 198, 117, 86, -720,
125, -86, -391, -87, -278, 140, 228, -154,
-3000, 81, -675, -210, -345, -975, -1200, -1121,
-27, -541, -1277, -197, 3000, -1100, -63, -5368,
219, -259, 232, -82, -178, 170, -110, 768,
88, 700, -102, 2160, -323, -450, -232, -83,
-153, -524, -2472, -49, -105, 342, 304, 167,
-34, 640, 175, 220, 74, 93, 164, 163,
63, 637, 122, 158, 165, 350, -46, -75,
-690, -39, -66, 83, 312, -180, 392, -624,
-194, -26, -62, -37, -91, -139, -158, -1479,
-55, -576, -750, 330, 151, -93, -432, -58,
-1092, -670, -1300, -1930, -2000, -3114, 462, -65,
-68, 492, -620, 273, -90, -170, -1512, -51,
85, -56, -160, -360, 105, -960, -2376, 1350,
428, -1350, 336, -786, -590, -168, -101, -71,
132, -413, -664, 227, 2700, 222, 246, 906,
756, 888, 552, -97, 224, 738, 608, -212,
4300, 146, 143, 276, -125, -116, -108, 4000,
-304, -272, -1206, -95, 1600, 323, -161, -472,
-618, -204, 1515, -1515, -9058, -9600, 660, -420,
-126, -220, -271, -1440, 264, 188, 588, 612,
152, -324, -480, -164, -78, -118, 430, -1681,
87, 155, 701, 828, 540, 696, 560, -250,
-408, -179, 121, -124, 512, -251, -3100, 3100,
-169, 126, 291, 3186, -2834, -109, 109, -121,
-530, -227, 261, -346, 352, 142, 107, -188,
-1060, -342, -288, 348, 1900, 157, -343, -455,
425, 968, 684, 824, -828, -701, 196, 248,
410, 236, 230, -156, -553, 145, 448, 245,
-252, -334, -318, -113, -115, 171, -242, 840,
-967, -203, -3167, -443, -1897, 225, -434, 750,
-682, -484, 682, 344, -635, -117, -3667, 450,
310, 494, -384, 92, 1788, -138, 624, 744,
416, 496, 396, 306, 1878, 1944, 666, 708,
1428, 852, 1412, 528, -756, -752, -152, -85,
-312, -79, -147, -67, -131, 183, -209, -186,
-231, -129, -458, -275, -2880, 1540, 672, -800,
-430, -380, -74, -840, -1296, -365, -104, -270,
-73, -306, 91, 255, 468, -468, -111, -184,
-103, -335, 4800, -112, -1000, 912, 1992, 184,
148, -657, -1671, -1158, -2618, -2003, -674, -4830,
-905, -1128, 832, 992, 630, 1020, 2100, 162,
-1560, -1284, -81, 314, 370, 131, 133, 484,
149, 153, 257, 139, 137, 628, 179, 1820,
478, 335, 253, 242, 375, -276, -256, 281,
193, 181, 404, 244, 207, 199, 1130, 326,
654, 688, 268, 249, -864, 234, -1510, -550,
-244, -132, -327, -313, 279, -398, -280, 374,
-504, -696, -149, -224, -428, 212, -267, -175,
12540, 760, -98, 774, -151, 2040, -1131, -135,
1120, 77, -900, 177, 490, 388, 620, 1404,
-155, -355, -337, -1050, -338, 205, -235, 698,
80995, -80995], dtype=int64)
We notice above that the "Quantity" column has 10624 rows with values less than 0. We shall now drop these rows and retain only the records where "Quantity" is non-negative.
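In this dataset, negative quantities typically correspond to cancellations or returns; a common convention (an assumption here, not stated in the dataset description) is that cancelled invoices have an InvoiceNo starting with 'C'. A quick sketch to check that before dropping the rows:
# Sketch (assumption): cancelled invoices are prefixed with 'C', so most negative-quantity
# rows should carry such invoice numbers
negative_rows = df_ecommerce[df_ecommerce.Quantity < 0]
print(negative_rows['InvoiceNo'].astype(str).str.startswith('C').mean())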
df_ecommerce = df_ecommerce[(df_ecommerce.Quantity >=0)]
df_ecommerce
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 541904 | 581587 | 22613 | PACK OF 20 SPACEBOY NAPKINS | 12 | 12/9/2011 12:50 | 0.85 | 12680.0 | France |
| 541905 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6 | 12/9/2011 12:50 | 2.10 | 12680.0 | France |
| 541906 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4 | 12/9/2011 12:50 | 4.15 | 12680.0 | France |
| 541907 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4 | 12/9/2011 12:50 | 4.15 | 12680.0 | France |
| 541908 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3 | 12/9/2011 12:50 | 4.95 | 12680.0 | France |
531285 rows × 8 columns
df_ecommerce[(df_ecommerce.UnitPrice < 0)]
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 299983 | A563186 | B | Adjust bad debt | 1 | 8/12/2011 14:51 | -11062.06 | NaN | United Kingdom |
| 299984 | A563187 | B | Adjust bad debt | 1 | 8/12/2011 14:52 | -11062.06 | NaN | United Kingdom |
Similarly, we notice that the "UnitPrice" column has values less than 0 for 2 records. Since a price cannot be negative, we shall drop these records.
df_ecommerce.drop(df_ecommerce[df_ecommerce['UnitPrice']<0].index, inplace=True)
df_ecommerce.isnull().sum()
InvoiceNo           0
StockCode           0
Description       592
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     133359
Country             0
dtype: int64
We notice that the two columns "Description" and "CustomerID" have NULL values.
df_ecommerce = df_ecommerce[df_ecommerce['CustomerID'].notna()]
df_ecommerce
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 541904 | 581587 | 22613 | PACK OF 20 SPACEBOY NAPKINS | 12 | 12/9/2011 12:50 | 0.85 | 12680.0 | France |
| 541905 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6 | 12/9/2011 12:50 | 2.10 | 12680.0 | France |
| 541906 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4 | 12/9/2011 12:50 | 4.15 | 12680.0 | France |
| 541907 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4 | 12/9/2011 12:50 | 4.15 | 12680.0 | France |
| 541908 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3 | 12/9/2011 12:50 | 4.95 | 12680.0 | France |
397924 rows × 8 columns
df_ecommerce.isnull().sum()
InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64
We notice that just by dropping the rows where "CustomerID" is NULL, the NULL values in the "Description" column have also been taken care of.
df_ecommerce.duplicated().sum()
5192
There are 5192 duplicate rows in the dataset; we shall now remove them.
# removing duplicates
df_ecommerce.drop_duplicates(inplace=True)
df_ecommerce.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
df_ecommerce.shape
(392732, 8)
After data preprocessing, we have 392732 rows and 8 columns in our dataset.
We have columns for "Quantity" and "UnitPrice". We shall now create a new "Sales" column, which is simply the product of "Quantity" and "UnitPrice".
df_ecommerce.insert(8,"Sales", df_ecommerce['Quantity']*df_ecommerce['UnitPrice'])
We notice that the InvoiceDate is of the 'object' datatype. We shall convert it into a datetime object so that it is easier to perform further manipulations
# convert the "InvoiceDate" into a datetime object so that it is easier to perform further manipulations
df_ecommerce['InvoiceDate'] = pd.to_datetime(df_ecommerce['InvoiceDate'])
df_ecommerce.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 392732 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   InvoiceNo    392732 non-null  object
 1   StockCode    392732 non-null  object
 2   Description  392732 non-null  object
 3   Quantity     392732 non-null  int64
 4   InvoiceDate  392732 non-null  datetime64[ns]
 5   UnitPrice    392732 non-null  float64
 6   CustomerID   392732 non-null  float64
 7   Country      392732 non-null  object
 8   Sales        392732 non-null  float64
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 30.0+ MB
The "InvoiceDate" column is now of the datetime datatype. We shall now create columns for the date, year, month, week of the year, day of the week, and hour of the day.
# Creating a date column
df_ecommerce['Date'] = df_ecommerce['InvoiceDate'].dt.date
# Creating a year column
df_ecommerce['Year'] = df_ecommerce['InvoiceDate'].dt.year
# Creating a month column
df_ecommerce['Month'] = df_ecommerce['InvoiceDate'].dt.month
# Creating a week of the year column
df_ecommerce['WeekOfYear'] = df_ecommerce['InvoiceDate'].dt.isocalendar().week
# Creating a day of the week column
df_ecommerce['DayOfWeek'] = [d.day_name() for d in df_ecommerce['InvoiceDate']]
# Ordering the day of the week column
df_ecommerce['DayOfWeek'] = pd.Categorical(df_ecommerce['DayOfWeek'],
categories= ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday'],
ordered = True)
# Creating the time of the day column
df_ecommerce['Time'] = [d.time() for d in df_ecommerce['InvoiceDate']]
# Splitting the the time of the day column into different sessions in a day
df_ecommerce=df_ecommerce.assign(Session=pd.cut(df_ecommerce.InvoiceDate.dt.hour,
[0,6,12,18,23],
labels=['Night','Morning','Afternoon','Evening'],
include_lowest=True))
# Creating the hour of the day column
df_ecommerce['Hour'] = df_ecommerce.InvoiceDate.dt.hour
df_ecommerce.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Sales | Date | Year | Month | WeekOfYear | DayOfWeek | Time | Session | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 15.30 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 22.00 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
df_ecommerce.isnull().sum()
InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
Sales          0
Date           0
Year           0
Month          0
WeekOfYear     0
DayOfWeek      0
Time           0
Session        0
Hour           0
dtype: int64
df_ecommerce.columns
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
'UnitPrice', 'CustomerID', 'Country', 'Sales', 'Date', 'Year', 'Month',
'WeekOfYear', 'DayOfWeek', 'Time', 'Session', 'Hour'],
dtype='object')
df_ecommerce.shape
(392732, 17)
After the data manipulation and feature engineering, our data now has 392732 rows and 17 columns.
df_ecommerce.reset_index(inplace=True)
df_ecommerce.drop(axis=1, columns=['index'], inplace=True)
df_ecommerce.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Sales | Date | Year | Month | WeekOfYear | DayOfWeek | Time | Session | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 15.30 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 22.00 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
df_ecommerce.shape
(392732, 17)
After feature engineering, we have 392732 rows and 17 columns in our dataset.
# plotting the count of orders for each country
ax = df_ecommerce.groupby(['Country']).count()['InvoiceNo'].plot(kind = 'bar', figsize = (20,10),
title = 'Count of transactions for each country')
ax.set_xlabel('Country')
ax.set_ylabel('Count');
df_ecommerce['Country'].value_counts()
United Kingdom          349227
Germany                   9027
France                    8327
EIRE                      7228
Spain                     2480
Netherlands               2363
Belgium                   2031
Switzerland               1842
Portugal                  1453
Australia                 1184
Norway                    1072
Italy                      758
Channel Islands            747
Finland                    685
Cyprus                     603
Sweden                     450
Austria                    398
Denmark                    380
Poland                     330
Japan                      321
Israel                     245
Unspecified                241
Singapore                  222
Iceland                    182
USA                        179
Canada                     151
Greece                     145
Malta                      112
United Arab Emirates        68
European Community          60
RSA                         58
Lebanon                     45
Lithuania                   35
Brazil                      32
Czech Republic              25
Bahrain                     17
Saudi Arabia                 9
Name: Country, dtype: int64
From the above chart, we can see that most of the transactions (349227, or almost 89% of all transactions) come from customers in the United Kingdom.
The dataset description mentions that the data "is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers." So it makes sense that most of the customers are from the UK.
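As a quick sanity check of that share (a small sketch; the exact figure depends on the cleaning steps above):
# Sketch: share of transactions whose Country is the United Kingdom
uk_share = (df_ecommerce['Country'] == 'United Kingdom').mean()
print("Share of UK transactions: {:.1%}".format(uk_share))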
# plotting the Count of orders on each day of the year
ax = df_ecommerce.groupby(['Date']).count()['InvoiceNo'].plot(kind = 'line', figsize = (20,5),
title = 'Total Count of transactions on each day of the year')
ax.set_xlabel('Day of the year')
ax.set_ylabel('Count of transactions');
# plotting the quantity of orders on each day of the year
ax = df_ecommerce.groupby(['Date']).sum()['Quantity'].plot(kind = 'line', figsize = (20,5),
title = 'Total quantity of orders on each day of the year')
ax.set_xlabel('Day of the year')
ax.set_ylabel('Quantity');
# plotting the sales on each day of the year
ax = df_ecommerce.groupby(['Date']).sum()['Sales'].plot(kind = 'line', figsize = (20,5),
title = 'Total sales on each day of the year')
ax.set_xlabel('Day of the year')
ax.set_ylabel('Sales');
From the above time series chart, we can see that there is an increasing trend in the count of orders over the year.
There is also a seasonal pattern in both the quantity of orders placed and the corresponding sales. However, we do notice two pronounced peaks, one near the beginning of the year and one at the end, and we do not have enough evidence from the data to determine whether these are genuine data points or outliers. The description of the data says that "The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.", so it is plausible that a wholesale dealer bought items in bulk on those particular days.
Similarly, since we do not know for sure whether the peak at the end of the year is an outlier, we shall keep it in our analysis for now. With more information about the data and some domain knowledge, we could make a more educated decision about whether or not to include the data point in our analysis.
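To inspect those peaks more closely, we can list the days with the largest total sales (a small sketch):
# Sketch: the five days with the highest total sales, to see where the spikes occur
daily_sales = df_ecommerce.groupby('Date')['Sales'].sum()
print(daily_sales.sort_values(ascending=False).head(5))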
# plotting the quantity of orders in each year
ax = df_ecommerce.groupby(['Year']).sum()['Quantity'].plot(kind = 'bar', figsize = (10,5),
title = 'Quantity of orders on each year')
ax.set_xlabel('Year')
ax.set_ylabel('Quantity');
# plotting the count of orders on each month of the year
ax = df_ecommerce.groupby(['Month']).sum()['Quantity'].plot(kind = 'bar', figsize = (15,5),
title = 'Quantity of orders on each month of the year')
ax.set_xlabel('Month')
ax.set_ylabel('Quantity');
# plotting the sales on each month of the year
ax = df_ecommerce.groupby(['Month']).sum()['Sales'].plot(kind = 'bar', figsize = (15,5),
title = 'Sales on each month of the year')
ax.set_xlabel('Month')
ax.set_ylabel('Sales');
Since the data spans 01/12/2010 to 09/12/2011, it is logical that the number of orders is far higher in 2011 than in 2010, which covers only one month. From the above two charts, we notice that the quantity of orders and the sales follow a similar pattern, except for September and December, where relatively fewer orders produced comparatively more sales. We may infer that higher-priced goods sold more during these months than in other months.
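One way to probe that inference is to compare the quantity-weighted average unit price by month (a small sketch):
# Sketch: quantity-weighted average unit price per month (total sales / total quantity);
# noticeably higher values in September and December would support the inference above
monthly_totals = df_ecommerce.groupby('Month')[['Sales', 'Quantity']].sum()
print((monthly_totals['Sales'] / monthly_totals['Quantity']).round(2))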
# plotting the sales on each week of the year
ax = df_ecommerce.groupby(['WeekOfYear']).sum()['Sales'].plot(kind = 'bar', figsize = (15,5),
title = 'Sales on each week of the year')
ax.set_xlabel('Week of the year')
ax.set_ylabel('Sales');
We notice that sales were at their peak during the 49th week of the year. This is consistent with the peak we saw towards the end of the year in the trend charts above.
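We can confirm the peak week directly (a small sketch):
# Sketch: week of the year with the highest total sales
weekly_sales = df_ecommerce.groupby('WeekOfYear')['Sales'].sum()
print("Peak week:", weekly_sales.idxmax(), "with total sales of", round(weekly_sales.max(), 2))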
# plotting the sales on each day of the week
print(df_ecommerce.groupby(['DayOfWeek']).sum()['Sales'])
ax = df_ecommerce.groupby(['DayOfWeek']).sum()['Sales'].plot(kind = 'bar', figsize = (15,5),
title = 'Sales on each day of the week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Sales');
DayOfWeek Monday 1363604.401 Tuesday 1697733.801 Wednesday 1584283.830 Thursday 1973015.730 Friday 1483080.811 Saturday 0.000 Sunday 785490.321 Name: Sales, dtype: float64
# plotting the count of orders on each hour of the day
ax = df_ecommerce.groupby(['Hour']).count()['InvoiceNo'].plot(kind = 'bar', figsize = (15,5),
title = 'Count of orders on each hour of the day')
ax.set_xlabel('Hour of the day')
ax.set_ylabel('Count');
# plotting the quantity of orders on each hour of each day of the week
ax = df_ecommerce.groupby(['DayOfWeek','Hour']).count()['Quantity'].plot(kind = 'bar', figsize = (20,5),
title = 'Quantity of orders on each hour of each day of the week')
ax.set_xlabel('Hour/Day of the Week')
ax.set_ylabel('Quantity');
Sales peak on Thursdays, followed by Tuesdays. Mondays, Wednesdays, and Fridays have similar sales amounts, Sundays register the lowest sales, and there are no sales at all on Saturdays, so we may assume that the retail store does not operate on Saturdays. Within each day, the count of orders peaks around the afternoon and tapers at the start and end of the day; customer ordering activity is highest from around 10 AM to 4 PM.
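To verify the assumption about Saturdays, we can count the transactions recorded on each day of the week (a small sketch):
# Sketch: number of transactions per day of the week; a count of zero for Saturday
# would support the assumption that the store does not operate on Saturdays
print(df_ecommerce.groupby('DayOfWeek')['InvoiceNo'].count())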
# plotting the count of orders on each session of the day
ax = df_ecommerce.groupby(['Session']).count()['InvoiceNo'].plot(kind = 'bar', figsize = (15,5),
title = 'Count of orders on each session of the day')
ax.set_xlabel('Session of the day')
ax.set_ylabel('Count');
Most orders are placed during the mornings and afternoons.
# plotting the Top 20 orders based on the description
ax = df_ecommerce.groupby(['Description']).count()['InvoiceNo'].sort_values(ascending = False).head(20).plot(kind = 'bar',
figsize = (15,5),
title = 'Top 20 orders based on the description')
ax.set_xlabel('Description')
ax.set_ylabel('Count');
The item "WHITE HANGING HEART T-LIGHT HOLDER" appears to be the most frequently ordered item in the retail store.
FB Prophet is a procedure for forecasting time series data based on a Generalized Additive Model (GAM), where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects.
The systematic component of an additive model is the arithmetic sum of the individual effects of the predictors. Prophet combines seasonality, trend, and holidays.
y(t) = g(t) + s(t) + h(t) + ε(t)
where
g(t) = trend component (the trend function, which models non-periodic changes in the value of the time series)
s(t) = seasonal component (represents periodic changes, e.g., weekly and yearly seasonality)
h(t) = holiday component (represents the effects of holidays)
ε(t) = remainder component (represents any idiosyncratic changes or errors)
# import some modules that we need to initialize our environment
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics
from fbprophet.plot import plot_cross_validation_metric
df_ecommerce.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Sales | Date | Year | Month | WeekOfYear | DayOfWeek | Time | Session | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 15.30 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 22.00 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
ts = df_ecommerce[['Date','Sales']]
ts = ts.groupby(['Date'], as_index = False).agg(Sales = ('Sales', 'sum'))
ts
| Date | Sales | |
|---|---|---|
| 0 | 2010-12-01 | 46192.49 |
| 1 | 2010-12-02 | 47197.57 |
| 2 | 2010-12-03 | 23876.63 |
| 3 | 2010-12-05 | 31361.28 |
| 4 | 2010-12-06 | 31009.33 |
| ... | ... | ... |
| 300 | 2011-12-05 | 58081.09 |
| 301 | 2011-12-06 | 45989.66 |
| 302 | 2011-12-07 | 69230.60 |
| 303 | 2011-12-08 | 50395.96 |
| 304 | 2011-12-09 | 184329.66 |
305 rows × 2 columns
For running the FB Prophet model, it is mandatory to have the 'Date' column renamed to 'ds' and the 'Sales' column renamed to 'y'.
ts.columns = ['ds','y']
ts.columns
Index(['ds', 'y'], dtype='object')
print("The mean value of sales is {}".format(ts['y'].mean()))
print("The std value of sales is {}".format(ts['y'].std()))
The mean value of sales is 29138.389816393497
The std value of sales is 17834.947343909098
The time series ranges from 2010-12-01 to 2011-12-09; it has 305 observations, a mean of 29138.39, and a standard deviation of 17834.95.
ts.head()
| ds | y | |
|---|---|---|
| 0 | 2010-12-01 | 46192.49 |
| 1 | 2010-12-02 | 47197.57 |
| 2 | 2010-12-03 | 23876.63 |
| 3 | 2010-12-05 | 31361.28 |
| 4 | 2010-12-06 | 31009.33 |
ts.tail()
| ds | y | |
|---|---|---|
| 300 | 2011-12-05 | 58081.09 |
| 301 | 2011-12-06 | 45989.66 |
| 302 | 2011-12-07 | 69230.60 |
| 303 | 2011-12-08 | 50395.96 |
| 304 | 2011-12-09 | 184329.66 |
ts['ds']=pd.to_datetime(ts.ds)
ts.plot(x='ds',y='y', figsize = (20,5));
We can notice that there is one huge spike on the last day of the dataset, corresponding to the date '2011-12-09', with the value 184329.66. Now let us see how the time series chart looks without that value.
ts1 = ts.loc[:303,['ds','y']]
ts1
| ds | y | |
|---|---|---|
| 0 | 2010-12-01 | 46192.49 |
| 1 | 2010-12-02 | 47197.57 |
| 2 | 2010-12-03 | 23876.63 |
| 3 | 2010-12-05 | 31361.28 |
| 4 | 2010-12-06 | 31009.33 |
| ... | ... | ... |
| 299 | 2011-12-04 | 20232.00 |
| 300 | 2011-12-05 | 58081.09 |
| 301 | 2011-12-06 | 45989.66 |
| 302 | 2011-12-07 | 69230.60 |
| 303 | 2011-12-08 | 50395.96 |
304 rows × 2 columns
ts1['ds']=pd.to_datetime(ts.ds)
ts1.plot(x='ds',y='y', figsize = (20,5));
We notice that if we remove just that one data point, the time series chart looks much better. Although we do not know enough about the data to be certain, for now let us treat this point as an outlier and remove it from our data.
ts = ts.loc[:303,['ds','y']]
ts
| ds | y | |
|---|---|---|
| 0 | 2010-12-01 | 46192.49 |
| 1 | 2010-12-02 | 47197.57 |
| 2 | 2010-12-03 | 23876.63 |
| 3 | 2010-12-05 | 31361.28 |
| 4 | 2010-12-06 | 31009.33 |
| ... | ... | ... |
| 299 | 2011-12-04 | 20232.00 |
| 300 | 2011-12-05 | 58081.09 |
| 301 | 2011-12-06 | 45989.66 |
| 302 | 2011-12-07 | 69230.60 |
| 303 | 2011-12-08 | 50395.96 |
304 rows × 2 columns
print("The mean value of sales is {}".format(ts['y'].mean()))
print("The std value of sales is {}".format(ts['y'].std()))
The mean value of sales is 28627.892217105316
The std value of sales is 15472.15003619462
The time series now ranges from 2010-12-01 to 2011-12-08; it has 304 observations, a mean of 28627.89, and a standard deviation of 15472.15.
We can use statsmodels to perform a decomposition of this time series. The decomposition of time series is a statistical task that deconstructs a time series into several components, each representing one of the underlying categories of patterns. With statsmodels we will be able to see the trend, seasonal, and residual components of our data.
from statsmodels.tsa.seasonal import seasonal_decompose
decompose = seasonal_decompose(ts['y'],model='additive', period=12)
fig = decompose.plot()
fig.set_size_inches(15,10);
Above, we can see that the data is somewhat non-stationary, as there is a slightly increasing trend towards the end, and it is clearly seasonal, given the regular fluctuations at fixed time intervals. By stationarity, we mean that the mean, variance, and covariance should be constant if we divide the data into segments over time; seasonality means the data follows the same pattern when segregated by time intervals. In general, when working with time series data like this, we would need to make it stationary and non-seasonal. However, FB Prophet can handle this type of data, especially the seasonal component, so we need not worry about that preprocessing step.
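If we wanted to check stationarity more formally, an Augmented Dickey-Fuller test from statsmodels is one common option (a small sketch; not required for Prophet):
# Sketch: Augmented Dickey-Fuller test on the daily sales series
# (a p-value below 0.05 would suggest the series is stationary)
from statsmodels.tsa.stattools import adfuller
adf_stat, p_value = adfuller(ts['y'])[:2]
print("ADF statistic: {:.3f}, p-value: {:.3f}".format(adf_stat, p_value))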
There are two types of seasonalities in any data.
One is additive, where the seasonal effect is added to the trend. This type of data tends to show a linear trend.
The other is multiplicative, where the seasonal effect scales with the level of the series, like a compounding percentage growth. This type of data tends to show an exponential trend.
Since our data tends to show a linear trend, we use the FB Prophet model with additive seasonality, meaning the seasonal effect is added to the trend to produce the forecast.
len(ts)
304
Since our dataset has values for 304 days, we shall use approximately the first 80% of the data (244 data points) as training data and the remaining 20% (60 data points) as test data.
train=ts[:244]
test=ts[244:]
train.head()
| ds | y | |
|---|---|---|
| 0 | 2010-12-01 | 46192.49 |
| 1 | 2010-12-02 | 47197.57 |
| 2 | 2010-12-03 | 23876.63 |
| 3 | 2010-12-05 | 31361.28 |
| 4 | 2010-12-06 | 31009.33 |
test.head()
| ds | y | |
|---|---|---|
| 244 | 2011-09-30 | 40023.44 |
| 245 | 2011-10-02 | 11479.98 |
| 246 | 2011-10-03 | 62143.07 |
| 247 | 2011-10-04 | 38532.09 |
| 248 | 2011-10-05 | 73975.57 |
# a function to fit the FB Prophet model to the train and test data
def fit_prophet(dtf_train, dtf_test, lst_exog=None, model=None,
freq="D", figsize=(15,10)):
## train
model.fit(dtf_train)
## test
dtf_prophet = model.make_future_dataframe(periods=len(dtf_test),
freq=freq, include_history=True)
dtf_prophet = model.predict(dtf_prophet)
fig1 = model.plot(dtf_prophet) # Plot the fit to past data and future forecast.
fig2 = model.plot_components(dtf_prophet) # Plot breakdown of components.
plt.show()
dtf_train = dtf_train.merge(dtf_prophet[["ds","yhat"]],
how="left").rename(columns={'yhat':'model',
'y':'ts'}).set_index("ds")
dtf_test = dtf_test.merge(dtf_prophet[["ds","yhat"]],
how="left").rename(columns={'yhat':'forecast',
'y':'ts'}).set_index("ds")
## evaluate
dtf = dtf_train.append(dtf_test)
dtf = utils_evaluate_forecast(dtf, figsize=figsize,
title="Prophet")
return dtf, model
# function to evaluate the forecast
def utils_evaluate_forecast(dtf, title, plot=True, figsize=(20,13)):
try:
## residuals
dtf["residuals"] = dtf["ts"] - dtf["model"]
dtf["error"] = dtf["ts"] - dtf["forecast"]
dtf["error_pct"] = dtf["error"] / dtf["ts"]
## kpi
residuals_mean = dtf["residuals"].mean()
residuals_std = dtf["residuals"].std()
error_mean = dtf["error"].mean()
error_std = dtf["error"].std()
mae = dtf["error"].apply(lambda x: np.abs(x)).mean()
mape = dtf["error_pct"].apply(lambda x: np.abs(x)).mean()
mse = dtf["error"].apply(lambda x: x**2).mean()
rmse = np.sqrt(mse) #root mean squared error
## intervals
dtf["conf_int_low"] = dtf["forecast"] - 1.96*residuals_std
dtf["conf_int_up"] = dtf["forecast"] + 1.96*residuals_std
dtf["pred_int_low"] = dtf["forecast"] - 1.96*error_std
dtf["pred_int_up"] = dtf["forecast"] + 1.96*error_std
## plot
if plot==True:
fig = plt.figure(figsize=figsize)
fig.suptitle(title, fontsize=20)
ax1 = fig.add_subplot(2,2, 1)
ax2 = fig.add_subplot(2,2, 2, sharey=ax1)
ax3 = fig.add_subplot(2,2, 3)
ax4 = fig.add_subplot(2,2, 4)
### training
dtf[pd.notnull(dtf["model"])][["ts","model"]].plot(color=["black","green"], title="Model", grid=True, ax=ax1)
ax1.set(xlabel=None)
### test
dtf[pd.isnull(dtf["model"])][["ts","forecast"]].plot(color=["black","red"], title="Forecast", grid=True, ax=ax2)
ax2.fill_between(x=dtf.index, y1=dtf['pred_int_low'], y2=dtf['pred_int_up'], color='b', alpha=0.2)
ax2.fill_between(x=dtf.index, y1=dtf['conf_int_low'], y2=dtf['conf_int_up'], color='b', alpha=0.3)
ax2.set(xlabel=None)
### residuals
dtf[["residuals","error"]].plot(ax=ax3, color=["green","red"], title="Residuals", grid=True)
ax3.set(xlabel=None)
### residuals distribution
dtf[["residuals","error"]].plot(ax=ax4, color=["green","red"], kind='kde', title="Residuals Distribution", grid=True)
ax4.set(ylabel=None)
plt.show()
print("Training --> Residuals mean:", np.round(residuals_mean), " | std:", np.round(residuals_std))
print("Test --> Error mean:", np.round(error_mean), " | std:", np.round(error_std),
" | mae:",np.round(mae), " | mape:",np.round(mape*100), "% | mse:",np.round(mse), " | rmse:",np.round(rmse))
return dtf[["ts","model","residuals","conf_int_low","conf_int_up",
"forecast","error","pred_int_low","pred_int_up"]]
except Exception as e:
print("--- got error ---")
print(e)
model = Prophet(growth="linear",
seasonality_mode="additive",
yearly_seasonality=True,
weekly_seasonality=True,
daily_seasonality=True);
dtf, model = fit_prophet(train, test, model=model, freq="D")
Training --> Residuals mean: -1.0 | std: 10186.0
Test --> Error mean: 18793.0 | std: 13278.0 | mae: 19395.0 | mape: 44.0 % | mse: 526025768.0 | rmse: 22935.0
In the first plot above, the black dots represent the actual measurements, the blue line displays Prophet's forecast, and the light blue band indicates the uncertainty intervals. We also notice that the yearly seasonality is being overfitted, with a lot of fluctuation points.
Mean Absolute Error (MAE) : The Mean Absolute Error is the average of the absolute difference between the actual and predicted values
Mean Absolute Percentage Error (MAPE) : The Mean Absolute Percentage Error is the average of the absolute percentage errors
Mean Squared Error (MSE) : Mean Squared Error represents the average of the squared difference between the original and predicted values
Root Mean Squared Error (RMSE) : Root Mean Squared Error is the square root of Mean Squared error. It measures the standard deviation of residuals
We shall use MAPE to evaluate our model. The MAPE for our base model is 44%, which indicates that, averaged over all predicted points, we are off by about 44% from the true value.
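For reference, these four metrics reduce to a few lines of NumPy; the sketch below uses small placeholder arrays (y_true and y_pred are illustrative, not taken from the model):
# Sketch: computing MAE, MAPE, MSE, and RMSE from placeholder arrays
y_true = np.array([100.0, 200.0, 300.0])   # illustrative actual values
y_pred = np.array([110.0, 190.0, 330.0])   # illustrative forecasts
mae = np.mean(np.abs(y_true - y_pred))
mape = np.mean(np.abs((y_true - y_pred) / y_true))
mse = np.mean((y_true - y_pred) ** 2)
rmse = np.sqrt(mse)
print(mae, mape, mse, rmse)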
growth: This parameter can be set to either "linear" or "logistic". Looking at the trend line, we notice a trend that keeps growing with no real saturation in sight, so we set this parameter to "linear". If the trend showed signs of saturating, we would set it to "logistic" instead.
seasonality_mode: This parameter can be set to either "additive" or "multiplicative" and indicates how the seasonality components are combined with the predictions. We use "additive", which is also Prophet's default, since our seasonality is roughly constant in magnitude over the entire period.
n_changepoints: Changepoints are the points in your data where there are sudden and abrupt changes in the trend. This parameter determines how many change points we would like to have.
changepoint_prior_scale: The changepoint_prior_scale indicates how flexible the changepoints are allowed to be, i.e., how closely the trend is allowed to fit the data. Larger values allow the trend to flex more at the changepoints, while smaller values keep it more rigid. I found 0.095 to result in the lowest MAPE (see the cross-validation sketch after these parameter descriptions for one way to search such values).
holidays_prior_scale: This parameter determines how much of an effect holidays should have on the predictions. The default value is 10; however, increasing it to 20 resulted in a lower MAPE. This makes sense because we are dealing with retail data, and holidays can have a noticeable effect on sales.
add_seasonality: Prophet offers a lot of control over seasonality: yearly, weekly, and daily seasonality and their granularity. I set yearly_seasonality, weekly_seasonality, and daily_seasonality to False and then added my own seasonalities using the add_seasonality method, which gives more power and control. For example, for the "monthly" seasonality, the period is set to 30.5, which tells the model that what happened at a certain point is likely to happen again roughly 30.5 days later; the periods for the other seasonalities are set similarly. The advantage of this method is that we can introduce seasonalities based on domain knowledge, such as a quarterly seasonality, that are not available by default: setting the period parameter to 365/4 would add a quarterly seasonality to the model. The other parameter we can tweak is the number of Fourier components (fourier_order), which controls how closely the seasonal curve can follow the data (higher orders allow more wiggles). I tried values ranging from 10 to 25 and retained the ones that gave the lowest MAPE. The prior_scale parameter controls how flexible each seasonality is allowed to be; I got the lowest MAPE with a prior_scale of 0.00001 for the yearly seasonality.
add_country_holidays: FB Prophet gives us an option to include country-specific holidays. Since we are dealing with a UK-based, registered, non-store online retailer, I chose to include the UK holidays.
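Since cross_validation and performance_metrics were imported from fbprophet.diagnostics earlier, one way to arrive at values such as changepoint_prior_scale = 0.095 is a small grid search using Prophet's built-in cross-validation (a sketch; the candidate values and the initial/period/horizon windows are illustrative assumptions, and this can take a while to run):
# Sketch: grid search over changepoint_prior_scale using Prophet's cross-validation utilities.
# The candidate values and the 180/30/60-day windows are illustrative choices.
for cps in [0.01, 0.05, 0.095, 0.5]:
    m = Prophet(growth="linear", seasonality_mode="additive",
                changepoint_prior_scale=cps).fit(train)
    df_cv = cross_validation(m, initial='180 days', period='30 days', horizon='60 days')
    print(cps, performance_metrics(df_cv)['mape'].mean())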
I incorporated the hyperparameters in the model below.
model = Prophet(growth="linear",
seasonality_mode="additive",
n_changepoints=25,
changepoint_prior_scale = 0.095,
holidays_prior_scale=20,
yearly_seasonality=False,
weekly_seasonality=False,
daily_seasonality=False).add_seasonality(
name='monthly',
period=30.5,
fourier_order=10,
prior_scale=50).add_seasonality(
name='daily',
period=1,
fourier_order=5,
prior_scale=25).add_seasonality(
name='weekly',
period=7,
fourier_order=20,
prior_scale=25).add_seasonality(
name='yearly',
period=365.5,
fourier_order=1,
prior_scale=0.00001)
model.add_country_holidays(country_name='UK');
dtf, model = fit_prophet(train, test, model=model, freq="D")
Training --> Residuals mean: 14.0 | std: 11055.0
Test --> Error mean: 7347.0 | std: 12506.0 | mae: 10953.0 | mape: 26.0 % | mse: 207326231.0 | rmse: 14399.0
We notice that the trend line is increasing over the year.
The weekly peaks occur on Thursdays and Tuesdays.
The sales seem to peak during the later part of the year, around November, and dip during the middle of the year, around May.
After all the hyperparameter tuning, we notice that,
The Mean Absolute Error (MAE) is reduced to 10953.0 from 19395.0 in the base model.
The Mean Absolute Percentage Error (MAPE) is reduced to 26% from 44% in the base model.
The Mean Squared Error (MSE) is reduced to 207326231.0 from 526025768.0 in the base model.
The Root Mean Squared Error (RMSE) is reduced to 14399.0 from 22935.0 in the base model.
The reduction in MAPE from 44% to 26% indicates that, averaged over all predicted sales values for the retail store, we are now off by about 26% from the true value.
The reduction in RMSE means that the model's predicted daily sales are now typically about 14399.0 pounds sterling off.
With the help of specific domain knowledge, we may be able to improve these scores.
from fbprophet.plot import add_changepoints_to_plot
forecast = model.predict(model.make_future_dataframe(periods=len(test), freq="D"))
fig = model.plot(forecast)
a = add_changepoints_to_plot(fig.gca(), model, forecast)
model.changepoints
8      2010-12-10
16     2010-12-20
23     2011-01-07
31     2011-01-17
39     2011-01-26
47     2011-02-04
54     2011-02-13
62     2011-02-22
70     2011-03-03
78     2011-03-13
85     2011-03-21
93     2011-03-30
101    2011-04-08
109    2011-04-18
116    2011-05-01
124    2011-05-11
132    2011-05-20
140    2011-05-31
147    2011-06-08
155    2011-06-17
163    2011-06-27
171    2011-07-06
178    2011-07-14
186    2011-07-24
194    2011-08-02
Name: ds, dtype: datetime64[ns]
In the above chart, the changepoints (red dotted lines) denote the points where the trend changes (either increases or decreases). Based on the dates (shown above) on which the trend changes, we can expect an increase or a decrease in sales around the same dates in future years. This would help us in running appropriate marketing campaigns and in planning inventory and logistics activities. With the help of a domain knowledge expert, we could fine-tune the changepoints to reflect specific marketing strategy roll-outs and promotional deals.
model.train_holiday_names
0                                    New Year's Day
1                       New Year Holiday [Scotland]
2            New Year Holiday [Scotland] (Observed)
3              St. Patrick's Day [Northern Ireland]
4            Battle of the Boyne [Northern Ireland]
5                    Summer Bank Holiday [Scotland]
6                        St. Andrew's Day [Scotland]
7                                     Christmas Day
8                          Christmas Day (Observed)
9                                       Good Friday
10     Easter Monday [England/Wales/Northern Ireland]
11                                          May Day
12                              Spring Bank Holiday
13    Late Summer Bank Holiday [England/Wales/Northe...
14                                       Boxing Day
15                            Boxing Day (Observed)
16                        New Year's Day (Observed)
17                 Wedding of William and Catherine
dtype: object
The above holidays, which are specific to the United Kingdom, were added to the model. However, with the help of domain knowledge experts, if the company wants to include other events such as "Thanksgiving" or the "Superbowl", we can provide them along with the built-in holidays to improve our forecast.
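As a sketch of how that could be done, extra events can be passed to Prophet through its holidays argument as a DataFrame with 'holiday' and 'ds' columns (the event name and dates below are illustrative placeholders):
# Sketch: supplying custom events to Prophet via the 'holidays' argument
# (the event name and dates are illustrative placeholders)
custom_events = pd.DataFrame({
    'holiday': 'promotion_day',
    'ds': pd.to_datetime(['2010-12-17', '2011-11-25']),
    'lower_window': 0,
    'upper_window': 1,
})
model_with_events = Prophet(holidays=custom_events, holidays_prior_scale=20)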
import pmdarima
import statsmodels.tsa.api as smt
# the data needs only the 'Date' and the 'Sales' columns
ts = df_ecommerce[['Date','Sales']]
ts = ts.groupby(['Date'], as_index = False).agg(Sales = ('Sales', 'sum'))
ts
| Date | Sales | |
|---|---|---|
| 0 | 2010-12-01 | 46192.49 |
| 1 | 2010-12-02 | 47197.57 |
| 2 | 2010-12-03 | 23876.63 |
| 3 | 2010-12-05 | 31361.28 |
| 4 | 2010-12-06 | 31009.33 |
| ... | ... | ... |
| 300 | 2011-12-05 | 58081.09 |
| 301 | 2011-12-06 | 45989.66 |
| 302 | 2011-12-07 | 69230.60 |
| 303 | 2011-12-08 | 50395.96 |
| 304 | 2011-12-09 | 184329.66 |
305 rows × 2 columns
# finding the optimal parameters for the ARIMA model
best_model = pmdarima.auto_arima(ts['Sales'],
seasonal=True,stationary=False,
m=7, information_criterion='aic',
max_p=10, max_d=3, max_q=10,
max_P=10, max_D=3, max_Q=10,
error_action='ignore')
print("best model --> (p, d, q):", best_model.order, " and (P, D, Q, s):", best_model.seasonal_order)
best model --> (p, d, q): (0, 1, 2) and (P, D, Q, s): (0, 0, 2, 7)
# function to fit the SARIMAX model
def fit_sarimax(ts_train, ts_test, order=(1,0,1),
seasonal_order=(0,0,0,0), exog_train=None,
exog_test=None, figsize=(15,10)):
## train
model = smt.SARIMAX(ts_train, order=order,
seasonal_order=seasonal_order,
exog=exog_train, enforce_stationarity=False,
enforce_invertibility=False).fit()
dtf_train = ts_train.to_frame(name="ts")
dtf_train["model"] = model.fittedvalues
## test
dtf_test = ts_test.to_frame(name="ts")
dtf_test["forecast"] = model.predict(start=len(ts_train),
end=len(ts_train)+len(ts_test)-1,
exog=exog_test)
## evaluate
dtf = dtf_train.append(dtf_test)
dtf = utils_evaluate_forecast(dtf, figsize=figsize, title=None)
return dtf, model
# building the model using the optimal 'order' and 'seasonal order' values we found above
dtf, model = fit_sarimax(train['y'], test['y'], order=(0,1,2),
seasonal_order=(0,0,2,7))
Training --> Residuals mean: -1079.0 | std: 14413.0
Test --> Error mean: 9861.0 | std: 23436.0 | mae: 15243.0 | mape: 35.0 % | mse: 637476831.0 | rmse: 25248.0
We find that the MAPE of the SARIMAX model is 35%, which means that its predictions are off by about 35% from the true value on average.
We used the sales data to build an FB Prophet time series model and a SARIMAX model to forecast future sales, and explored the hyperparameters we can tune to get the most out of the historic sales data. We found that our FB Prophet model performs better than the SARIMAX model (26% vs 35% MAPE).
In conclusion, by coupling our analysis with the insights from some domain knowledge experts, we would be able to make the best out of our data.
References: I referred to various articles on customer segmentation from websites such as Medium, Analytics Vidhya, and Stack Overflow, and used the FB Prophet documentation for the time series forecasting.
I enjoyed working on this case study. Thank you very much for the opportunity!