import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import datetime as dt
import warnings
warnings.filterwarnings("ignore")
The dataset contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retailer. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.
# loading the dataset
df_ecommerce = pd.read_csv("data.csv", encoding= 'unicode_escape')
# first 5 rows of the dataset
df_ecommerce.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
# dimensions of the data
df_ecommerce.shape
(541909, 8)
There are 541909 rows and 8 columns in our data.
# information about the dataset
df_ecommerce.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 541909 entries, 0 to 541908 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 541909 non-null object 1 StockCode 541909 non-null object 2 Description 540455 non-null object 3 Quantity 541909 non-null int64 4 InvoiceDate 541909 non-null object 5 UnitPrice 541909 non-null float64 6 CustomerID 406829 non-null float64 7 Country 541909 non-null object dtypes: float64(2), int64(1), object(5) memory usage: 33.1+ MB
# columns in the dataset
df_ecommerce.columns
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country'], dtype='object')
Since the "Quantity" column signifies quantity of orders, it is safe to assume that the quantity can never be negative. We shall now look into different values of the "Quantity" column
df_ecommerce[(df_ecommerce.Quantity <0)].count()['Quantity']
10624
df_ecommerce['Quantity'].unique()
array([ 6, 8, 2, 32, 3, 4, 24, 12, 48, 18, 20, 36, 80, 64, 10, 120, 96, 23, 5, 1, -1, 50, 40, 100, 192, 432, 144, 288, -12, -24, 16, 9, 128, 25, 30, 28, 7, 56, 72, 200, 600, 480, -6, 14, -2, 11, 33, 13, -4, -5, -7, -3, 70, 252, 60, 216, 384, -10, 27, 15, 22, 19, 17, 21, 34, 47, 108, 52, -9360, -38, 75, 270, 42, 240, 90, 320, 1824, 204, 69, -36, -192, -144, 160, 2880, 1400, 39, -48, -50, 26, 1440, 31, 82, 78, 97, 98, 35, 57, -20, 110, -22, -30, -70, -130, -80, -120, -40, -25, -14, -15, -69, -140, -320, -8, 720, 156, 324, 38, 37, 49, 95, -9, -11, 29, 41, -72, -35, -21, -43, -19, -18, -44, 402, 378, 150, 300, 54, 104, 67, 258, 66, 44, 55, 46, 99, 61, 408, 972, 208, 1008, 1000, -77, 1488, 250, 1394, 400, -223, -150, -13, -33, -723, -177, 79, 84, -32, -100, -28, 272, -145, -47, -96, 113, 45, 106, 68, 267, 115, 65, 1728, -60, -16, 53, -240, 76, 460, 71, 43, 213, 58, 576, 2400, 500, 180, -300, -500, -23, 752, 960, 1296, 210, 172, 215, 129, 138, 116, 135, 197, -106, -54, -17, -939, 147, 168, 256, -201, -53, -29, -2600, -990, -290, -45, 860, 1010, 1356, 1284, 186, 114, 360, 1930, 2000, 3114, 1300, 670, 111, 211, 59, -310, -61, -41, 176, 648, 62, 74215, -74215, -64, -84, 89, -1400, 73, -57, 112, 456, -59, -31, 5568, 2560, 136, 900, -600, -42, -94, -207, -52, 130, -206, 2592, 420, 800, 101, 1200, 864, -217, 94, -1430, 1287, -162, -230, -173, -390, -234, 504, 123, 118, -76, -200, 1056, 1500, 280, 407, 141, 124, -99, 51, -92, -741, 3906, -400, -114, 102, 1152, -88, 198, 117, 86, -720, 125, -86, -391, -87, -278, 140, 228, -154, -3000, 81, -675, -210, -345, -975, -1200, -1121, -27, -541, -1277, -197, 3000, -1100, -63, -5368, 219, -259, 232, -82, -178, 170, -110, 768, 88, 700, -102, 2160, -323, -450, -232, -83, -153, -524, -2472, -49, -105, 342, 304, 167, -34, 640, 175, 220, 74, 93, 164, 163, 63, 637, 122, 158, 165, 350, -46, -75, -690, -39, -66, 83, 312, -180, 392, -624, -194, -26, -62, -37, -91, -139, -158, -1479, -55, -576, -750, 330, 151, -93, -432, -58, -1092, -670, -1300, -1930, -2000, -3114, 462, -65, -68, 492, -620, 273, -90, -170, -1512, -51, 85, -56, -160, -360, 105, -960, -2376, 1350, 428, -1350, 336, -786, -590, -168, -101, -71, 132, -413, -664, 227, 2700, 222, 246, 906, 756, 888, 552, -97, 224, 738, 608, -212, 4300, 146, 143, 276, -125, -116, -108, 4000, -304, -272, -1206, -95, 1600, 323, -161, -472, -618, -204, 1515, -1515, -9058, -9600, 660, -420, -126, -220, -271, -1440, 264, 188, 588, 612, 152, -324, -480, -164, -78, -118, 430, -1681, 87, 155, 701, 828, 540, 696, 560, -250, -408, -179, 121, -124, 512, -251, -3100, 3100, -169, 126, 291, 3186, -2834, -109, 109, -121, -530, -227, 261, -346, 352, 142, 107, -188, -1060, -342, -288, 348, 1900, 157, -343, -455, 425, 968, 684, 824, -828, -701, 196, 248, 410, 236, 230, -156, -553, 145, 448, 245, -252, -334, -318, -113, -115, 171, -242, 840, -967, -203, -3167, -443, -1897, 225, -434, 750, -682, -484, 682, 344, -635, -117, -3667, 450, 310, 494, -384, 92, 1788, -138, 624, 744, 416, 496, 396, 306, 1878, 1944, 666, 708, 1428, 852, 1412, 528, -756, -752, -152, -85, -312, -79, -147, -67, -131, 183, -209, -186, -231, -129, -458, -275, -2880, 1540, 672, -800, -430, -380, -74, -840, -1296, -365, -104, -270, -73, -306, 91, 255, 468, -468, -111, -184, -103, -335, 4800, -112, -1000, 912, 1992, 184, 148, -657, -1671, -1158, -2618, -2003, -674, -4830, -905, -1128, 832, 992, 630, 1020, 2100, 162, -1560, -1284, -81, 314, 370, 131, 133, 484, 149, 153, 257, 139, 137, 628, 179, 1820, 478, 335, 253, 242, 375, -276, -256, 281, 193, 181, 404, 244, 207, 199, 1130, 326, 
654, 688, 268, 249, -864, 234, -1510, -550, -244, -132, -327, -313, 279, -398, -280, 374, -504, -696, -149, -224, -428, 212, -267, -175, 12540, 760, -98, 774, -151, 2040, -1131, -135, 1120, 77, -900, 177, 490, 388, 620, 1404, -155, -355, -337, -1050, -338, 205, -235, 698, 80995, -80995], dtype=int64)
We may notice above that the "Quantity" column has 10624 rows with values less than 0. We shall now drop these rows and retain only the records with a non-negative "Quantity".
df_ecommerce = df_ecommerce[(df_ecommerce.Quantity >=0)]
df_ecommerce
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
... | ... | ... | ... | ... | ... | ... | ... | ... |
541904 | 581587 | 22613 | PACK OF 20 SPACEBOY NAPKINS | 12 | 12/9/2011 12:50 | 0.85 | 12680.0 | France |
541905 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6 | 12/9/2011 12:50 | 2.10 | 12680.0 | France |
541906 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4 | 12/9/2011 12:50 | 4.15 | 12680.0 | France |
541907 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4 | 12/9/2011 12:50 | 4.15 | 12680.0 | France |
541908 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3 | 12/9/2011 12:50 | 4.95 | 12680.0 | France |
531285 rows × 8 columns
df_ecommerce[(df_ecommerce.UnitPrice < 0)]
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
299983 | A563186 | B | Adjust bad debt | 1 | 8/12/2011 14:51 | -11062.06 | NaN | United Kingdom |
299984 | A563187 | B | Adjust bad debt | 1 | 8/12/2011 14:52 | -11062.06 | NaN | United Kingdom |
Similarly, we notice that the "UnitPrice" column has values less than 0 for 2 records. Since a price cannot be negative, we shall drop these records.
df_ecommerce.drop(df_ecommerce[df_ecommerce['UnitPrice']<0].index, inplace=True)
df_ecommerce.isnull().sum()
InvoiceNo 0 StockCode 0 Description 592 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 133359 Country 0 dtype: int64
We notice that the two columns "Description" and "CustomerID" have NULL values.
df_ecommerce = df_ecommerce[df_ecommerce['CustomerID'].notna()]
df_ecommerce
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
... | ... | ... | ... | ... | ... | ... | ... | ... |
541904 | 581587 | 22613 | PACK OF 20 SPACEBOY NAPKINS | 12 | 12/9/2011 12:50 | 0.85 | 12680.0 | France |
541905 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6 | 12/9/2011 12:50 | 2.10 | 12680.0 | France |
541906 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4 | 12/9/2011 12:50 | 4.15 | 12680.0 | France |
541907 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4 | 12/9/2011 12:50 | 4.15 | 12680.0 | France |
541908 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3 | 12/9/2011 12:50 | 4.95 | 12680.0 | France |
397924 rows × 8 columns
df_ecommerce.isnull().sum()
InvoiceNo 0 StockCode 0 Description 0 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 0 Country 0 dtype: int64
We notice that just by filtering the CustomerID column to retain only the non-NULL values, the NULL values in the Description column have also been taken care of.
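To see why this happens, one could check the raw file directly. The following is a minimal sketch (df_raw, missing_desc and missing_cust are illustrative names, and the raw CSV is re-read only for this check): if every row with a missing Description also has a missing CustomerID, then filtering on CustomerID necessarily removes all missing descriptions as well.
# sketch (assumption): re-read the raw file to inspect the overlap of missing values
df_raw = pd.read_csv("data.csv", encoding='unicode_escape')
missing_desc = df_raw['Description'].isna()
missing_cust = df_raw['CustomerID'].isna()
# number of rows with a missing Description but a present CustomerID
print((missing_desc & ~missing_cust).sum())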
df_ecommerce.duplicated().sum()
5192
There are 5192 duplicate rows in the dataset; we shall now remove them.
# removing duplicates
df_ecommerce.drop_duplicates(inplace=True)
df_ecommerce.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
df_ecommerce.shape
(392732, 8)
After data preprocessing, we have 392732 rows and 8 columns in our dataset.
We have columns for "Quantity" and "UnitPrice". Now we shall create a new "Sales" column, which is simply the product of "Quantity" and "UnitPrice".
df_ecommerce.insert(8,"Sales", df_ecommerce['Quantity']*df_ecommerce['UnitPrice'])
We notice that the InvoiceDate is of the 'object' datatype. We shall convert it into a datetime object so that it is easier to perform further manipulations
# convert the "InvoiceDate" into a datetime object so that it is easier to perform further manipulations
df_ecommerce['InvoiceDate'] = pd.to_datetime(df_ecommerce['InvoiceDate'])
df_ecommerce.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 392732 entries, 0 to 541908 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 392732 non-null object 1 StockCode 392732 non-null object 2 Description 392732 non-null object 3 Quantity 392732 non-null int64 4 InvoiceDate 392732 non-null datetime64[ns] 5 UnitPrice 392732 non-null float64 6 CustomerID 392732 non-null float64 7 Country 392732 non-null object 8 Sales 392732 non-null float64 dtypes: datetime64[ns](1), float64(3), int64(1), object(4) memory usage: 30.0+ MB
The InvoiceDate is now a datetime datatype. Now we shall create columns for the date, year, month, week of the year, day of the week, and hour of the day.
# Creating a date column
df_ecommerce['Date'] = df_ecommerce['InvoiceDate'].dt.date
# Creating a year column
df_ecommerce['Year'] = df_ecommerce['InvoiceDate'].dt.year
# Creating a month column
df_ecommerce['Month'] = df_ecommerce['InvoiceDate'].dt.month
# Creating a week of the year column
df_ecommerce['WeekOfYear'] = df_ecommerce['InvoiceDate'].dt.isocalendar().week
# Creating a day of the week column
df_ecommerce['DayOfWeek'] = df_ecommerce['InvoiceDate'].dt.day_name()
# Ordering the day of the week column
df_ecommerce['DayOfWeek'] = pd.Categorical(df_ecommerce['DayOfWeek'],
categories= ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday'],
ordered = True)
# Creating the time of the day column
df_ecommerce['Time'] = df_ecommerce['InvoiceDate'].dt.time
# Splitting the the time of the day column into different sessions in a day
df_ecommerce=df_ecommerce.assign(Session=pd.cut(df_ecommerce.InvoiceDate.dt.hour,
[0,6,12,18,23],
labels=['Night','Morning','Afternoon','Evening'],
include_lowest=True))
# Creating the hour of the day column
df_ecommerce['Hour'] = df_ecommerce.InvoiceDate.dt.hour
df_ecommerce.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Sales | Date | Year | Month | WeekOfYear | DayOfWeek | Time | Session | Hour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 15.30 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 22.00 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
df_ecommerce.isnull().sum()
InvoiceNo 0 StockCode 0 Description 0 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 0 Country 0 Sales 0 Date 0 Year 0 Month 0 WeekOfYear 0 DayOfWeek 0 Time 0 Session 0 Hour 0 dtype: int64
df_ecommerce.columns
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country', 'Sales', 'Date', 'Year', 'Month', 'WeekOfYear', 'DayOfWeek', 'Time', 'Session', 'Hour'], dtype='object')
df_ecommerce.shape
(392732, 17)
After the data manipulation and feature engineering, our data now has 392732 rows and 17 columns.
df_ecommerce.reset_index(inplace=True)
df_ecommerce.drop(axis=1, columns=['index'], inplace=True)
df_ecommerce.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Sales | Date | Year | Month | WeekOfYear | DayOfWeek | Time | Session | Hour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 15.30 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 22.00 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 | 2010-12-01 | 2010 | 12 | 48 | Wednesday | 08:26:00 | Morning | 8 |
df_ecommerce.shape
(392732, 17)
After feature engineering, we have 392732 rows and 17 columns in our dataset.
# plotting the count of orders for each country
ax = df_ecommerce.groupby(['Country']).count()['InvoiceNo'].plot(kind = 'bar', figsize = (20,10),
title = 'Count of transactions for each country')
ax.set_xlabel('Country')
ax.set_ylabel('Count');
df_ecommerce['Country'].value_counts()
United Kingdom 349227 Germany 9027 France 8327 EIRE 7228 Spain 2480 Netherlands 2363 Belgium 2031 Switzerland 1842 Portugal 1453 Australia 1184 Norway 1072 Italy 758 Channel Islands 747 Finland 685 Cyprus 603 Sweden 450 Austria 398 Denmark 380 Poland 330 Japan 321 Israel 245 Unspecified 241 Singapore 222 Iceland 182 USA 179 Canada 151 Greece 145 Malta 112 United Arab Emirates 68 European Community 60 RSA 58 Lebanon 45 Lithuania 35 Brazil 32 Czech Republic 25 Bahrain 17 Saudi Arabia 9 Name: Country, dtype: int64
From the above chart, we can see that most of the transactions (349227, which is almost 88% of all transactions) come from customers in the United Kingdom.
The description of the dataset mentions that the data "is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers." So it makes sense that most of the transactions come from the UK.
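The share quoted above can be computed directly from the country counts. A small sketch using the cleaned df_ecommerce (country_share is an illustrative name):
# percentage of transactions per country
country_share = df_ecommerce['Country'].value_counts(normalize=True) * 100
print(country_share.head().round(2))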
# plotting the Count of orders on each day of the year
ax = df_ecommerce.groupby(['Date']).count()['InvoiceNo'].plot(kind = 'line', figsize = (20,5),
title = 'Total Count of transactions on each day of the year')
ax.set_xlabel('Day of the year')
ax.set_ylabel('Count of transactions');
# plotting the quantity of orders on each day of the year
ax = df_ecommerce.groupby(['Date']).sum()['Quantity'].plot(kind = 'line', figsize = (20,5),
title = 'Total quantity of orders on each day of the year')
ax.set_xlabel('Day of the year')
ax.set_ylabel('Quantity');
# plotting the sales on each day of the year
ax = df_ecommerce.groupby(['Date']).sum()['Sales'].plot(kind = 'line', figsize = (20,5),
title = 'Total sales on each day of the year')
ax.set_xlabel('Day of the year')
ax.set_ylabel('Sales');
From the above time series chart, we can see that there is an increasing trend in the count of orders over the year.
And there is also a seasonal pattern in both the quantity of orders placed and the corresponding sales. However, we do notice two peaks, one at the beginning of the year and one at the end, and we do not have enough evidence from the data to determine whether these are genuine data points or outliers. The description of the data says that "The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.", so we can assume that some wholesale dealer bought items in bulk on those particular days.
Similarly, we notice a peak at the end of the year, and since we do not know for sure whether it is an outlier, we shall keep it in our analysis for now. With more knowledge about the data and the domain, we could make a more educated guess as to whether or not to include these data points in our analysis.
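To look at those spikes more closely, we could list the days with the largest total ordered quantity (a minimal sketch using the Date column created earlier; daily_quantity is an illustrative name). Unusually large one-off values here would support the bulk-purchase explanation.
# days with the highest total ordered quantity
daily_quantity = df_ecommerce.groupby('Date')['Quantity'].sum()
print(daily_quantity.nlargest(5))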
# plotting the quantity of orders in each year
ax = df_ecommerce.groupby(['Year']).sum()['Quantity'].plot(kind = 'bar', figsize = (10,5),
title = 'Quantity of orders on each year')
ax.set_xlabel('Year')
ax.set_ylabel('Quantity');
# plotting the quantity of orders in each month of the year
ax = df_ecommerce.groupby(['Month']).sum()['Quantity'].plot(kind = 'bar', figsize = (15,5),
title = 'Quantity of orders on each month of the year')
ax.set_xlabel('Month')
ax.set_ylabel('Quantity');
# plotting the sales on each month of the year
ax = df_ecommerce.groupby(['Month']).sum()['Sales'].plot(kind = 'bar', figsize = (15,5),
title = 'Sales on each month of the year')
ax.set_xlabel('Month')
ax.set_ylabel('Sales');
Since the data spans 01/12/2010 to 09/12/2011, it seems logical that there are far more orders in 2011 than in 2010. From the above two charts, we notice that the quantity of orders and the sales follow a similar pattern except for September and December, where a relatively smaller number of orders produced comparatively higher sales. We may infer that higher-priced goods sold more during these months than in other months.
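That inference can be sanity-checked by computing the average realised price per unit in each month, i.e. total Sales divided by total Quantity. A hedged sketch using the columns created above (monthly and AvgUnitPrice are illustrative names); higher values in September and December would be consistent with pricier items selling in those months.
# average realised price per unit for each month
monthly = df_ecommerce.groupby('Month')[['Sales', 'Quantity']].sum()
monthly['AvgUnitPrice'] = monthly['Sales'] / monthly['Quantity']
print(monthly['AvgUnitPrice'].round(2))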
# plotting the sales on each week of the year
ax = df_ecommerce.groupby(['WeekOfYear']).sum()['Sales'].plot(kind = 'bar', figsize = (15,5),
title = 'Sales on each week of the year')
ax.set_xlabel('Week of the year')
ax.set_ylabel('Sales');
We notice that sales peaked during the 49th week of the year. This is consistent with the peak we observed towards the end of the year in the daily sales trend chart above.
# plotting the sales on each day of the week
print(df_ecommerce.groupby(['DayOfWeek']).sum()['Sales'])
ax = df_ecommerce.groupby(['DayOfWeek']).sum()['Sales'].plot(kind = 'bar', figsize = (15,5),
title = 'Sales on each day of the week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Sales');
DayOfWeek Monday 1363604.401 Tuesday 1697733.801 Wednesday 1584283.830 Thursday 1973015.730 Friday 1483080.811 Saturday 0.000 Sunday 785490.321 Name: Sales, dtype: float64
# plotting the count of orders on each hour of the day
ax = df_ecommerce.groupby(['Hour']).count()['InvoiceNo'].plot(kind = 'bar', figsize = (15,5),
title = 'Count of orders on each hour of the day')
ax.set_xlabel('Hour of the day')
ax.set_ylabel('Count');
# plotting the count of orders for each hour of each day of the week
ax = df_ecommerce.groupby(['DayOfWeek','Hour']).count()['Quantity'].plot(kind = 'bar', figsize = (20,5),
                                            title = 'Count of orders for each hour of each day of the week')
ax.set_xlabel('Hour/Day of the Week')
ax.set_ylabel('Count');
Sales seem to peak on Thursdays, followed by Tuesdays. Mondays, Wednesdays, and Fridays have similar sales amounts. Sundays register the lowest sales, and there are no sales on Saturdays, so we may assume that the retail store does not operate on Saturdays. Within each day, the count of orders peaks around the afternoon and tapers off at the start and end of the day. Customer ordering activity is at its highest during the daytime, from around 10 AM to 4 PM.
# plotting the count of orders on each session of the day
ax = df_ecommerce.groupby(['Session']).count()['InvoiceNo'].plot(kind = 'bar', figsize = (15,5),
title = 'Count of orders on each session of the day')
ax.set_xlabel('Session of the day')
ax.set_ylabel('Count');
Most orders are placed during the mornings and afternoons.
# plotting the Top 20 orders based on the description
ax = df_ecommerce.groupby(['Description']).count()['InvoiceNo'].sort_values(ascending = False).head(20).plot(kind = 'bar',
figsize = (15,5),
title = 'Top 20 orders based on the description')
ax.set_xlabel('Description')
ax.set_ylabel('Count');
The item named "White Hanging Heart T-Light Holder" appears to be the most frequently ordered item in the retail store.
In order to perform customer segmentation, we would be using RFM analysis. RFM analysis is a customer segmentation technique that uses the past purchase behavior of the customers to segment them into groups based on their activity.
RFM helps in dividing customers into various clusters to identify customers who are more likely to respond to promotions, marketing campaigns and also for future personalization services.
RFM stands for:
RECENCY (R): Days since last purchase
FREQUENCY (F): Total number of purchases
MONETARY VALUE (M): Total transaction value that a customer spent
The basic idea is to segment customers based on when their last purchase was, how often they’ve purchased in the past, and how much they’ve spent overall.
We will create those 3 customer attributes for each customer.
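As a compact sketch, the three attributes could also be assembled in a single aggregation (the step-by-step construction used in this notebook follows below). This assumes the cleaned df_ecommerce and a reference date chosen after the last transaction; reference_date and rfm_sketch are illustrative names.
# sketch: Recency, Frequency and Monetary value in one groupby
reference_date = dt.date(2011, 12, 31)
rfm_sketch = df_ecommerce.groupby('CustomerID').agg(
    Recency=('Date', lambda d: (reference_date - d.max()).days),
    Frequency=('InvoiceNo', 'nunique'),
    Monetary_value=('Sales', 'sum'))
print(rfm_sketch.head())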
Recency denotes how recently a customer has made a purchase. We noticed earlier that the data ranges from 01/12/2010 to 09/12/2011. To calculate Recency, I considered an arbitrary date of 2011-12-31 that falls after the date range of the dataset, took the most recent order date for each customer, and subtracted it from this date. For example, if the result is 120 days, it means this customer's last purchase was 120 days ago.
recency_df = df_ecommerce.groupby(by='CustomerID', as_index=False)['Date'].max()
recency_df.columns = ['CustomerID','RecentPurchaseDate']
recency_df.head(5)
CustomerID | RecentPurchaseDate | |
---|---|---|
0 | 12346.0 | 2011-01-18 |
1 | 12347.0 | 2011-12-07 |
2 | 12348.0 | 2011-09-25 |
3 | 12349.0 | 2011-11-21 |
4 | 12350.0 | 2011-02-02 |
chosen_date = dt.date(2011,12,31)
recency_df['Recency'] = recency_df['RecentPurchaseDate'].apply(lambda x: (chosen_date - x).days)
recency_df.drop('RecentPurchaseDate',axis = 1,inplace=True)
recency_df.head(5)
CustomerID | Recency | |
---|---|---|
0 | 12346.0 | 347 |
1 | 12347.0 | 24 |
2 | 12348.0 | 97 |
3 | 12349.0 | 40 |
4 | 12350.0 | 332 |
For calculating the Frequency, we shall take the count of unique purchases made by each customer.
When we are taking the count of the frequency of purchases made by a customer, we need to understand that each customer has a unique customer ID and each order has a unique InvoiceNo. However, each customer could have bought multiple products in one single order.
For example, if we take a random customer with customer ID "12347.0", we may notice that a customer buys multiple products in a single order.
customer_12347 = df_ecommerce[df_ecommerce['CustomerID']==12347.0]
customer_12347
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Sales | Date | Year | Month | WeekOfYear | DayOfWeek | Time | Session | Hour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10257 | 537626 | 85116 | BLACK CANDELABRA T-LIGHT HOLDER | 12 | 2010-12-07 14:57:00 | 2.10 | 12347.0 | Iceland | 25.20 | 2010-12-07 | 2010 | 12 | 49 | Tuesday | 14:57:00 | Afternoon | 14 |
10258 | 537626 | 22375 | AIRLINE BAG VINTAGE JET SET BROWN | 4 | 2010-12-07 14:57:00 | 4.25 | 12347.0 | Iceland | 17.00 | 2010-12-07 | 2010 | 12 | 49 | Tuesday | 14:57:00 | Afternoon | 14 |
10259 | 537626 | 71477 | COLOUR GLASS. STAR T-LIGHT HOLDER | 12 | 2010-12-07 14:57:00 | 3.25 | 12347.0 | Iceland | 39.00 | 2010-12-07 | 2010 | 12 | 49 | Tuesday | 14:57:00 | Afternoon | 14 |
10260 | 537626 | 22492 | MINI PAINT SET VINTAGE | 36 | 2010-12-07 14:57:00 | 0.65 | 12347.0 | Iceland | 23.40 | 2010-12-07 | 2010 | 12 | 49 | Tuesday | 14:57:00 | Afternoon | 14 |
10261 | 537626 | 22771 | CLEAR DRAWER KNOB ACRYLIC EDWARDIAN | 12 | 2010-12-07 14:57:00 | 1.25 | 12347.0 | Iceland | 15.00 | 2010-12-07 | 2010 | 12 | 49 | Tuesday | 14:57:00 | Afternoon | 14 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
389322 | 581180 | 20719 | WOODLAND CHARLOTTE BAG | 10 | 2011-12-07 15:52:00 | 0.85 | 12347.0 | Iceland | 8.50 | 2011-12-07 | 2011 | 12 | 49 | Wednesday | 15:52:00 | Afternoon | 15 |
389323 | 581180 | 21265 | PINK GOOSE FEATHER TREE 60CM | 12 | 2011-12-07 15:52:00 | 1.95 | 12347.0 | Iceland | 23.40 | 2011-12-07 | 2011 | 12 | 49 | Wednesday | 15:52:00 | Afternoon | 15 |
389324 | 581180 | 23271 | CHRISTMAS TABLE SILVER CANDLE SPIKE | 16 | 2011-12-07 15:52:00 | 0.83 | 12347.0 | Iceland | 13.28 | 2011-12-07 | 2011 | 12 | 49 | Wednesday | 15:52:00 | Afternoon | 15 |
389325 | 581180 | 23506 | MINI PLAYING CARDS SPACEBOY | 20 | 2011-12-07 15:52:00 | 0.42 | 12347.0 | Iceland | 8.40 | 2011-12-07 | 2011 | 12 | 49 | Wednesday | 15:52:00 | Afternoon | 15 |
389326 | 581180 | 23508 | MINI PLAYING CARDS DOLLY GIRL | 20 | 2011-12-07 15:52:00 | 0.42 | 12347.0 | Iceland | 8.40 | 2011-12-07 | 2011 | 12 | 49 | Wednesday | 15:52:00 | Afternoon | 15 |
182 rows × 17 columns
print("The number of rows for the customer 12347 is {}".format(customer_12347.shape[0]))
The number of rows for the customer 12347 is 182
customer_12347 = df_ecommerce[df_ecommerce['CustomerID']==12347.0]
customer_12347.drop_duplicates(['InvoiceNo','CustomerID'],keep='first',inplace=True)
customer_12347
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Sales | Date | Year | Month | WeekOfYear | DayOfWeek | Time | Session | Hour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10257 | 537626 | 85116 | BLACK CANDELABRA T-LIGHT HOLDER | 12 | 2010-12-07 14:57:00 | 2.10 | 12347.0 | Iceland | 25.20 | 2010-12-07 | 2010 | 12 | 49 | Tuesday | 14:57:00 | Afternoon | 14 |
42764 | 542237 | 84625A | PINK NEW BAROQUECANDLESTICK CANDLE | 24 | 2011-01-26 14:30:00 | 0.85 | 12347.0 | Iceland | 20.40 | 2011-01-26 | 2011 | 1 | 4 | Wednesday | 14:30:00 | Afternoon | 14 |
98127 | 549222 | 22376 | AIRLINE BAG VINTAGE JET SET WHITE | 4 | 2011-04-07 10:43:00 | 4.25 | 12347.0 | Iceland | 17.00 | 2011-04-07 | 2011 | 4 | 14 | Thursday | 10:43:00 | Morning | 10 |
152121 | 556201 | 23084 | RABBIT NIGHT LIGHT | 12 | 2011-06-09 13:01:00 | 2.08 | 12347.0 | Iceland | 24.96 | 2011-06-09 | 2011 | 6 | 23 | Thursday | 13:01:00 | Afternoon | 13 |
198255 | 562032 | 23308 | SET OF 60 VINTAGE LEAF CAKE CASES | 24 | 2011-08-02 08:48:00 | 0.55 | 12347.0 | Iceland | 13.20 | 2011-08-02 | 2011 | 8 | 31 | Tuesday | 08:48:00 | Morning | 8 |
311320 | 573511 | 23480 | MINI LIGHTS WOODLAND MUSHROOMS | 12 | 2011-10-31 12:25:00 | 3.75 | 12347.0 | Iceland | 45.00 | 2011-10-31 | 2011 | 10 | 44 | Monday | 12:25:00 | Morning | 12 |
389316 | 581180 | 23497 | CLASSIC CHROME BICYCLE BELL | 12 | 2011-12-07 15:52:00 | 1.45 | 12347.0 | Iceland | 17.40 | 2011-12-07 | 2011 | 12 | 49 | Wednesday | 15:52:00 | Afternoon | 15 |
print("The number of rows for the customer 12347 with unique InvoiceNo and CustomerID pair is {}"
.format(customer_12347.shape[0]))
The number of rows for the customer 12347 with unique InvoiceNo and CustomerID pair is 7
As we notice above, for the customer "12347.0", 182 products were bought but they are spread across just 7 unique InvoiceNo values. This shows that the purchase frequency for customer "12347.0" is 7 (after removing duplicates) and not 182 (without removing duplicates).
Hence, it is imperative that we remove the duplicates for the "CustomerID" and "InvoiceNo" pair.
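An equivalent way to get the same number without manually dropping duplicates is to count the unique invoice numbers per customer. A small sketch (frequency_alt is an illustrative name; the drop_duplicates approach is what is used below):
# sketch: unique invoice count per customer
frequency_alt = (df_ecommerce.groupby('CustomerID')['InvoiceNo']
                 .nunique()
                 .reset_index(name='Frequency'))
print(frequency_alt.head())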
df_copy_temp = df_ecommerce.copy()
df_copy_temp.drop_duplicates(['InvoiceNo','CustomerID'],keep='first',inplace=True)
frequency_df = df_copy_temp.groupby(by='CustomerID', as_index=False)['InvoiceNo'].count()
frequency_df.columns = ['CustomerID','Frequency']
frequency_df.head(5)
CustomerID | Frequency | |
---|---|---|
0 | 12346.0 | 1 |
1 | 12347.0 | 7 |
2 | 12348.0 | 4 |
3 | 12349.0 | 1 |
4 | 12350.0 | 1 |
For the Monetary Value, we shall calculate the total amount spent by each customer.
monetary_df = df_ecommerce.groupby(by='CustomerID', as_index=False)['Sales'].sum()
monetary_df.columns = ['CustomerID','Monetary_value']
monetary_df.head(5)
CustomerID | Monetary_value | |
---|---|---|
0 | 12346.0 | 77183.60 |
1 | 12347.0 | 4310.00 |
2 | 12348.0 | 1797.24 |
3 | 12349.0 | 1757.55 |
4 | 12350.0 | 334.40 |
monetary_df['Monetary_value'] = monetary_df['Monetary_value'].astype(np.int64)
monetary_df.head(5)
CustomerID | Monetary_value | |
---|---|---|
0 | 12346.0 | 77183 |
1 | 12347.0 | 4309 |
2 | 12348.0 | 1797 |
3 | 12349.0 | 1757 |
4 | 12350.0 | 334 |
rfm_df = recency_df.merge(frequency_df,on='CustomerID').merge(monetary_df,on='CustomerID')
rfm_df.set_index('CustomerID',inplace=True)
rfm_df.head(5)
Recency | Frequency | Monetary_value | |
---|---|---|---|
CustomerID | |||
12346.0 | 347 | 1 | 77183 |
12347.0 | 24 | 7 | 4309 |
12348.0 | 97 | 4 | 1797 |
12349.0 | 40 | 1 | 1757 |
12350.0 | 332 | 1 | 334 |
rfm_df.describe()
Recency | Frequency | Monetary_value | |
---|---|---|---|
count | 4339.000000 | 4339.000000 | 4339.000000 |
mean | 114.041484 | 4.271952 | 2047.727357 |
std | 100.007757 | 7.705493 | 8984.250070 |
min | 22.000000 | 1.000000 | 0.000000 |
25% | 39.000000 | 1.000000 | 306.000000 |
50% | 72.000000 | 2.000000 | 668.000000 |
75% | 163.500000 | 5.000000 | 1659.500000 |
max | 395.000000 | 210.000000 | 280206.000000 |
plt.figure(figsize=(20, 15))
plt.subplot(3, 3, 1)
sns.histplot(rfm_df['Recency'], kde=True)
plt.subplot(3, 3, 2)
sns.histplot(rfm_df['Frequency'], kde=True)
plt.subplot(3, 3, 3)
sns.histplot(rfm_df['Monetary_value'], kde=True);
rfm_df[['Recency','Frequency','Monetary_value']]
Recency | Frequency | Monetary_value | |
---|---|---|---|
CustomerID | |||
12346.0 | 347 | 1 | 77183 |
12347.0 | 24 | 7 | 4309 |
12348.0 | 97 | 4 | 1797 |
12349.0 | 40 | 1 | 1757 |
12350.0 | 332 | 1 | 334 |
... | ... | ... | ... |
18280.0 | 299 | 1 | 180 |
18281.0 | 202 | 1 | 80 |
18282.0 | 29 | 2 | 178 |
18283.0 | 25 | 16 | 2045 |
18287.0 | 64 | 3 | 1837 |
4339 rows × 3 columns
sns.pairplot(rfm_df[['Recency','Frequency','Monetary_value']],diag_kind='kde');
sns.heatmap(rfm_df[['Recency','Frequency','Monetary_value']].corr(),annot=True);
From the above charts, we notice that the Recency values are right-skewed: most customers have small Recency values, meaning they purchased from the online retail website recently, which is a good sign for the company. The Frequency and Monetary values are also right-skewed, indicating that most customers place orders infrequently and spend relatively little on the website.
However, the correlation chart shows a strong correlation between the Frequency and Monetary values, which is a good sign for the business: customers who buy frequently also tend to spend more money, which increases the company's sales revenue.
In this section, we shall discretize the values of Recency, Frequency and Monetary value to form customer segment groups. Each customer will be assigned a score of 1 to 3 for the recency, frequency and monetary value of their purchase data, which gives 3x3x3 = 27 possible groups in total. The higher the score, the better the RFM value.
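The bin edges used below are hand-picked from the quartiles shown in the describe() output above. As a hedged alternative sketch, quantile-based binning with pd.qcut can derive comparable cut points automatically; r_score, f_score and m_score are illustrative names, and the rank trick on Frequency avoids duplicate bin edges caused by the many customers with only 1 or 2 purchases.
# sketch: tertile-based scores as an alternative to hand-picked bins
# (Recency labels are reversed because a smaller Recency is better)
r_score = pd.qcut(rfm_df['Recency'], q=3, labels=[3, 2, 1])
f_score = pd.qcut(rfm_df['Frequency'].rank(method='first'), q=3, labels=[1, 2, 3])
m_score = pd.qcut(rfm_df['Monetary_value'], q=3, labels=[1, 2, 3])
print(pd.concat([r_score, f_score, m_score], axis=1).head())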
rfm_df.head()
Recency | Frequency | Monetary_value | |
---|---|---|---|
CustomerID | |||
12346.0 | 347 | 1 | 77183 |
12347.0 | 24 | 7 | 4309 |
12348.0 | 97 | 4 | 1797 |
12349.0 | 40 | 1 | 1757 |
12350.0 | 332 | 1 | 334 |
# binning the recency, frequency and monetary_value columns
recency_bins = [-1, 39, 165, 395]
frequency_bins = [-1, 3, 10, 210]
monetary_bins = [-1, 308, 1661, 280207]
rfm_df['recency_score'] = pd.cut(rfm_df['Recency'], recency_bins, labels = [3, 2, 1])
rfm_df['frequency_score'] = pd.cut(rfm_df['Frequency'], frequency_bins, labels = [1, 2, 3])
rfm_df['monetary_score'] = pd.cut(rfm_df['Monetary_value'], monetary_bins, labels = [1, 2, 3])
rfm_df
Recency | Frequency | Monetary_value | recency_score | frequency_score | monetary_score | |
---|---|---|---|---|---|---|
CustomerID | ||||||
12346.0 | 347 | 1 | 77183 | 1 | 1 | 3 |
12347.0 | 24 | 7 | 4309 | 3 | 2 | 3 |
12348.0 | 97 | 4 | 1797 | 2 | 2 | 3 |
12349.0 | 40 | 1 | 1757 | 2 | 1 | 3 |
12350.0 | 332 | 1 | 334 | 1 | 1 | 2 |
... | ... | ... | ... | ... | ... | ... |
18280.0 | 299 | 1 | 180 | 1 | 1 | 1 |
18281.0 | 202 | 1 | 80 | 1 | 1 | 1 |
18282.0 | 29 | 2 | 178 | 3 | 1 | 1 |
18283.0 | 25 | 16 | 2045 | 3 | 3 | 3 |
18287.0 | 64 | 3 | 1837 | 2 | 1 | 3 |
4339 rows × 6 columns
rfm_df.info()
<class 'pandas.core.frame.DataFrame'> Float64Index: 4339 entries, 12346.0 to 18287.0 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Recency 4339 non-null int64 1 Frequency 4339 non-null int64 2 Monetary_value 4339 non-null int64 3 recency_score 4339 non-null category 4 frequency_score 4339 non-null category 5 monetary_score 4339 non-null category dtypes: category(3), int64(3) memory usage: 148.6 KB
We shall now consolidate the R, F, and M scores into a single value by concatenating the three columns.
rfm_df['recency_score'] = rfm_df.recency_score.astype(str)
rfm_df['frequency_score'] = rfm_df.frequency_score.astype(str)
rfm_df['monetary_score'] = rfm_df.monetary_score.astype(str)
rfm_df['rfm_group'] = rfm_df['recency_score'].str.cat(rfm_df['frequency_score']).str.cat(rfm_df['monetary_score'])
rfm_df.reset_index(inplace=True)
ax = rfm_df.groupby(['rfm_group']).count()['CustomerID'].sort_values(ascending=False).plot(kind = 'bar', figsize = (20,5),
                                            title = 'Count of customers in each rfm group')
ax.set_xlabel('rfm groups')
ax.set_ylabel('Count');
# counting the number of customers in each of the rfm segment
rfm_groupby = rfm_df.groupby(['rfm_group'], as_index=False).agg(Count_of_customers = ('CustomerID','count'))
rfm_groupby = rfm_groupby.sort_values(by='Count_of_customers', ascending = False)
rfm_groupby.reset_index(inplace = True)
rfm_groupby.drop(columns = ['index'], axis = 1, inplace = True)
rfm_groupby.head()
rfm_group | Count_of_customers | |
---|---|---|
0 | 212 | 911 |
1 | 111 | 521 |
2 | 211 | 468 |
3 | 112 | 460 |
4 | 223 | 320 |
# creating a column showing the percentage of customers in each rfm group
rfm_groupby['Percentage_of_customers'] = rfm_groupby['Count_of_customers']/rfm_groupby['Count_of_customers'].sum()*100
rfm_groupby
rfm_group | Count_of_customers | Percentage_of_customers | |
---|---|---|---|
0 | 212 | 911 | 20.995621 |
1 | 111 | 521 | 12.007375 |
2 | 211 | 468 | 10.785895 |
3 | 112 | 460 | 10.601521 |
4 | 223 | 320 | 7.374971 |
5 | 222 | 293 | 6.752708 |
6 | 323 | 273 | 6.291772 |
7 | 333 | 251 | 5.784743 |
8 | 312 | 230 | 5.300761 |
9 | 322 | 215 | 4.955059 |
10 | 311 | 105 | 2.419912 |
11 | 213 | 86 | 1.982024 |
12 | 233 | 79 | 1.820696 |
13 | 122 | 39 | 0.898825 |
14 | 113 | 33 | 0.760544 |
15 | 313 | 23 | 0.530076 |
16 | 123 | 12 | 0.276561 |
17 | 321 | 5 | 0.115234 |
18 | 121 | 4 | 0.092187 |
19 | 221 | 4 | 0.092187 |
20 | 133 | 4 | 0.092187 |
21 | 332 | 2 | 0.046094 |
22 | 232 | 1 | 0.023047 |
# creating a column mapping each rfm group to a named customer segment
rfm_groupby['Customer_segment'] = ""
rfm_groupby.loc[rfm_groupby.rfm_group.isin(['333']), "Customer_segment"] = "Champions"
rfm_groupby.loc[rfm_groupby.rfm_group.isin(['211','311']), "Customer_segment"] = "Rookies - Newest Customers"
rfm_groupby.loc[rfm_groupby.rfm_group.isin(['313','312']), "Customer_segment"] = "New Big Spenders"
rfm_groupby.loc[rfm_groupby.rfm_group.isin(['322','323','332','223','233']), "Customer_segment"] = "Potential Loyalists"
rfm_groupby.loc[rfm_groupby.rfm_group.isin(['133','123']), "Customer_segment"] = "At Risk Customers"
rfm_groupby.loc[rfm_groupby.rfm_group.isin(['212']), "Customer_segment"] = "Average Customers"
rfm_groupby.loc[rfm_groupby.rfm_group.isin(['222']), "Customer_segment"] = "Can’t Lose Them"
rfm_groupby.loc[rfm_groupby.rfm_group.isin(['111','112','121']), "Customer_segment"] = "Churned Customers"
rfm_groupby.loc[rfm_groupby.rfm_group.isin(['213','122','113','321','221','232','112','121']), "Customer_segment"] = "Others"
rfm_groupby
rfm_group | Count_of_customers | Percentage_of_customers | Customer_segment | |
---|---|---|---|---|
0 | 212 | 911 | 20.995621 | Average Customers |
1 | 111 | 521 | 12.007375 | Churned Customers |
2 | 211 | 468 | 10.785895 | Rookies - Newest Customers |
3 | 112 | 460 | 10.601521 | Others |
4 | 223 | 320 | 7.374971 | Potential Loyalists |
5 | 222 | 293 | 6.752708 | Can’t Lose Them |
6 | 323 | 273 | 6.291772 | Potential Loyalists |
7 | 333 | 251 | 5.784743 | Champions |
8 | 312 | 230 | 5.300761 | New Big Spenders |
9 | 322 | 215 | 4.955059 | Potential Loyalists |
10 | 311 | 105 | 2.419912 | Rookies - Newest Customers |
11 | 213 | 86 | 1.982024 | Others |
12 | 233 | 79 | 1.820696 | Potential Loyalists |
13 | 122 | 39 | 0.898825 | Others |
14 | 113 | 33 | 0.760544 | Others |
15 | 313 | 23 | 0.530076 | New Big Spenders |
16 | 123 | 12 | 0.276561 | At Risk Customers |
17 | 321 | 5 | 0.115234 | Others |
18 | 121 | 4 | 0.092187 | Others |
19 | 221 | 4 | 0.092187 | Others |
20 | 133 | 4 | 0.092187 | At Risk Customers |
21 | 332 | 2 | 0.046094 | Potential Loyalists |
22 | 232 | 1 | 0.023047 | Others |
print(rfm_groupby.groupby(['Customer_segment']).sum()['Percentage_of_customers'].sort_values(ascending=False))
ax = rfm_groupby.groupby(['Customer_segment']).sum()['Percentage_of_customers'].sort_values(ascending=False).plot(kind = 'bar',
                                            figsize = (20,5),
                                            title = 'Percentage of customers in each of the customer segments')
ax.set_xlabel('customer segments')
ax.set_ylabel('Percentage');
Customer_segment Average Customers 20.995621 Potential Loyalists 20.488592 Others 14.565568 Rookies - Newest Customers 13.205808 Churned Customers 12.007375 Can’t Lose Them 6.752708 New Big Spenders 5.830837 Champions 5.784743 At Risk Customers 0.368749 Name: Percentage_of_customers, dtype: float64
The following are the types of customers based on our RFM analysis and the corresponding business recommendations to improve the business.
Champions
Who They Are: Highly engaged customers who have bought most recently, most often, and generated the most revenue. This segment consists of potentially high-value customers. They likely generate a disproportionately high percentage of overall revenue, so keeping them happy should be a top priority.
Marketing Strategies: Communications with this group should make them feel valued and appreciated, and we need to reward these customers. They can become early adopters for new products or new feature releases and will help promote the brand. Encouraging them to share our products with their friends and family through a "Referral Program" will help increase conversion rates. Focus on loyalty programs for them. These customers have proven to have a higher willingness to pay, so instead of using discount pricing to generate incremental sales, we should focus on value-added offers through product recommendations based on previous purchases. Further analyzing their individual preferences and affinities will provide additional opportunities for even more personalized messaging.
Potential Loyalists
Who They Are: The customers in these RFM groups are recent customers with average frequency who spent a good amount.
Marketing Strategies: This group has the potential to become Champions. These are highly promising customers who need to be taken care of: offering annual or quarterly membership programs with additional benefits, loyalty programs, or recommendations of related products to upsell them may help them become our Champions. Loyalty programs are effective for these repeat visitors, and advocacy programs and reviews are also common strategies. Lastly, consider rewarding these customers with free shipping or similar benefits, so that they shop more frequently and spend more.
Rookies - Newest Customers
Who They Are: Customers who visited the website recently, but do not visit it frequently or spend a lot.
Marketing Strategies: These are the customers who browse the products without making many purchases; they mostly look at the products rather than buy them, and even when they do buy, they are only interested in low-cost items. Hence, providing them discounts to encourage buying is the way to attract these customers. Having clear strategies in place for first-time buyers, such as triggered welcome emails, will pay dividends.
At Risk Customers
Who They Are: Great past customers who haven't bought in a while. They are customers who purchased often and spent big amounts, but haven't purchased recently.
Marketing Strategies: These customers are at risk of churning and we must bring them back with relevant promotions. Customers leave for a variety of reasons, but one major advantage with this group is that they have visited the website many times in the past, so we have a good record of the products they are interested in. Hence, we need to send them personalized reactivation campaigns to reconnect, offer renewal discounts, and suggest helpful products to encourage more recent visits to the website. Suggesting a "Referral Program" and an "Annual Membership Program" can help prevent these customers from churning, as they were frequent and high spenders in the past.
Average Customers
Who They Are: These customers have purchased fairly recently and their spending is average, but their purchase frequency is relatively low.
Marketing Strategies: Considering the huge customer base, this group can have large potential values and we need to incentivize them to spend more. For example, we can reward them with special discount offers, free shipping or other benefits.
New Big Spenders
Who They Are: These customers have a high overall RFM score based on very recent visits and high spending, but they are not frequent shoppers.
Marketing Strategies: We need to start building relationships with these customers by providing onboarding support and special offers to increase the frequency of their visits. This segment consists of potentially high-valued customers because they spend more and also have visited the site recently. Since they have purchased very recently with high spending amount, as long as we increase their frequency, they will become the best customers with high loyalty. They need to be cultivated by the brand to become long-term loyalists. We can welcome them with personalized email with a coupon code to encourage repeat purchases. We can also enroll these customers into a loyalty or membership program that rewards order frequency.
Can’t Lose Them
Who They Are: These are customers who used to visit and purchase quite often, and spend a decent amount of money.
Marketing Strategies: The main aim with respect to these customers is to keep them coming back to the website for more purchases. We may run surveys to find out what could be improved about their experience and avoid losing them to a competitor by providing timely offers on relevant products and free-shipping benefits.
Churned Customers
Who They Are: They probably bought once or only a few times, and spent very little.
Marketing Strategies: Give lost customers an incentive to come back. We can send a win-back campaign directly to inactive customers with tagline phrases like "Haven’t seen you in a while", "Let’s catch up", or "Come back and receive [promotion details]" to encourage them to return. Another way is to address common complaints through a social media campaign and create visibility of a productive change in process.
Based on the above percentages of customer segments, we can say that the business has huge potential for growth, because the Average Customers and Potential Loyalists together make up more than 40% of the customers. With the right marketing campaigns, as discussed above, these customers can be encouraged to make more visits and purchases on the website.
rfm_df
CustomerID | Recency | Frequency | Monetary_value | recency_score | frequency_score | monetary_score | rfm_group | |
---|---|---|---|---|---|---|---|---|
0 | 12346.0 | 347 | 1 | 77183 | 1 | 1 | 3 | 113 |
1 | 12347.0 | 24 | 7 | 4309 | 3 | 2 | 3 | 323 |
2 | 12348.0 | 97 | 4 | 1797 | 2 | 2 | 3 | 223 |
3 | 12349.0 | 40 | 1 | 1757 | 2 | 1 | 3 | 213 |
4 | 12350.0 | 332 | 1 | 334 | 1 | 1 | 2 | 112 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
4334 | 18280.0 | 299 | 1 | 180 | 1 | 1 | 1 | 111 |
4335 | 18281.0 | 202 | 1 | 80 | 1 | 1 | 1 | 111 |
4336 | 18282.0 | 29 | 2 | 178 | 3 | 1 | 1 | 311 |
4337 | 18283.0 | 25 | 16 | 2045 | 3 | 3 | 3 | 333 |
4338 | 18287.0 | 64 | 3 | 1837 | 2 | 1 | 3 | 213 |
4339 rows × 8 columns
rfm_df_copy = rfm_df.copy()
rfm_df_copy = rfm_df_copy[['Recency','Frequency','Monetary_value']]
rfm_df_copy
Recency | Frequency | Monetary_value | |
---|---|---|---|
0 | 347 | 1 | 77183 |
1 | 24 | 7 | 4309 |
2 | 97 | 4 | 1797 |
3 | 40 | 1 | 1757 |
4 | 332 | 1 | 334 |
... | ... | ... | ... |
4334 | 299 | 1 | 180 |
4335 | 202 | 1 | 80 |
4336 | 29 | 2 | 178 |
4337 | 25 | 16 | 2045 |
4338 | 64 | 3 | 1837 |
4339 rows × 3 columns
sns.pairplot(rfm_df_copy[['Recency','Frequency','Monetary_value']],diag_kind='kde');
plt.figure(figsize=(20, 15))
plt.subplot(3, 3, 1)
sns.histplot(rfm_df_copy['Recency'], kde=True)
plt.subplot(3, 3, 2)
sns.histplot(rfm_df_copy['Frequency'], kde=True)
plt.subplot(3, 3, 3)
sns.histplot(rfm_df_copy['Monetary_value'], kde=True);
K-Means is sensitive to skewed feature distributions and to features on very different scales, so it works best when the inputs are roughly symmetric and standardized. Hence I considered the following transformations to reduce the skewness.
I found that for "Recency" and "Frequency" the Box-Cox transformation worked well in eliminating skewness, while for the "Monetary value" the cube-root transformation worked better.
# data transformation to remove skewness
from scipy import stats
rfm_skew_adj = pd.DataFrame()
rfm_skew_adj["Recency"] = stats.boxcox(rfm_df_copy['Recency'])[0]
rfm_skew_adj["Frequency"] = stats.boxcox(rfm_df_copy['Frequency'])[0]
rfm_skew_adj["Monetary_value"] = pd.Series(np.cbrt(rfm_df_copy['Monetary_value'])).values
rfm_skew_adj.tail()
Recency | Frequency | Monetary_value | |
---|---|---|---|
4334 | 3.234700 | 0.000000 | 5.646216 |
4335 | 3.119030 | 0.000000 | 4.308869 |
4336 | 2.371921 | 0.603042 | 5.625226 |
4337 | 2.300439 | 1.653490 | 12.693005 |
4338 | 2.715833 | 0.883793 | 12.247188 |
rfm_df_copy[['Recency','Frequency','Monetary_value']].skew()
Recency 1.246137 Frequency 12.100028 Monetary_value 19.341434 dtype: float64
rfm_skew_adj[['Recency','Frequency','Monetary_value']].skew()
Recency 0.074741 Frequency 0.144932 Monetary_value 2.998136 dtype: float64
Based on the comparison above, we can see that the skewness is much lower in the transformed rfm_skew_adj data than in the raw rfm_df_copy values of "Recency", "Frequency", and "Monetary value".
# using StandardScaler to scale the columns
# Import library
from sklearn.preprocessing import StandardScaler
# Initialize the Object
scaler = StandardScaler()
# Fit and Transform The Data
scaler.fit(rfm_skew_adj)
rfm_skew_adj_norm = scaler.transform(rfm_skew_adj)
# Assert that it has mean 0 and variance 1
print("The mean is {}".format(rfm_skew_adj_norm.mean(axis = 0).round(2)) )
print("The std is {}".format(rfm_skew_adj_norm.std(axis = 0).round(2)))
The mean is [0. 0. 0.] The std is [1. 1. 1.]
from sklearn.cluster import KMeans
sse = {}
for k in range(1, 11):
kmeans = KMeans(n_clusters=k, random_state=51)
kmeans.fit(rfm_skew_adj_norm)
sse[k] = kmeans.inertia_ # SSE to closest cluster centroid
plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('SSE')
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()
Based on the above graph, we can see that the SSE (sum of squared errors) decreases as we increase the number of clusters. The "elbow", where the curve starts to flatten, is the usual heuristic for choosing the number of clusters, but the final choice is up to us. We shall build two models now, one with 3 clusters and another with 8 clusters.
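Besides eyeballing the elbow, the silhouette score is another common criterion for comparing candidate values of k. A hedged sketch using scikit-learn's silhouette_score on the scaled data (the loop variables are illustrative); higher scores indicate better-separated, more compact clusters.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# silhouette score for a range of candidate cluster counts
for k in range(2, 11):
    labels = KMeans(n_clusters=k, random_state=51).fit_predict(rfm_skew_adj_norm)
    print(k, round(silhouette_score(rfm_skew_adj_norm, labels), 3))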
model = KMeans(n_clusters=3, random_state=42)
model.fit(rfm_skew_adj_norm)
model.labels_.shape
(4339,)
rfm_df_copy["Cluster"] = model.labels_
rfm_df_copy.groupby('Cluster').agg({
'Recency':'mean',
'Frequency':'mean',
'Monetary_value':['mean', 'count']}).round(2)
Recency | Frequency | Monetary_value | ||
---|---|---|---|---|
mean | mean | mean | count | |
Cluster | ||||
0 | 190.86 | 1.30 | 410.26 | 1857 |
1 | 63.24 | 3.57 | 1104.37 | 1764 |
2 | 40.15 | 13.68 | 8600.47 | 718 |
# Create the dataframe
df_normalized = pd.DataFrame(rfm_skew_adj_norm, columns=['Recency', 'Frequency', 'Monetary_value'])
df_normalized['ID'] = rfm_df_copy.index
df_normalized['Cluster'] = model.labels_
# Melt The Data
df_nor_melt = pd.melt(df_normalized.reset_index(),
id_vars=['ID', 'Cluster'],
value_vars=['Recency','Frequency','Monetary_value'],
var_name='Attribute',
value_name='Value')
df_nor_melt.head()
# Visualize it
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.lineplot(data=df_nor_melt, x='Attribute', y='Value', hue='Cluster');
model = KMeans(n_clusters=8, random_state=42)
model.fit(rfm_skew_adj_norm)
model.labels_.shape
(4339,)
rfm_df_copy["Cluster"] = model.labels_
rfm_df_copy.groupby('Cluster').agg({
'Recency':'mean',
'Frequency':'mean',
'Monetary_value':['mean', 'count']}).round(2)
Recency | Frequency | Monetary_value | ||
---|---|---|---|---|
mean | mean | mean | count | |
Cluster | ||||
0 | 45.00 | 2.67 | 689.31 | 724 |
1 | 264.88 | 1.00 | 319.66 | 835 |
2 | 47.63 | 47.00 | 71716.24 | 38 |
3 | 34.26 | 16.11 | 7608.05 | 303 |
4 | 67.63 | 1.00 | 390.91 | 656 |
5 | 88.77 | 5.54 | 2339.34 | 483 |
6 | 171.91 | 2.54 | 693.88 | 712 |
7 | 32.72 | 6.74 | 2054.88 | 588 |
# Create the dataframe
df_normalized = pd.DataFrame(rfm_skew_adj_norm, columns=['Recency', 'Frequency', 'Monetary_value'])
df_normalized['ID'] = rfm_df_copy.index
df_normalized['Cluster'] = model.labels_
# Melt The Data
df_nor_melt = pd.melt(df_normalized.reset_index(),
id_vars=['ID', 'Cluster'],
value_vars=['Recency','Frequency','Monetary_value'],
var_name='Attribute',
value_name='Value')
df_nor_melt.head()
# Visualize it
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.lineplot(data=df_nor_melt, x='Attribute', y='Value', hue='Cluster');
With the help of domain knowledge, we would be able to interpret the clusters better to know which customers we need to target for specific use cases and promotional deals.
We analyzed the online retail dataset to perform customer segmentation. Using RFM scores, we segmented the customers into 8 major categories (Champions, Potential Loyalists, Rookies - Newest Customers, At Risk Customers, Average Customers, New Big Spenders, Can’t Lose Them, and Churned Customers) and discussed potential marketing strategies for each customer segment. We also implemented the K-Means clustering algorithm on the transformed RFM values as an alternative way to perform customer segmentation.
In conclusion, by coupling our analysis with the insights from some domain knowledge experts, we would be able to make the best out of our data.
References: I referred to various articles on customer segmentation from websites such as Medium, Analytics Vidhya, and Stack Overflow, and used the FB Prophet documentation for the time series forecasting.
I enjoyed working on this case study. Thank you very much for the opportunity!