import pandas as pd
from matplotlib import pyplot as plt
import nltk
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from factor_analyzer import FactorAnalyzer
from sklearn.decomposition import FactorAnalysis
import random
import seaborn as sns
random.seed(30)
Create the Fake Customer Purchase data
We create two fake data sets, each with a discernible structure, but we don’t want the models to have too easy a job, so we sample again from these data, pairing rows from both sets as if each record were a customer and the product they purchased.
X, y = make_blobs(n_samples=10000,
                  centers=5,
                  n_features=6,
                  random_state=0
                  )
prods = ['product_desc_' + str(x) for x in range(0, 6)]
df_products = pd.DataFrame(data=X, columns=prods)
df_products['Product_Class'] = y
df_products['Product_ID'] = df_products.index
X, y = make_blobs(n_samples=1000,
                  centers=10,
                  n_features=8,
                  random_state=0)
custs = ['customer_desc_' + str(x) for x in range(0, 8)]
df_customer = pd.DataFrame(data=X,
                           columns=custs)
df_customer['Customer_Class'] = y
df_customer['Customer_ID'] = df_customer.index
# Randomly select some of the customers to pair with random purchases
day1 = pd.DataFrame(zip([random.randint(0, 1000) for x in range(0, 500)],
                        [random.randint(0, 10000) for x in range(0, 500)]),
                    columns=['Customer_ID', 'Product_ID'])

day2 = pd.DataFrame(zip([random.randint(0, 1000) for x in range(0, 500)],
                        [random.randint(0, 10000) for x in range(0, 500)]),
                    columns=['Customer_ID', 'Product_ID'])

purchases = pd.concat([day1, day2],
                      axis=0,
                      ignore_index=True)
purchases.head()
  | Customer_ID | Product_ID |
---|---|---|
0 | 552 | 773 |
1 | 827 | 8608 |
2 | 296 | 518 |
3 | 625 | 5394 |
4 | 30 | 8543 |
df_purchases = None
for purchase in range(0, len(purchases)):
    cust_id = purchases['Customer_ID'][purchase]
    prod_id = purchases['Product_ID'][purchase]
    cust = df_customer[df_customer['Customer_ID'] == cust_id]
    cust.reset_index(inplace=True, drop=True)
    prod = df_products[df_products['Product_ID'] == prod_id]
    prod.reset_index(inplace=True, drop=True)
    # pair the product attributes with the customer attributes side by side
    temp = pd.concat([prod, cust], axis=1)
    if df_purchases is None:
        df_purchases = temp
    else:
        # DataFrame.append() is deprecated in recent pandas, so stack rows with concat
        df_purchases = pd.concat([df_purchases, temp])

df_purchases.reset_index(inplace=True, drop=True)
df_purchases
  | product_desc_0 | product_desc_1 | product_desc_2 | product_desc_3 | product_desc_4 | product_desc_5 | Product_Class | Product_ID | customer_desc_0 | customer_desc_1 | customer_desc_2 | customer_desc_3 | customer_desc_4 | customer_desc_5 | customer_desc_6 | customer_desc_7 | Customer_Class | Customer_ID |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.233276 | 8.747108 | 9.165746 | -0.007541 | 4.652380 | 1.913970 | 1 | 773 | -6.374916 | -2.257586 | 7.688019 | -7.992522 | 7.593467 | -9.193683 | 10.376847 | -0.388405 | 8.0 | 552.0 |
1 | -0.170403 | 8.219852 | 9.710727 | -1.550177 | 5.811414 | 1.492459 | 1 | 8608 | 9.294317 | -2.231715 | 6.061894 | -0.438840 | 1.246116 | 8.820684 | -9.950039 | -7.391761 | 1.0 | 827.0 |
2 | -2.822567 | 8.036298 | 10.929105 | -0.935760 | 6.077699 | 1.265250 | 1 | 518 | -8.793580 | 4.805865 | 6.167272 | 5.770659 | 7.451190 | 4.144325 | 1.196351 | 5.414350 | 2.0 | 296.0 |
3 | 1.019780 | 6.766354 | -9.152836 | -8.611808 | -8.749707 | 6.637419 | 2 | 5394 | -7.047638 | -2.131113 | 5.768863 | -8.094387 | 6.223832 | -8.834962 | 9.629384 | -1.935227 | 8.0 | 625.0 |
4 | 4.321656 | 7.430975 | 8.847703 | 5.921750 | -0.868445 | 5.483210 | 3 | 8543 | -0.169121 | 1.346768 | -10.211423 | 1.709859 | 1.655568 | 1.891809 | 7.856076 | 4.333816 | 4.0 | 30.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | 2.151425 | 3.179085 | 2.337353 | 0.559543 | -1.629433 | 2.493002 | 0 | 8383 | -4.926378 | -1.474424 | 2.848533 | -10.733950 | 4.237231 | 5.048239 | -5.263423 | -7.005510 | 5.0 | 272.0 |
996 | 0.475735 | 4.378010 | 2.597665 | 1.569648 | -1.131376 | 3.110801 | 0 | 6141 | 7.821350 | -1.948967 | 6.039587 | 1.739432 | 2.351800 | 8.325224 | -10.263796 | -7.450850 | 1.0 | 616.0 |
997 | 4.621406 | 7.791088 | 8.592775 | 6.423716 | 0.915721 | 4.727899 | 3 | 406 | -9.236352 | 2.760513 | -7.290240 | 9.169618 | -0.188279 | -2.443252 | -4.153840 | 6.140598 | 3.0 | 266.0 |
998 | -1.720297 | 8.174031 | 9.746428 | -1.849028 | 5.046019 | 0.951072 | 1 | 9737 | -1.411479 | 0.830250 | 5.167232 | -9.188249 | 3.176105 | 4.570698 | -6.300042 | -7.561958 | 5.0 | 263.0 |
999 | 2.804878 | 4.882689 | -0.262516 | 0.311810 | -1.867281 | 4.639076 | 0 | 3486 | -6.696620 | 1.945182 | -7.520248 | 8.545794 | 3.743541 | -3.216961 | -4.505348 | 3.400242 | 3.0 | 589.0 |
1000 rows × 18 columns
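As an aside, the same pairing can be built without the row-by-row loop by joining the purchase log onto the product and customer tables via their ID columns. A minimal sketch of that alternative follows (the name df_purchases_alt is illustrative; the result should match the loop output up to column ordering):

# Sketch: vectorised alternative to the loop above — left-join the purchase
# log onto the product and customer attribute tables on their ID columns.
df_purchases_alt = (purchases
                    .merge(df_products, on='Product_ID', how='left')
                    .merge(df_customer, on='Customer_ID', how='left'))
df_purchases_alt.head()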
Preliminary Plotting: Do the Factors Separate the Structure?
A worthwhile check with any dimensionality reduction technique (such as PCA or factor analysis) is to see whether, and how, the data separates when plotted in the reduced space. In lieu of knowledge about the data we can always compare this representation against the output of a clustering algorithm.
cust_desc = [x for x in df_purchases.columns if 'customer_desc' in x]
X = df_customer[cust_desc].copy()  # copy to avoid SettingWithCopyWarning when adding columns
kmeans = KMeans(init='k-means++',
                n_clusters=3,
                n_init=30
                )
kmeans.fit(X)
clusters = kmeans.predict(X)
X['cluster'] = clusters

sklearn_fa = FactorAnalysis(n_components=2,
                            rotation='varimax'
                            )
Y_fa = pd.DataFrame(sklearn_fa.fit_transform(X[cust_desc]))
X = pd.concat([X, Y_fa], axis=1)
X[[0, 1, 'cluster']].plot.scatter(x=0,
                                  y=1,
                                  c='cluster',
                                  colormap='viridis')
plt.title("Factor Analysis Representation - Coloured by Inferred Clusters")
plt.xlabel("Factor 1")
plt.ylabel("Factor 2")
plt.style.use('default')
plt.show()
We can see here that the factors do a pretty good job of separating classes 0 and 1, but mix up classes 1 and 2. In addition, we can see seven distinct clusters in the factor analysis representation, which suggests that our choice of three clusters is too low.
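One way to sanity-check the number of clusters is the silhouette score (imported above but not used so far). The following is a minimal sketch that scans a few candidate values of k over the customer description features; the range of k is an illustrative assumption:

# Sketch: compare average silhouette scores across candidate cluster counts;
# higher scores indicate better-separated clusters.
for k in range(2, 11):
    km = KMeans(init='k-means++', n_clusters=k, n_init=30)
    labels = km.fit_predict(df_customer[cust_desc])
    print(k, silhouette_score(df_customer[cust_desc], labels))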
Choosing the number of Factors
One suggestive way to determine the number of factors we should extract is to build the scree plot of the eigenvalues and select the number of factors with comparatively large eigenvalues (a common rule of thumb keeps only those above 1).
import numpy as np

feature_names = ['customer_desc_' + str(x) for x in range(0, 8)]

fa = FactorAnalyzer(n_factors=3)
fa.fit(X[feature_names])
ev, v = fa.get_eigenvalues()
plt.plot(range(1, X[feature_names].shape[1] + 1), ev)
plt.show()
On the basis of this plot we should probably choose no more than two factors, but we’ll continue with three for purposes of illustration. Each observed feature is modelled as a linear combination of the factors, so we may interpret the newly created factors by observing which of the observed features play a greater role in (i.e. load more heavily on) each of them.
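Concretely, the common factor model writes each observed feature as a linear combination of the factors plus a feature-specific error, e.g. with three factors

\(x_i = l_{i,1}F_1 + l_{i,2}F_2 + l_{i,3}F_3 + \epsilon_i\)

so a feature with a large loading \(l_{i,k}\) is largely explained by factor \(F_k\), and that factor inherits its interpretation from such features.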
fa_loading_matrix = pd.DataFrame(fa.loadings_,
                                 columns=['FA{}'.format(i) for
                                          i in range(1, 3 + 1)],
                                 index=feature_names)
fa_loading_matrix['Highest_loading'] = fa_loading_matrix.idxmax(axis=1)
fa_loading_matrix = fa_loading_matrix.sort_values('Highest_loading')
fa_loading_matrix
  | FA1 | FA2 | FA3 | Highest_loading |
---|---|---|---|---|
customer_desc_1 | 0.727724 | 0.024771 | 0.091443 | FA1 |
customer_desc_3 | 0.826998 | 0.068252 | 0.014691 | FA1 |
customer_desc_5 | 0.637687 | -0.003234 | 0.489801 | FA1 |
customer_desc_7 | 0.787048 | 0.081877 | -0.414776 | FA1 |
customer_desc_4 | 0.015911 | 1.001115 | 0.175406 | FA2 |
customer_desc_6 | -0.172587 | 0.077989 | -0.684129 | FA2 |
customer_desc_0 | -0.229668 | -0.628541 | 0.355323 | FA3 |
customer_desc_2 | -0.304156 | 0.217303 | 0.440891 | FA3 |
We can see here that there is probably only one sensible factor to be derived from our dataset.
import seaborn as sns

plt.figure(figsize=(25, 5))

# plot the heatmap of the factor loading matrix
ax = sns.heatmap(fa_loading_matrix.drop('Highest_loading', axis=1).T,
                 vmin=-1, vmax=1, center=0,
                 cmap=sns.diverging_palette(220, 20, n=200),
                 square=True, annot=True, fmt='.2f')

ax.set_yticklabels(
    ax.get_yticklabels(),
    rotation=0);
communalities = pd.DataFrame(fa.get_communalities(),
                             index=list(feature_names),
                             columns=['Communality'])
features_comm = list(communalities[communalities['Communality'] > 0.33].index)
print('Total variables/features with communalities >0.33: {}'.format(len(features_comm)))
communalities
Total variables/features with communalities >0.33: 8
  | Communality |
---|---|
customer_desc_0 | 0.574065 |
customer_desc_1 | 0.538558 |
customer_desc_2 | 0.334116 |
customer_desc_3 | 0.688800 |
customer_desc_4 | 1.033252 |
customer_desc_5 | 0.646560 |
customer_desc_6 | 0.503901 |
customer_desc_7 | 0.798188 |
Testing the Validity of a Hypothetical Factor
Cronbach’s alpha is a statistical measure of the reliability (internal consistency) of a proposed factor, derived from the covariance matrix of the features which make up the proposed latent factor. A Cronbach’s alpha closer to 1 is desirable.
\(\alpha = \frac{K}{K-1}\left(1-\frac{\sum \sigma^2_{x_i}}{\sigma^2_T}\right)\)
where
\(\sigma^2_T = \sum \sigma^2_{x_i} + 2 \sum_{i < j}^K {\rm cov}(x_i,x_j)\)
i.e. the total variance is a combination of the individual item variances and the inter-item covariances of the observed variables. This ties the measure to the factor analysis model, since the covariances can be re-expressed in terms of the factor loadings.
\(\sigma^2_T = \sum \sigma^2_{x_i} + 2 \sum_{i < j}^K l_{i, 1} l_{j, 1}\)

since, for a one-factor model, \({\rm cov}(x_i, x_j) = l_{i,1} l_{j,1}\) because the error terms \(\epsilon_i\) are uncorrelated with each other and with the factor. This means that if the factor loadings are fairly high relative to the random components of the variance, the covariance terms dominate the total variance and alpha comes close to one. Conversely, low loadings leave the total variance dominated by the individual item variances, which drags alpha down towards zero (or below, if the items are negatively correlated).
import numpy as np

def CronbachAlpha(observed_measures):
    # rows are the items making up the proposed factor, columns are observations
    observed_measures = np.asarray(observed_measures)
    sample_vars = observed_measures.var(axis=1, ddof=1)
    total_scores = observed_measures.sum(axis=0)
    nitems = len(observed_measures)

    return nitems / (nitems - 1.) * (1 - sample_vars.sum() / total_scores.var(ddof=1))

# Collate the observed features proposed for each factor
factor1 = [X['customer_desc_1'], X['customer_desc_3'],
           X['customer_desc_5'], X['customer_desc_7']]

factor2 = [X['customer_desc_4'], X['customer_desc_0']]

factor3 = [X['customer_desc_5'], X['customer_desc_6'],
           X['customer_desc_2']]

# Get Cronbach's alpha for each proposed factor
factor1_alpha = CronbachAlpha(factor1)
factor2_alpha = CronbachAlpha(factor2)
factor3_alpha = CronbachAlpha(factor3)

print(factor1_alpha,
      factor2_alpha,
      factor3_alpha)
0.7889764629492505 -3.382959230225132 -0.8158399732248119
This shows, as expected, that only one of the proposed factors is sensible.