
In [ ]: #A TUTORIAL PYTHON CODE ABOUT PRINCIPAL COMPONENT ANALYSIS AND LOGISTIC REGRESSION:

#APPLICATION TO ANALYSIS AND CLASSIFICATION OF ROCK SAMPLES.


#
#Paolo Dell'Aversana - January 2019.

In [ ]: #Remark: a basic knowledge of Python is required.

In [ ]: # INTRODUCTION
# Principal Component Analysis is a statistical approach that converts
# a set of observations of possibly correlated variables (using an orthogonal
# transformation) into a set of values of linearly uncorrelated variables
# (Jolliffe, 2002).
# These are called “principal components”. The optimal matrix transformation is defined in
# such a way that the first principal component has the largest possible variance.
# This means that it accounts for as much of the variability in the data as possible.
# Each succeeding component in turn has the highest variance possible under the constraint
# that it is orthogonal to the preceding components.
# In this tutorial, I explain how to build, step by step, an efficient Python workflow
# for PCA of a data set of magmatic rock samples characterized by different
# oxide contents. Finally, I show how to train a Logistic Regression classifier
# using the results of PCA. The logistic regression algorithm performs efficiently on
# the feature subspace obtained through PCA. I modified and readapted the code written by
# Raschka and Mirjalili (2017) for the specific purposes of rock sample analysis.
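
In [ ]: #Optional illustrative sketch (not part of the rock dataset): PCA on a tiny synthetic
#2-D dataset, just to visualize the idea of an orthogonal transformation whose first
#component carries the largest share of the variance. All variable names here are arbitrary.
import numpy as np

rng = np.random.RandomState(0)
#two correlated synthetic variables
toy = rng.randn(200, 2) @ np.array([[3.0, 0.0], [1.0, 0.5]])
toy = toy - toy.mean(axis=0)

cov = np.cov(toy.T)
vals, vecs = np.linalg.eigh(cov)      #eigenvalues in ascending order
order = np.argsort(vals)[::-1]        #re-sort by decreasing variance
vals, vecs = vals[order], vecs[:, order]

print(vals / vals.sum())              #explained variance ratio, first component largest
print(np.round(vecs.T @ vecs, 6))     #~identity matrix: the components are orthogonal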

In [ ]: #We start this tutorial session by transforming our input data from xlsx to csv format.
#This is a useful preliminary step for those who prefer working with csv files
#but have their data available in Excel worksheets.

In [1]: #First, we import the module for reading the excel file:
import xlrd
#We can access the different sheets of the Excel file.
#We will take the sheet with our data set.
workbook = xlrd.open_workbook('CAL_large_test.xlsx')
workbook
workbook.sheet_names()
#Remark: here, 'CAL_large_test.xlsx' is our input file consisting of several rock samples
#characterized by different distributions of chemical features (oxides).

Out[1]: ['Table DR2']

In [ ]: #Now we can convert our input file from xlsx to csv format, using a function of pandas library.

In [3]: import pandas as pd


data_xls = pd.read_excel('CAL_large_test.xlsx', 'Table DR2', index_col=None)
#data_xls.to_csv('zzz.xlsx.csv', encoding='utf-8')
data_xls.to_csv('zzz.csv')
#Remark: here, "zzz.csv" is just a simple name given to our csv file.

In [2]: #After these preliminary operations, we can start the tutorial core:
#Unsupervised dimensionality reduction via Principal Component Analysis (Raschka and Mirjalili, 2017).
#We go through all the steps of the Principal Component Analysis (PCA).
#We first compute the PCA step by step (with pandas and NumPy), then we'll use a faster approach
#based on the dedicated PCA class of Scikit-learn.

In [4]: import pandas as pd


#Let us start by reading our input file and showing its first rows. We use pandas' read_csv
#function to read this file.
df_rock = pd.read_csv('zzz.csv')
#The commented line below is just in case we want to add a header (however, it is already in our file):
#df_rock.columns = ['Sample', 'Rock_type', 'SiO2', 'TiO2', 'Al2O3', 'Fe2O3', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O', 'P2O5']

df_rock.head()

Out[4]:
   Unnamed: 0      Sample  Rock_type  SiO2   TiO2  Al2O3  Fe2O3    MnO   MgO   CaO  Na2O   K2O   P2O5
0           0   SAR-00-08   andesite  58.9  1.133   16.5   5.85  0.074  2.25  5.20  5.06  2.89  0.557
1           1   SAR-00-07   andesite  62.9  0.828   16.5   4.97  0.075  1.97  4.35  4.76  2.99  0.343
2           2   SAR-00-13   andesite  60.6  1.015   16.4   5.59  0.082  2.24  4.82  4.77  3.00  0.473
3           3  COTA-05-15   andesite  58.9  1.222   16.9   6.56  0.082  2.88  5.64  4.76  2.41  0.482
4           4  COTA-05-06   andesite  59.5  1.169   16.7   6.09  0.077  2.54  5.24  4.58  2.70  0.490

In [ ]: #Let us print the number of rows and columns of our file

In [5]: df_rock.shape

Out[5]: (174, 13)

In [ ]: #Now we assign the 10 features to a NumPy array X, then we split our data set into train and test subsets.
#Using LabelEncoder, we transform the class labels from their original string representation into integers.

In [6]: from sklearn.model_selection import train_test_split


from sklearn.preprocessing import LabelEncoder
X, y = df_rock.iloc[:, 3:].values, df_rock.iloc[:, 2].values

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3,
                     stratify=y,
                     random_state=0)
le = LabelEncoder()
#We fit the encoder on the class labels; le.classes_ shows the rock classes as an array of strings:
y = le.fit_transform(y)
le.classes_

Out[6]: array(['andesite', 'basaltic andesite', 'dacite', 'rhyolite',


'trachyandesite'], dtype=object)

In [81]: #Now we encode the string classes into integers

In [7]: le.transform(['andesite', 'basaltic andesite', 'dacite', 'rhyolite', 'trachyandesite'])

Out[7]: array([0, 1, 2, 3, 4], dtype=int64)

In [ ]: #If we like, we can make a print of our train and test files, just to check ...

In [65]: #print(X_train)

In [64]: #print(X_test)

In [8]: #If we like, we can transform the format of our input data
import numpy as np
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

In [ ]: #Now we standardize the data. This operation is useful to make our instances comparable.
#Remark. If we have NaN's in our data, we can replace them with zeros (see the commented lines below).

In [ ]: #X_train = np.nan_to_num(X_train)
#X_test = np.nan_to_num(X_test)

In [9]: from sklearn.preprocessing import StandardScaler


sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
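
In [ ]: #Optional quick check (not in the original notebook): after standardization, each
#feature of the training set should have (approximately) zero mean and unit variance.
print(np.round(X_train_std.mean(axis=0), 3))
print(np.round(X_train_std.std(axis=0), 3))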

In [ ]: #Now we start the process of Principal Component Analysis (PCA).


#First, we compute the eigendecomposition of the covariance matrix.

In [10]: import numpy as np


cov_mat = np.cov(X_train_std.T)
eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)

print('\nEigenvalues \n%s' % eigen_vals)

Eigenvalues
[6.78285831 1.53376255 0.85976428 0.42218026 0.22690364 0.09508775
0.01612308 0.03970756 0.05683254 0.05011281]

In [11]: #Now we calculate the total and explained variance, print and plot them.

In [12]: tot = sum(eigen_vals)


var_exp = [(i / tot) for i in sorted(eigen_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

In [13]: print(var_exp)

[0.6726801998183738, 0.15210869080035438, 0.08526588324049861, 0.04186911924860997,
 0.02250284172984673, 0.009430190579739013, 0.005636285581098437, 0.004969865534965985,
 0.003937939965018102, 0.001598983501494715]

In [14]: print(cum_var_exp)

[0.6726802  0.82478889 0.91005477 0.95192389 0.97442673 0.98385693
 0.98949321 0.99446308 0.99840102 1.        ]

In [15]: import matplotlib.pyplot as plt

plt.bar(range(1, 11), var_exp, alpha=0.9, align='center', label='individual explained variance')
plt.step(range(1, 11), cum_var_exp, where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('images/05_02.png', dpi=300)
plt.show()

In [ ]: #From the figure above, we can see that most of the data variance is explained
#by the first 2 or 3 principal components.
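
In [ ]: #Optional sketch (assuming cum_var_exp from the previous cells is available):
#find the smallest number of principal components that explains at least 95% of the
#total variance. The 95% threshold is just an illustrative choice.
n_components_95 = int(np.argmax(cum_var_exp >= 0.95)) + 1
print('Components needed for 95% of the variance:', n_components_95)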

In [ ]: #Finally, we perform the feature transformation from the initial multidimensional space


#into a new principal-component space.
#We use the projection matrix to transform the data onto the lower-dimensional subspace.
#We start by sorting the eigenpairs in decreasing order of the eigenvalues.
#Then, we collect the two eigenvectors that correspond to the two largest eigenvalues to
#capture most of the variance in this rock-chemical dataset.

In [16]: #We create a list of (eigenvalue, eigenvector) tuples:


eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
for i in range(len(eigen_vals))]

#Then we sort the (eigenvalue, eigenvector) tuples:


eigen_pairs.sort(key=lambda k: k[0], reverse=True)

In [17]: w = np.hstack((eigen_pairs[0][1][:, np.newaxis],


eigen_pairs[1][1][:, np.newaxis]))
print('Matrix W:\n', w)

Matrix W:
[[-0.37584162 -0.12101843]
[ 0.36437749 -0.06753255]
[ 0.30961264 -0.25640948]
[ 0.36819809 0.08421512]
[ 0.16022243 -0.32713229]
[ 0.32082161 0.37864227]
[ 0.37085905 0.17790127]
[ 0.08650101 -0.71953609]
[-0.34431539 -0.14348562]
[ 0.31995722 -0.29597502]]

In [ ]: #We have created a 10×2-dimensional projection matrix W from the top two eigenvectors.
#Using the projection matrix, we can transform a sample x (represented as a 1×10-dimensional row vector)
#onto the PCA subspace, obtaining a two-dimensional sample vector consisting of two new features
#(PC1 and PC2):

In [18]: X_train_std[0].dot(w)

Out[18]: array([-3.65015577, -1.35169258])

In [19]: #By generalizing that procedure, we can transform the entire training dataset onto the
#two principal components.

In [20]: X_train_pca = X_train_std.dot(w)


colors = ['r', 'b', 'g', 'c', 'm', 'y']
markers = ['s', 'x', 'o', 'v', '^', '*']

for l, c, m in zip(np.unique(y_train), colors, markers):
    plt.scatter(X_train_pca[y_train == l, 0],
                X_train_pca[y_train == l, 1],
                c=c, label=l, marker=m)

plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='upper center')
plt.tight_layout()
# plt.savefig('images/05_03.png', dpi=300)
plt.show()

In [ ]: #We note that there are a few outliers in this new representation, in the sense that the clusters
#are not perfectly separated. However, the different rock classes are quite well
#distinguished in this new algebraic representation.
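
In [ ]: #Optional sanity check (not in the original workflow): a rough quantitative measure of
#how well the rock classes are separated in the PC1-PC2 space, using the silhouette
#coefficient from scikit-learn (values closer to 1 indicate better-separated clusters).
from sklearn.metrics import silhouette_score
print('Silhouette score in the PC1-PC2 space: %.3f' % silhouette_score(X_train_pca, y_train))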

In [ ]: #USING SCIKIT-LEARN
#
#We can perform the entire PCA analysis using a dedicated class of scikit-learn.
#Now, let's use the PCA class from scikit-learn on the rock training dataset, classify the transformed
#samples via logistic regression, and visualize the results.

In [21]: from sklearn.decomposition import PCA


pca = PCA()
X_train_pca = pca.fit_transform(X_train_std)
pca.explained_variance_ratio_

Out[21]: array([0.6726802 , 0.15210869, 0.08526588, 0.04186912, 0.02250284,
                0.00943019, 0.00563629, 0.00496987, 0.00393794, 0.00159898])

In [ ]: #Let us plot again the explained variance ratio.

In [22]: plt.bar(range(1, 11), pca.explained_variance_ratio_, alpha=0.9, align='center')


plt.step(range(1, 11), np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.show()

In [ ]: #Then we apply PCA (keeping the first two components) to our training and test data sets.

In [23]: pca = PCA(n_components=2)


X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

In [ ]: #We can plot our training data set in the PC1-PC2 space, without any color distinction, for now.

In [24]: plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1])


plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()

In [ ]: #Then, we'll distinguish the different classes, after training a classifier algorithm
#using the first 2 principal components.

In [25]: from matplotlib.colors import ListedColormap

def plot_decision_regions(X, y, classifier, resolution=0.02):

    # setup marker generator and color map
    colors = ['r', 'b', 'g', 'c', 'm', 'y']
    markers = ['s', 'x', 'o', 'v', '^', '*']
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface (for the moment we skip this step, just commenting it)
    # x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    # x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    # xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
    #                        np.arange(x2_min, x2_max, resolution))
    # Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    # Z = Z.reshape(xx1.shape)
    # plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    # plt.xlim(xx1.min(), xx1.max())
    # plt.ylim(xx2.min(), xx2.max())

    # plot class samples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.6,
                    c=cmap(idx),
                    edgecolor='black',
                    marker=markers[idx],
                    label=cl)

In [ ]: #We use a LOGISTIC REGRESSION CLASSIFIER.


#Now we train a logistic regression classifier using the first 2 principal components.

In [26]: from sklearn.linear_model import LogisticRegression

pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

lr = LogisticRegression()
lr = lr.fit(X_train_pca, y_train)

In [ ]: #Here we plot our results in the PCA space.

In [27]: plot_decision_regions(X_train_pca, y_train, classifier=lr)


plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='upper center')
plt.tight_layout()
# plt.savefig('images/05_04.png', dpi=300)
plt.show()
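
In [ ]: #Optional follow-up sketch (not in the original notebook): evaluate the classifier
#trained on the first two principal components on the held-out test data, and plot
#the test samples in the same PCA space.
print('Test accuracy: %.3f' % lr.score(X_test_pca, y_test))

plot_decision_regions(X_test_pca, y_test, classifier=lr)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='upper center')
plt.tight_layout()
plt.show()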

In [ ]: #Finally, if we set the n_components parameter of PCA to None, it returns all principal components
#in sorted order instead of performing a dimensionality reduction.

In [28]: pca = PCA(n_components=None)


X_train_pca = pca.fit_transform(X_train_std)
pca.explained_variance_ratio_

Out[28]: array([0.6726802 , 0.15210869, 0.08526588, 0.04186912, 0.02250284,
                0.00943019, 0.00563629, 0.00496987, 0.00393794, 0.00159898])

In [ ]: #This tutorial test ends here.


#We learned how to create an entire Python workflow for PCA.
#Furthermore, we learned how to apply PCA directly, in a fast way, using dedicated classes
#from Scikit-learn.
#Finally, we applied a Logistic Regression classifier using the first 2 principal components.

In [ ]: #References:
#
#1. Raschka, S. and Mirjalili, V., 2017. Python Machine Learning:
#Machine Learning and Deep Learning with Python, scikit-learn, and TensorFlow, 2nd Edition.
#Original code modified by Paolo Dell'Aversana and re-adapted for analysis of geo-chemical data.
#
#2. Jolliffe I.T., 2002. Principal Component Analysis,
#Series: Springer Series in Statistics, 2nd ed., Springer, NY, 2002, XXIX, 487 p. 28 illus.
#ISBN 978-0-387-95442-4.
