How to calculate correlation between all columns and remove highly correlated ones using pandas?
Solution 1:
The method here worked well for me, only a few lines of code: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/
import numpy as np

# NOTE: `df` is the caller's DataFrame; it is not defined in this snippet.

# Create correlation matrix (absolute values, so strong negative
# correlations are treated like strong positive ones)
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix. k=1 excludes the diagonal, so
# every pair is inspected once and no column is compared with itself.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Find features with correlation greater than 0.95 against any earlier column
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop features (mutates `df` in place)
df.drop(to_drop, axis=1, inplace=True)
Solution 2:
Here is the approach which I have used -
def correlation(dataset, threshold):
    """Delete, in place, columns that are highly correlated with an earlier column.

    The correlation matrix is computed once, up front, with ``dataset.corr()``.
    Each column is then compared with every column that precedes it; if any
    such correlation is ``>= threshold`` and the earlier column has not itself
    been deleted, the later column is removed from ``dataset``.

    The comparison uses the signed correlation (no ``abs()``), so strongly
    negatively correlated columns are never removed.

    Args:
        dataset: pandas DataFrame to prune. It is mutated.
        threshold: correlation value at or above which a column is deleted.

    Returns:
        None. The pruned ``dataset`` is printed before returning.
    """
    col_corr = set()  # Set of all the names of deleted columns
    corr_matrix = dataset.corr()
    columns = corr_matrix.columns
    for i, colname in enumerate(columns):
        for j in range(i):
            if corr_matrix.iloc[i, j] >= threshold and columns[j] not in col_corr:
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname]  # deleting the column from the dataset
                # Further matches for this column could only repeat the same
                # (idempotent) add/delete, so stop scanning its row.
                break
    print(dataset)
Hope this helps!
Solution 3:
Here is an Auto ML class I created to eliminate multicollinearity between features.
What makes my code unique is that, out of two features that have a high correlation, I eliminate the feature that is least correlated with the target! I got the idea from this seminar by Vishal Patel Sir - https://www.youtube.com/watch?v=ioXKxulmwVQ&feature=youtu.be
# Feature selection class to eliminate multicollinearity
class MultiCollinearityEliminator():
    """Iteratively drop features that are highly correlated with each other.

    While any pair of features has an absolute Pearson correlation above
    ``threshold``, the feature (among those involved in such a pair) that is
    least correlated with the target is dropped, and the check is repeated.

    Args:
        df: pandas DataFrame holding the features and the target column.
        target: label of the target column in ``df``.
        threshold: absolute correlation above which two features count as
            correlated.
        min_periods: minimum number of observations per column pair passed to
            ``DataFrame.corr``. Defaults to 30, the value the original code
            hard-coded (the author's rationale: a sample size large enough for
            the central limit theorem to apply).
    """

    # Class Constructor
    def __init__(self, df, target, threshold, min_periods=30):
        self.df = df
        self.target = target
        self.threshold = threshold
        self.min_periods = min_periods

    # Method to create and return the feature correlation matrix dataframe
    def createCorrMatrix(self, include_target=False):
        """Return the absolute Pearson correlation matrix of ``self.df``.

        Args:
            include_target: when falsy, the target column is dropped before
                the matrix is computed; when truthy, it is kept.
        """
        frame = self.df if include_target else self.df.drop([self.target], axis=1)
        # The method is set explicitly to Pearson to prevent issues in case
        # the default method of df.corr() gets changed.
        return frame.corr(method='pearson', min_periods=self.min_periods).abs()

    # Method to create and return the feature to target correlation matrix dataframe
    def createCorrMatrixWithTarget(self):
        """Return a one-column frame of each feature's correlation with the target.

        The target's own row is removed and the rows are sorted in ascending
        order, so the features least correlated with the target come first.
        Eliminating from the top of this frame helps to sustain the predictive
        power of the model while reducing its complexity.
        The frame is printed before it is returned.
        """
        # Obtaining the correlation matrix of the dataframe (along with the target)
        corrMatrix = self.createCorrMatrix(include_target=True)
        # Series.to_frame() keeps this class independent of a module-level
        # `pd` name.
        corrWithTarget = (
            corrMatrix.loc[:, self.target]
            .to_frame()
            .drop([self.target], axis=0)
            .sort_values(by=self.target)
        )
        print(corrWithTarget, '\n')
        return corrWithTarget

    # Method to create and return the list of correlated features
    def createCorrelatedFeaturesList(self):
        """Return the features that take part in at least one correlated pair.

        A pair counts as correlated when its absolute correlation is greater
        than ``self.threshold``. The list is printed before it is returned.
        """
        # Obtaining the correlation matrix of the dataframe (without the target)
        corrMatrix = self.createCorrMatrix(include_target=False)
        colCorr = []
        # Iterating through the columns of the correlation matrix dataframe
        for column in corrMatrix.columns:
            # Iterating through the values (row wise) of that column
            for idx, value in corrMatrix[column].items():
                # Skip the diagonal by label. Testing `value < 1` instead
                # would also skip perfectly correlated (e.g. duplicated)
                # features, which are exactly the ones that must be caught.
                if idx == column:
                    continue
                if value > self.threshold:
                    # Adding the features that are not already in the list
                    if idx not in colCorr:
                        colCorr.append(idx)
                    if column not in colCorr:
                        colCorr.append(column)
        print(colCorr, '\n')
        return colCorr

    # Method to eliminate the least important features from the list of correlated features
    def deleteFeatures(self, colCorr):
        """Drop the listed feature that is least correlated with the target.

        Args:
            colCorr: list of feature labels that are candidates for removal.

        Returns:
            ``self.df`` after (at most) one column has been dropped.
        """
        # Obtaining the feature to target correlation matrix dataframe
        corrWithTarget = self.createCorrMatrixWithTarget()
        # Rows are in ascending order of correlation with the target, so the
        # first label found in colCorr is the one to remove.
        for idx in corrWithTarget.index:
            print(idx, '\n')
            if idx in colCorr:
                self.df = self.df.drop(idx, axis=1)
                break
        return self.df

    # Method to automatically eliminate multicollinearity
    def autoEliminateMulticollinearity(self):
        """Drop features one at a time until no correlated pair remains.

        Returns:
            The reduced DataFrame (also stored in ``self.df``).
        """
        # Obtaining the list of correlated features
        colCorr = self.createCorrelatedFeaturesList()
        while colCorr:
            # Obtaining the dataframe after deleting the feature (from the list
            # of correlated features) that is least correlated with the target
            self.df = self.deleteFeatures(colCorr)
            # Obtaining the list of correlated features
            colCorr = self.createCorrelatedFeaturesList()
        return self.df
Solution 4:
You can test the code below:
# Load libraries
import pandas as pd
import numpy as np

# Create feature matrix with two highly correlated features
X = np.array([[1, 1, 1],
              [2, 2, 0],
              [3, 3, 1],
              [4, 4, 0],
              [5, 5, 1],
              [6, 6, 0],
              [7, 7, 1],
              [8, 7, 0],
              [9, 7, 1]])

# Convert feature matrix into DataFrame
df = pd.DataFrame(X)

# View the data frame
df

# Create correlation matrix
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix. k=1 excludes the diagonal.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop features. The labels are passed directly (not a DataFrame slice) and
# the result is bound to a name; `df` itself is left untouched.
df_reduced = df.drop(columns=to_drop)

# View the reduced data frame
df_reduced