# Report the dimensions of the train, test and original datasets.
# `.shape` is read as (number of rows, number of columns).
train_shape = train_data.shape
test_shape = test_data.shape
orignal_shape = orignal_data.shape

num_train_rows, num_train_columns = train_shape[0], train_shape[1]
num_test_rows, num_test_columns = test_shape[0], test_shape[1]
num_orignal_rows, num_orignal_columns = orignal_shape[0], orignal_shape[1]

# One print call per dataset; sep="\n" puts every part on its own line,
# exactly as separate print calls would.
print(
    'Training Data: ',
    f"Number of Rows: {num_train_rows}",
    f"Number of Columns: {num_train_columns}\n",
    sep="\n",
)
print(
    'Test Data: ',
    f"Number of Rows: {num_test_rows}",
    f"Number of Columns :{num_test_columns}\n",
    sep="\n",
)
print(
    "Orignal Data: ",
    f"Number of Rows: {num_orignal_rows}",
    f"Number of Columns: {num_orignal_columns}",
    sep="\n",
)
Training Data:
Number of Rows: 165034
Number of Columns: 14
Test Data:
Number of Rows: 110023
Number of Columns :13
Orignal Data:
Number of Rows: 10002
Number of Columns: 14
# Build one missing-value summary table per dataset (train, test, original).
# Each table has one row per column of the dataset, holding the absolute
# number of nulls and the same figure as a percentage of the dataset's rows.
train_null_counts = train_data.isna().sum().values
test_null_counts = test_data.isna().sum().values
orignal_null_counts = orignal_data.isna().sum().values

missing_values_train = pd.DataFrame({
    'Feature': train_data.columns,
    '[TRAIN] No. of Missing Values': train_null_counts,
    '[TRAIN] % of Missing Values': train_null_counts / len(train_data) * 100,
})

missing_values_test = pd.DataFrame({
    'Feature': test_data.columns,
    '[TEST] No. of Missing Values': test_null_counts,
    '[TEST]% of Missing Values': test_null_counts / len(test_data) * 100,
})

missing_values_orignal = pd.DataFrame({
    'Feature': orignal_data.columns,
    '[ORIGNAL] No. of Missing Values': orignal_null_counts,
    '[ORIGNAL] % of Missing Values': orignal_null_counts / len(orignal_data) * 100,
})
# Combine the per-dataset summaries into a single overview table keyed on
# 'Feature'. Every join is a left join, so the set of rows is driven by the
# train summary; features absent from another table show up as NaN there.
# NOTE(review): unique_values and feature_types are not built in the visible
# cells - they are expected to exist already; confirm where they are defined.
merged_df = missing_values_train
for summary_table in (
    missing_values_test,
    missing_values_orignal,
    unique_values,
    feature_types,
):
    merged_df = pd.merge(merged_df, summary_table, on='Feature', how='left')

# Bare expression so the notebook renders the table.
merged_df
Feature
[TRAIN] No. of Missing Values
[TRAIN] % of Missing Values
[TEST] No. of Missing Values
[TEST]% of Missing Values
[ORIGNAL] No. of Missing Values
[ORIGNAL] % of Missing Values
No. of Unique Values [FROM TRAIN]
DataType
0
id
0
0.0
0.0
0.0
NaN
NaN
165034
int64
1
CustomerId
0
0.0
0.0
0.0
0.0
0.000000
23221
int64
2
Surname
0
0.0
0.0
0.0
0.0
0.000000
2797
object
3
CreditScore
0
0.0
0.0
0.0
0.0
0.000000
457
int64
4
Geography
0
0.0
0.0
0.0
1.0
0.009998
3
object
5
Gender
0
0.0
0.0
0.0
0.0
0.000000
2
object
6
Age
0
0.0
0.0
0.0
1.0
0.009998
71
float64
7
Tenure
0
0.0
0.0
0.0
0.0
0.000000
11
int64
8
Balance
0
0.0
0.0
0.0
0.0
0.000000
30075
float64
9
NumOfProducts
0
0.0
0.0
0.0
0.0
0.000000
4
int64
10
HasCrCard
0
0.0
0.0
0.0
1.0
0.009998
2
float64
11
IsActiveMember
0
0.0
0.0
0.0
1.0
0.009998
2
float64
12
EstimatedSalary
0
0.0
0.0
0.0
0.0
0.000000
55298
float64
13
Exited
0
0.0
NaN
NaN
0.0
0.000000
2
int64
# Count fully duplicated rows in each dataset: `duplicated()` flags every
# repeat of an earlier row, and summing the flags gives the count.
train_duplicates, test_duplicates, orignal_duplicates = (
    frame.duplicated().sum()
    for frame in (train_data, test_data, orignal_data)
)

# Print the results, one line per dataset.
print(
    f"Number of duplicate rows in train_data: {train_duplicates}",
    f"Number of duplicate rows in test_data: {test_duplicates}",
    f"Number of duplicate rows in orignal_data: {orignal_duplicates}",
    sep="\n",
)
Number of duplicate rows in train_data: 0
Number of duplicate rows in test_data: 0
Number of duplicate rows in orignal_data: 2
# Summary statistics of the numerical columns, transposed so that each
# feature is a row and each statistic is a column.
# Swap in one of the commented lines to inspect the other datasets.
train_data.describe().transpose()
# test_data.describe().transpose()
# orignal_data.describe().transpose()
# Univariate analysis of the numerical features.

# Custom colour palette: one colour each for train, test and original data.
custom_palette = ['#3498db', '#e74c3c', '#2ecc71']

# Numerical features, in the column order of train_data.
variables = [col for col in train_data.columns if col in numerical_variables]


def _labelled_datasets():
    """Stack train, test and NaN-free original rows with a 'Dataset' tag.

    The tag column is added with ``assign``, which works on copies, so
    train_data, test_data and orignal_data are never modified. That removes
    the need to add the column to the globals and drop it again afterwards
    (which left the column behind whenever plotting raised).
    """
    return pd.concat([
        train_data.assign(Dataset='Train'),
        test_data.assign(Dataset='Test'),
        orignal_data.dropna().assign(Dataset='Orignal'),
    ])


def create_variable_plots(variable, combined=None):
    """Show a box plot and overlaid histograms for one numerical variable.

    Args:
        variable: Name of the column to plot.
        combined: Optional frame as returned by ``_labelled_datasets()``.
            Pass it when plotting many variables so the datasets are
            stacked only once; it is built on demand when omitted.
    """
    if combined is None:
        combined = _labelled_datasets()

    sns.set_style('whitegrid')
    fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Box plot: one box per dataset.
    sns.boxplot(
        data=combined,
        x=variable,
        y="Dataset",
        palette=custom_palette,
        ax=box_ax,
    )
    box_ax.set_xlabel(variable)
    box_ax.set_title(f"Box Plot for {variable}")

    # Separate histograms drawn on the same axes, one per dataset.
    sns.histplot(data=train_data, x=variable, color=custom_palette[0],
                 kde=True, bins=30, label='Train', ax=hist_ax)
    sns.histplot(data=test_data, x=variable, color=custom_palette[1],
                 kde=True, bins=30, label='Test', ax=hist_ax)
    sns.histplot(data=orignal_data.dropna(), x=variable,
                 color=custom_palette[2], kde=True, bins=30,
                 label="Original", ax=hist_ax)
    hist_ax.set_xlabel(variable)
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(f'Histogram for {variable} [TRAIN, TEST and ORIGNAL]')
    hist_ax.legend()

    # Adjust spacing between the subplots, then render.
    fig.tight_layout()
    plt.show()


# Perform the univariate analysis for each variable. The stacked frame does
# not depend on the variable, so it is built once and shared by every call.
labelled_data = _labelled_datasets()
for variable in variables:
    create_variable_plots(variable, combined=labelled_data)
3.2 Categorical features
# Univariate analysis of the categorical features.

# Custom colour palette for the categorical features.
categorical_palette = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6',
                       '#bdc3c7', '#1abc9c', '#f1c40f', '#95a5a6', '#d35400']

# Work on a plain list copy of the categorical variable names.
categorical_variables = list(categorical_variables)


def _pooled_datasets():
    """Return train, test and NaN-free original rows stacked into one frame."""
    return pd.concat([train_data, test_data, orignal_data.dropna()])


def create_categorical_plots(variable, combined=None):
    """Show a donut chart and a count plot for one categorical variable.

    The donut chart uses train_data only; the count plot uses the pooled
    train, test and original rows.

    Args:
        variable: Name of the column to plot.
        combined: Optional frame as returned by ``_pooled_datasets()``.
            Pass it when plotting many variables so the datasets are
            stacked only once; it is built on demand when omitted.
    """
    if combined is None:
        combined = _pooled_datasets()

    sns.set_style('whitegrid')
    fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Pie chart (wedge width 0.3 turns it into a donut).
    train_data[variable].value_counts().plot.pie(
        ax=pie_ax,
        autopct='%1.1f%%',
        colors=categorical_palette,
        wedgeprops=dict(width=0.3),
        startangle=140,
    )
    pie_ax.set_title(f"Pie Chart for {variable}")

    # Bar graph of category counts over the pooled data.
    sns.countplot(data=combined, x=variable, palette=categorical_palette,
                  ax=bar_ax)
    bar_ax.set_xlabel(variable)
    bar_ax.set_ylabel("Count")
    bar_ax.set_title(f"Bar Graph for {variable} [TRAIN, TEST & ORIGINAL]")

    # Adjust spacing between the subplots, then render.
    fig.tight_layout()
    plt.show()


# Perform the univariate analysis for each categorical variable. The pooled
# frame does not depend on the variable, so it is built once and shared.
pooled_data = _pooled_datasets()
for variable in categorical_variables:
    create_categorical_plots(variable, combined=pooled_data)
3.3 Target features
# Analysis of the TARGET feature.

# Custom colour palette: one colour per target class.
target_palette = ['#3498db', '#e74c3c']

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Pie chart of the target distribution in train_data
# (wedge width 0.3 turns it into a donut).
plt.subplot(1, 2, 1)
train_data[target_variable].value_counts().plot.pie(
    autopct='%1.1f%%',
    colors=target_palette,
    wedgeprops=dict(width=0.3),
    startangle=140,
)
plt.title("Pie Chart for Target Feature 'Exited'")

# Bar graph of the target counts over train + NaN-free original rows
# (test_data is left out of this plot).
plt.subplot(1, 2, 2)
sns.countplot(
    data=pd.concat([train_data, orignal_data.dropna()]),
    x=target_variable,
    palette=target_palette,
)
# Label the axis with the target itself. The previous code used `variable`,
# a loop variable left over from an earlier cell, which put an unrelated
# feature name on this axis.
plt.xlabel(target_variable)
plt.ylabel('Count')
plt.title("Bar Graph for Target Feature 'Exited'")

# Adjust spacing between the subplots, then render.
plt.tight_layout()
plt.show()
3.4 Bivariate Analysis
# Bivariate analysis: pairwise correlation heatmaps for train and test data.

# Numerical features, in the column order of train_data.
variables = [name for name in train_data.columns if name in numerical_variables]

# Extra columns added to the correlation study on top of `variables`.
# The train list additionally carries 'Exited'; the test list does not.
cat_variables_train = ['NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Tenure', 'Exited']
cat_variables_test = ['NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Tenure']

# Full column lists fed to the correlation computation.
train_variables = [*variables, *cat_variables_train]
test_variables = [*variables, *cat_variables_test]

# Correlation matrices for train_data and test_data.
corr_train = train_data[train_variables].corr()
corr_test = test_data[test_variables].corr()

# Boolean masks that hide the upper triangle (diagonal included), so each
# pair of features is shown only once.
mask_train = np.triu(np.ones_like(corr_train, dtype=bool))
mask_test = np.triu(np.ones_like(corr_test, dtype=bool))

# Text size and rotation of the cell annotations.
annot_kws = {"size": 8, "rotation": 45}

# Drawing options shared by both heatmaps.
heatmap_options = dict(
    cmap='viridis',
    annot=True,
    square=True,
    linewidths=.5,
    xticklabels=1,
    yticklabels=1,
    annot_kws=annot_kws,
)

plt.figure(figsize=(15, 5))

# Left panel: train data.
ax_train = sns.heatmap(corr_train, mask=mask_train, ax=plt.subplot(1, 2, 1),
                       **heatmap_options)
ax_train.set_title('Correlation Heatmap - Train Data')

# Right panel: test data.
ax_test = sns.heatmap(corr_test, mask=mask_test, ax=plt.subplot(1, 2, 2),
                      **heatmap_options)
ax_test.set_title('Correlation Heatmap - Test Data')

# Adjust the layout, then render.
plt.tight_layout()
plt.show()