In [62]:
# Import the required libraries
import pandas as pd
import numpy as np
In [63]:
# load dataset
dataset = pd.read_csv('MGH_PredictionDataSet.csv')
In [64]:
dataset.head()
Out[64]:
sex age education currentSmoker cigsPerDay BPMeds prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose TenYearCHD
0 1 39 4.0 0 0.0 0.0 0 0 0 195.0 106.0 70.0 26.97 80.0 77.0 0
1 0 46 2.0 0 0.0 0.0 0 0 0 250.0 121.0 81.0 28.73 95.0 76.0 0
2 1 48 1.0 1 20.0 0.0 0 0 0 245.0 127.5 80.0 25.34 75.0 70.0 0
3 0 61 3.0 1 30.0 0.0 0 1 0 225.0 150.0 95.0 28.58 65.0 103.0 1
4 0 46 3.0 1 23.0 0.0 0 0 0 285.0 130.0 84.0 23.10 85.0 85.0 0
In [65]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sex              4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB

DATA CLEANING¶

In [66]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sex              4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB
In [67]:
# View sum of nulls in each column
dataset.isnull().sum()
Out[67]:
sex                  0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64
In [68]:
# Replace the nulls

# Categorical data - replace with the mode
dataset['education'] = dataset['education'].fillna(dataset['education'].mode()[0])
dataset['BPMeds'] = dataset['BPMeds'].fillna(dataset['BPMeds'].mode()[0])


# Continuous data - replace with the mean
dataset['cigsPerDay'] = dataset['cigsPerDay'].fillna(dataset['cigsPerDay'].mean())
dataset['totChol'] = dataset['totChol'].fillna(dataset['totChol'].mean())
dataset['BMI'] = dataset['BMI'].fillna(dataset['BMI'].mean())
dataset['heartRate'] = dataset['heartRate'].fillna(dataset['heartRate'].mean())
dataset['glucose'] = dataset['glucose'].fillna(dataset['glucose'].mean())
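The seven fillna calls above can also be written as a short loop; a minimal equivalent sketch, assuming the same split between mode-imputed (categorical) and mean-imputed (continuous) columns:

# Same imputation expressed as loops over the two column groups
mode_cols = ['education', 'BPMeds']
mean_cols = ['cigsPerDay', 'totChol', 'BMI', 'heartRate', 'glucose']

for col in mode_cols:
    dataset[col] = dataset[col].fillna(dataset[col].mode()[0])
for col in mean_cols:
    dataset[col] = dataset[col].fillna(dataset[col].mean())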
In [69]:
# View sum of nulls in each column
dataset.isnull().sum()
Out[69]:
sex                0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

EXPLORATORY DATA ANALYSIS (EDA)¶

Exploring the dataset to understand its structure in order to develop our hypotheses¶

In [70]:
# Summarizing the data 
dataset.describe()
Out[70]:
sex age education currentSmoker cigsPerDay BPMeds prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose TenYearCHD
count 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000
mean 0.429245 49.580189 1.955189 0.494104 9.005937 0.029245 0.005896 0.310613 0.025708 236.699523 132.354599 82.897759 25.800801 75.878981 81.963655 0.151887
std 0.495027 8.572942 1.018522 0.500024 11.881610 0.168513 0.076569 0.462799 0.158280 44.327521 22.033300 11.910394 4.070687 12.023929 22.831748 0.358953
min 0.000000 32.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 107.000000 83.500000 48.000000 15.540000 44.000000 40.000000 0.000000
25% 0.000000 42.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 206.000000 117.000000 75.000000 23.077500 68.000000 72.000000 0.000000
50% 0.000000 49.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 234.000000 128.000000 82.000000 25.410000 75.000000 80.000000 0.000000
75% 1.000000 56.000000 3.000000 1.000000 20.000000 0.000000 0.000000 1.000000 0.000000 262.000000 144.000000 90.000000 28.032500 83.000000 85.000000 0.000000
max 1.000000 70.000000 4.000000 1.000000 70.000000 1.000000 1.000000 1.000000 1.000000 696.000000 295.000000 142.500000 56.800000 143.000000 394.000000 1.000000
In [71]:
# Check for data correlation between the variables
dataset.corr(numeric_only = True)
Out[71]:
sex age education currentSmoker cigsPerDay BPMeds prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose TenYearCHD
sex 1.000000 -0.029014 0.013361 0.197026 0.316023 -0.051544 -0.004550 0.005853 0.015693 -0.070064 -0.035879 0.058199 0.081705 -0.116913 0.005718 0.088374
age -0.029014 1.000000 -0.165283 -0.213662 -0.192534 0.121011 0.057679 0.306799 0.101314 0.260691 0.394053 0.205586 0.135578 -0.012839 0.116951 0.225408
education 0.013361 -0.165283 1.000000 0.019399 0.010217 -0.010231 -0.032910 -0.078565 -0.038215 -0.024025 -0.126062 -0.062334 -0.139731 -0.049580 -0.034416 -0.053002
currentSmoker 0.197026 -0.213662 0.019399 1.000000 0.767055 -0.048348 -0.032980 -0.103710 -0.044285 -0.046211 -0.130281 -0.107933 -0.167483 0.062678 -0.054062 0.019448
cigsPerDay 0.316023 -0.192534 0.010217 0.767055 1.000000 -0.045684 -0.032711 -0.066444 -0.037086 -0.026182 -0.088523 -0.056473 -0.092888 0.075257 -0.056020 0.057646
BPMeds -0.051544 0.121011 -0.010231 -0.048348 -0.045684 1.000000 0.114614 0.258580 0.051407 0.078775 0.251479 0.192254 0.099681 0.015136 0.048876 0.086448
prevalentStroke -0.004550 0.057679 -0.032910 -0.032980 -0.032711 0.114614 1.000000 0.074791 0.006955 0.000105 0.057000 0.045153 0.024856 -0.017674 0.018065 0.061823
prevalentHyp 0.005853 0.306799 -0.078565 -0.103710 -0.066444 0.258580 0.074791 1.000000 0.077752 0.162683 0.696656 0.615840 0.300599 0.146777 0.082757 0.177458
diabetes 0.015693 0.101314 -0.038215 -0.044285 -0.037086 0.051407 0.006955 0.077752 1.000000 0.040161 0.111265 0.050260 0.086282 0.048986 0.605709 0.097344
totChol -0.070064 0.260691 -0.024025 -0.046211 -0.026182 0.078775 0.000105 0.162683 0.040161 1.000000 0.207436 0.163423 0.115013 0.090678 0.044710 0.081807
sysBP -0.035879 0.394053 -0.126062 -0.130281 -0.088523 0.251479 0.057000 0.696656 0.111265 0.207436 1.000000 0.783952 0.325172 0.182084 0.134561 0.216374
diaBP 0.058199 0.205586 -0.062334 -0.107933 -0.056473 0.192254 0.045153 0.615840 0.050260 0.163423 0.783952 1.000000 0.376317 0.181012 0.058499 0.145112
BMI 0.081705 0.135578 -0.139731 -0.167483 -0.092888 0.099681 0.024856 0.300599 0.086282 0.115013 0.325172 0.376317 1.000000 0.067318 0.082123 0.074788
heartRate -0.116913 -0.012839 -0.049580 0.062678 0.075257 0.015136 -0.017674 0.146777 0.048986 0.090678 0.182084 0.181012 0.067318 1.000000 0.089386 0.022892
glucose 0.005718 0.116951 -0.034416 -0.054062 -0.056020 0.048876 0.018065 0.082757 0.605709 0.044710 0.134561 0.058499 0.082123 0.089386 1.000000 0.120451
TenYearCHD 0.088374 0.225408 -0.053002 0.019448 0.057646 0.086448 0.061823 0.177458 0.097344 0.081807 0.216374 0.145112 0.074788 0.022892 0.120451 1.000000
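Before selecting features it can help to rank them by how strongly they move with the target; a quick sketch that reuses the correlation matrix just computed (TenYearCHD is dropped from its own ranking):

# Features ranked by absolute correlation with TenYearCHD
corr_with_target = dataset.corr(numeric_only = True)['TenYearCHD'].drop('TenYearCHD')
corr_with_target.abs().sort_values(ascending = False)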
In [72]:
# Importing libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
In [73]:
heatmap_data = dataset.corr(numeric_only = True)
In [74]:
# Represent correlation matrix as a heatmap
fig, ax = plt.subplots(figsize =(15,15))
sns.heatmap(heatmap_data, annot = True,cmap = 'BuPu') 
Out[74]:
<Axes: >
In [75]:
# Histogram Using Seaborn
sns.histplot(dataset['age'], bins = 15)
plt.title('Age in Years')
plt.ylabel('Frequency')
Out[75]:
Text(0, 0.5, 'Frequency')
In [76]:
# Check distribution of prevalent hypertension
sns.countplot(x = 'prevalentHyp', data = dataset, palette = 'hls') 
plt.title('Distribution of Prevalent Hypertension')
plt.ylabel('Count')
plt.xlabel('prevalentHyp')
Out[76]:
Text(0.5, 0, 'prevalentHyp')
In [77]:
# Check distribution of education
sns.countplot(x = 'education', data = dataset, palette = 'hls') 
plt.title('Distribution of Education')
plt.ylabel('Count')
Out[77]:
Text(0, 0.5, 'Count')
In [78]:
from matplotlib.pyplot import style
style.use('classic')
# Grouping data by BPMeds and TenYearCHD
bpmeds_chd_counts = dataset.groupby(['BPMeds', 'TenYearCHD']).size().unstack()

# Calculating percentages
total_no_bpmeds = bpmeds_chd_counts.loc[0].sum()
total_bpmeds = bpmeds_chd_counts.loc[1].sum()

percentage_no_bpmeds_at_risk = (bpmeds_chd_counts.loc[0, 1] / total_no_bpmeds) * 100
percentage_bpmeds_at_risk = (bpmeds_chd_counts.loc[1, 1] / total_bpmeds) * 100

# Plotting pie chart for No BPMeds
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.pie(bpmeds_chd_counts.loc[0], labels=['Not at Risk', f'At Risk ({round(percentage_no_bpmeds_at_risk, 2)}%)'], autopct='%1.1f%%', colors=['blue', 'orange'])
plt.title('Distribution of TenYearCHD Versus No BPMeds')

# Plotting pie chart for BPMeds
plt.subplot(1, 2, 2)
plt.pie(bpmeds_chd_counts.loc[1], labels=['Not at Risk', f'At Risk ({round(percentage_bpmeds_at_risk, 2)}%)'], autopct='%1.1f%%', colors=['blue', 'orange'])
plt.title('Distribution of TenYearCHD Versus BPMeds')

plt.tight_layout()
plt.show()
In [79]:
# Grouping data by currentSmoker and TenYearCHD
smoker_chd_counts = dataset.groupby(['currentSmoker', 'TenYearCHD']).size().unstack()

# Plotting the grouped bar chart (figsize is passed to plot so no empty extra figure is created)
ax = smoker_chd_counts.plot(kind='bar', stacked=False, figsize=(8, 6), color=['purple', 'navy'])
plt.title('Distribution of Current Smokers by TenYearCHD')
plt.xlabel('Current Smoker')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['Non-Smoker', 'Smoker'], rotation=0)
plt.legend(['Not at Risk', 'At Risk'], loc='upper right')
Out[79]:
<matplotlib.legend.Legend at 0x1d8c0ff1950>
In [80]:
# Defining age brackets in 10-year intervals
age_bins = [i for i in range(20, 101, 10)]

# Creating age brackets and grouping by TenYearCHD
dataset['age_group'] = pd.cut(dataset['age'], bins=age_bins)
age_chd_counts = dataset.groupby(['age_group', 'TenYearCHD']).size().unstack()

# Plotting the grouped bar chart (figsize is passed to plot so no empty extra figure is created)
ax = age_chd_counts.plot(kind='bar', stacked=False, figsize=(12, 8), color=['navy', 'turquoise'])
plt.title('Distribution of Age by TenYearCHD')
plt.xlabel('Age Group (years)')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(['Not at Risk', 'At Risk'], loc='upper right')
Out[80]:
<matplotlib.legend.Legend at 0x1d8c0f25950>

FEATURE SELECTION¶

In [81]:
dataset.columns
Out[81]:
Index(['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD', 'age_group'],
      dtype='object')
In [82]:
features = ['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose']
In [83]:
features
Out[83]:
['sex',
 'age',
 'education',
 'currentSmoker',
 'cigsPerDay',
 'BPMeds',
 'prevalentStroke',
 'prevalentHyp',
 'diabetes',
 'totChol',
 'sysBP',
 'diaBP',
 'BMI',
 'heartRate',
 'glucose']
In [84]:
X = dataset[features]
In [85]:
X.head()
Out[85]:
sex age education currentSmoker cigsPerDay BPMeds prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose
0 1 39 4.0 0 0.0 0.0 0 0 0 195.0 106.0 70.0 26.97 80.0 77.0
1 0 46 2.0 0 0.0 0.0 0 0 0 250.0 121.0 81.0 28.73 95.0 76.0
2 1 48 1.0 1 20.0 0.0 0 0 0 245.0 127.5 80.0 25.34 75.0 70.0
3 0 61 3.0 1 30.0 0.0 0 1 0 225.0 150.0 95.0 28.58 65.0 103.0
4 0 46 3.0 1 23.0 0.0 0 0 0 285.0 130.0 84.0 23.10 85.0 85.0
In [86]:
# Identify the target variable, denoted by y
target = ['TenYearCHD']
In [87]:
target
Out[87]:
['TenYearCHD']
In [88]:
y = dataset[target]
In [89]:
y.head()
Out[89]:
TenYearCHD
0 0
1 0
2 0
3 1
4 0

SPLITTING THE DATASET¶

In [90]:
from sklearn.model_selection import train_test_split

# Splitting the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
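Only about 15% of records are positive (the TenYearCHD mean of 0.152 in the summary above), so a stratified split would keep that ratio in both subsets. A minimal alternative sketch, not the split used for the scores below:

# Alternative: stratified split preserving the ~15% positive rate in train and test
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
print(y_tr['TenYearCHD'].mean(), y_te['TenYearCHD'].mean())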
In [91]:
X_train.shape
Out[91]:
(3392, 15)
In [92]:
y_train.shape
Out[92]:
(3392, 1)
In [93]:
X_test.shape
Out[93]:
(848, 15)
In [94]:
y_train.shape
Out[94]:
(3392, 1)

MODEL SELECTION AND TRAINING¶

1. Decision Tree Classifier¶

In [95]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Model
tree_model = DecisionTreeClassifier()
In [96]:
# Fitting the model with the training data
tree_model.fit(X_train, y_train)
Out[96]:
DecisionTreeClassifier()
In [97]:
# Accuracy of the decision tree on the held-out test set
tree_accuracy = tree_model.score(X_test, y_test)
tree_accuracy
Out[97]:
0.7547169811320755
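An unconstrained decision tree can fit the training data almost perfectly and then generalize poorly, which is one likely reason it trails the other models here. A hedged sketch of the usual quick check, where max_depth=5 is an arbitrary illustrative value rather than a tuned setting:

# Shallower tree as a quick overfitting check (depth chosen arbitrarily)
pruned_tree = DecisionTreeClassifier(max_depth = 5, random_state = 42)
pruned_tree.fit(X_train, y_train)
pruned_tree.score(X_test, y_test)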

2. Logistic Regression¶

In [98]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
In [99]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression()
In [100]:
# Fitting the model with the training data
logistic_model.fit(X_train,y_train)
Out[100]:
LogisticRegression()
In [101]:
# Accuracy of logistic regression on the held-out test set
logistic_accuracy = logistic_model.score(X_test, y_test)
logistic_accuracy
Out[101]:
0.8573113207547169

3. KNN (K-Nearest Neighbors)¶

In [102]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors =5)
In [103]:
# Fitting the model with the training data
knn_model.fit(X_train,y_train)
Out[103]:
KNeighborsClassifier()
In [104]:
# Accuracy of KNN on the held-out test set
knn_accuracy = knn_model.score(X_test, y_test)
knn_accuracy
Out[104]:
0.8419811320754716

4. Support Vector Machine (SVM)¶

In [105]:
from sklearn.svm import SVC
svm_model = SVC(kernel = 'linear')
In [106]:
# Fitting the model with training data
svm_model.fit(X_train,y_train)
Out[106]:
SVC(kernel='linear')
In [107]:
# Accuracy of the linear SVM on the held-out test set
svm_accuracy = svm_model.score(X_test, y_test)
svm_accuracy
Out[107]:
0.8549528301886793
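KNN and SVM compare raw feature distances, so columns on large scales (totChol, sysBP, glucose) dominate the binary ones. Standardizing first is a common refinement; a minimal sketch using a pipeline, shown as an aside rather than what was run above:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale the features before the linear SVM; ravel() flattens y to a 1-D array
scaled_svm = make_pipeline(StandardScaler(), SVC(kernel = 'linear'))
scaled_svm.fit(X_train, y_train.values.ravel())
scaled_svm.score(X_test, y_test)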

5. Random Forest¶

In [108]:
from sklearn.ensemble import RandomForestClassifier
forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
In [109]:
# Fitting the model with the training data
forest_model.fit(X_train,y_train)
Out[109]:
RandomForestClassifier(random_state=42)
In [110]:
# Accuracy of the random forest on the held-out test set
forest_accuracy = forest_model.score(X_test, y_test)
forest_accuracy
Out[110]:
0.8549528301886793

Logistic regression is chosen as the best model for this task, as it gives the highest test accuracy of about 86%¶
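Because only about 15% of patients are positive, accuracy alone is a weak yardstick: predicting "not at risk" for everyone would already score roughly 85%. Precision and recall for the positive class give a fuller picture of the chosen model; a short sketch:

from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the logistic regression model beyond plain accuracy
y_pred = logistic_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))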

PREDICTION¶

  • Testing the model using a random sample of feature values
In [111]:
features
Out[111]:
['sex',
 'age',
 'education',
 'currentSmoker',
 'cigsPerDay',
 'BPMeds',
 'prevalentStroke',
 'prevalentHyp',
 'diabetes',
 'totChol',
 'sysBP',
 'diaBP',
 'BMI',
 'heartRate',
 'glucose']
In [112]:
# Define a sample list of feature values
random_features = [0,40,4,1,20,0,0,1,0,200,162,107,28,80,77]
In [113]:
# Convert the feature values into a one-row DataFrame with the original column names
feature_df = pd.DataFrame([random_features], columns = features)
In [114]:
# Prediction using the Logistic Regression Model
prediction = logistic_model.predict(feature_df)
In [115]:
prediction
Out[115]:
array([0], dtype=int64)
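predict() returns only the hard class label (0 = not at risk here). When a risk probability is more useful than a label, predict_proba on the same input provides it; a short sketch:

# Estimated probability of the positive class (TenYearCHD = 1) for the sample
logistic_model.predict_proba(feature_df)[0, 1]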

DEPLOYMENT¶
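A common next step, sketched here under the assumption that the model is shipped as a saved artifact (the filename is illustrative), is to persist the trained estimator with joblib so a script or web service can load it without retraining:

import joblib

# Save the trained model to disk
joblib.dump(logistic_model, 'chd_logistic_model.joblib')

# Later, e.g. inside an API endpoint, reload and predict
loaded_model = joblib.load('chd_logistic_model.joblib')
loaded_model.predict(feature_df)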