# Import the required libraries
import pandas as pd
import numpy as np
# Load the dataset from the CSV file into a DataFrame
dataset = pd.read_csv('MGH_PredictionDataSet.csv')
# Preview the first rows
dataset.head()
sex | age | education | currentSmoker | cigsPerDay | BPMeds | prevalentStroke | prevalentHyp | diabetes | totChol | sysBP | diaBP | BMI | heartRate | glucose | TenYearCHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 39 | 4.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 195.0 | 106.0 | 70.0 | 26.97 | 80.0 | 77.0 | 0 |
1 | 0 | 46 | 2.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 250.0 | 121.0 | 81.0 | 28.73 | 95.0 | 76.0 | 0 |
2 | 1 | 48 | 1.0 | 1 | 20.0 | 0.0 | 0 | 0 | 0 | 245.0 | 127.5 | 80.0 | 25.34 | 75.0 | 70.0 | 0 |
3 | 0 | 61 | 3.0 | 1 | 30.0 | 0.0 | 0 | 1 | 0 | 225.0 | 150.0 | 95.0 | 28.58 | 65.0 | 103.0 | 1 |
4 | 0 | 46 | 3.0 | 1 | 23.0 | 0.0 | 0 | 0 | 0 | 285.0 | 130.0 | 84.0 | 23.10 | 85.0 | 85.0 | 0 |
# Print column dtypes, non-null counts and memory usage
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4240 entries, 0 to 4239 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sex 4240 non-null int64 1 age 4240 non-null int64 2 education 4135 non-null float64 3 currentSmoker 4240 non-null int64 4 cigsPerDay 4211 non-null float64 5 BPMeds 4187 non-null float64 6 prevalentStroke 4240 non-null int64 7 prevalentHyp 4240 non-null int64 8 diabetes 4240 non-null int64 9 totChol 4190 non-null float64 10 sysBP 4240 non-null float64 11 diaBP 4240 non-null float64 12 BMI 4221 non-null float64 13 heartRate 4239 non-null float64 14 glucose 3852 non-null float64 15 TenYearCHD 4240 non-null int64 dtypes: float64(9), int64(7) memory usage: 530.1 KB
# NOTE(review): same call as the preceding dataset.info(); likely redundant
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4240 entries, 0 to 4239 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sex 4240 non-null int64 1 age 4240 non-null int64 2 education 4135 non-null float64 3 currentSmoker 4240 non-null int64 4 cigsPerDay 4211 non-null float64 5 BPMeds 4187 non-null float64 6 prevalentStroke 4240 non-null int64 7 prevalentHyp 4240 non-null int64 8 diabetes 4240 non-null int64 9 totChol 4190 non-null float64 10 sysBP 4240 non-null float64 11 diaBP 4240 non-null float64 12 BMI 4221 non-null float64 13 heartRate 4239 non-null float64 14 glucose 3852 non-null float64 15 TenYearCHD 4240 non-null int64 dtypes: float64(9), int64(7) memory usage: 530.1 KB
# Count the missing (null) values in each column
dataset.isnull().sum()
sex 0 age 0 education 105 currentSmoker 0 cigsPerDay 29 BPMeds 53 prevalentStroke 0 prevalentHyp 0 diabetes 0 totChol 50 sysBP 0 diaBP 0 BMI 19 heartRate 1 glucose 388 TenYearCHD 0 dtype: int64
# Replace the nulls.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that form is chained assignment,
# which pandas deprecates and which leaves `dataset` unchanged under
# Copy-on-Write.
# Categorical data - replace with the mode (most frequent value)
for column in ['education', 'BPMeds']:
    dataset[column] = dataset[column].fillna(dataset[column].mode()[0])
# Continuous data - replace with the column mean
for column in ['cigsPerDay', 'totChol', 'BMI', 'heartRate', 'glucose']:
    dataset[column] = dataset[column].fillna(dataset[column].mean())
# View sum of nulls in each column (all zeros once imputation succeeded)
dataset.isnull().sum()
sex 0 age 0 education 0 currentSmoker 0 cigsPerDay 0 BPMeds 0 prevalentStroke 0 prevalentHyp 0 diabetes 0 totChol 0 sysBP 0 diaBP 0 BMI 0 heartRate 0 glucose 0 TenYearCHD 0 dtype: int64
# Summarize the data: count, mean, std, min, quartiles and max per column
dataset.describe()
sex | age | education | currentSmoker | cigsPerDay | BPMeds | prevalentStroke | prevalentHyp | diabetes | totChol | sysBP | diaBP | BMI | heartRate | glucose | TenYearCHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 | 4240.000000 |
mean | 0.429245 | 49.580189 | 1.955189 | 0.494104 | 9.005937 | 0.029245 | 0.005896 | 0.310613 | 0.025708 | 236.699523 | 132.354599 | 82.897759 | 25.800801 | 75.878981 | 81.963655 | 0.151887 |
std | 0.495027 | 8.572942 | 1.018522 | 0.500024 | 11.881610 | 0.168513 | 0.076569 | 0.462799 | 0.158280 | 44.327521 | 22.033300 | 11.910394 | 4.070687 | 12.023929 | 22.831748 | 0.358953 |
min | 0.000000 | 32.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 107.000000 | 83.500000 | 48.000000 | 15.540000 | 44.000000 | 40.000000 | 0.000000 |
25% | 0.000000 | 42.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 206.000000 | 117.000000 | 75.000000 | 23.077500 | 68.000000 | 72.000000 | 0.000000 |
50% | 0.000000 | 49.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 234.000000 | 128.000000 | 82.000000 | 25.410000 | 75.000000 | 80.000000 | 0.000000 |
75% | 1.000000 | 56.000000 | 3.000000 | 1.000000 | 20.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 262.000000 | 144.000000 | 90.000000 | 28.032500 | 83.000000 | 85.000000 | 0.000000 |
max | 1.000000 | 70.000000 | 4.000000 | 1.000000 | 70.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 696.000000 | 295.000000 | 142.500000 | 56.800000 | 143.000000 | 394.000000 | 1.000000 |
# Check the pairwise correlation between the numeric variables
dataset.corr(numeric_only = True)
sex | age | education | currentSmoker | cigsPerDay | BPMeds | prevalentStroke | prevalentHyp | diabetes | totChol | sysBP | diaBP | BMI | heartRate | glucose | TenYearCHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
sex | 1.000000 | -0.029014 | 0.013361 | 0.197026 | 0.316023 | -0.051544 | -0.004550 | 0.005853 | 0.015693 | -0.070064 | -0.035879 | 0.058199 | 0.081705 | -0.116913 | 0.005718 | 0.088374 |
age | -0.029014 | 1.000000 | -0.165283 | -0.213662 | -0.192534 | 0.121011 | 0.057679 | 0.306799 | 0.101314 | 0.260691 | 0.394053 | 0.205586 | 0.135578 | -0.012839 | 0.116951 | 0.225408 |
education | 0.013361 | -0.165283 | 1.000000 | 0.019399 | 0.010217 | -0.010231 | -0.032910 | -0.078565 | -0.038215 | -0.024025 | -0.126062 | -0.062334 | -0.139731 | -0.049580 | -0.034416 | -0.053002 |
currentSmoker | 0.197026 | -0.213662 | 0.019399 | 1.000000 | 0.767055 | -0.048348 | -0.032980 | -0.103710 | -0.044285 | -0.046211 | -0.130281 | -0.107933 | -0.167483 | 0.062678 | -0.054062 | 0.019448 |
cigsPerDay | 0.316023 | -0.192534 | 0.010217 | 0.767055 | 1.000000 | -0.045684 | -0.032711 | -0.066444 | -0.037086 | -0.026182 | -0.088523 | -0.056473 | -0.092888 | 0.075257 | -0.056020 | 0.057646 |
BPMeds | -0.051544 | 0.121011 | -0.010231 | -0.048348 | -0.045684 | 1.000000 | 0.114614 | 0.258580 | 0.051407 | 0.078775 | 0.251479 | 0.192254 | 0.099681 | 0.015136 | 0.048876 | 0.086448 |
prevalentStroke | -0.004550 | 0.057679 | -0.032910 | -0.032980 | -0.032711 | 0.114614 | 1.000000 | 0.074791 | 0.006955 | 0.000105 | 0.057000 | 0.045153 | 0.024856 | -0.017674 | 0.018065 | 0.061823 |
prevalentHyp | 0.005853 | 0.306799 | -0.078565 | -0.103710 | -0.066444 | 0.258580 | 0.074791 | 1.000000 | 0.077752 | 0.162683 | 0.696656 | 0.615840 | 0.300599 | 0.146777 | 0.082757 | 0.177458 |
diabetes | 0.015693 | 0.101314 | -0.038215 | -0.044285 | -0.037086 | 0.051407 | 0.006955 | 0.077752 | 1.000000 | 0.040161 | 0.111265 | 0.050260 | 0.086282 | 0.048986 | 0.605709 | 0.097344 |
totChol | -0.070064 | 0.260691 | -0.024025 | -0.046211 | -0.026182 | 0.078775 | 0.000105 | 0.162683 | 0.040161 | 1.000000 | 0.207436 | 0.163423 | 0.115013 | 0.090678 | 0.044710 | 0.081807 |
sysBP | -0.035879 | 0.394053 | -0.126062 | -0.130281 | -0.088523 | 0.251479 | 0.057000 | 0.696656 | 0.111265 | 0.207436 | 1.000000 | 0.783952 | 0.325172 | 0.182084 | 0.134561 | 0.216374 |
diaBP | 0.058199 | 0.205586 | -0.062334 | -0.107933 | -0.056473 | 0.192254 | 0.045153 | 0.615840 | 0.050260 | 0.163423 | 0.783952 | 1.000000 | 0.376317 | 0.181012 | 0.058499 | 0.145112 |
BMI | 0.081705 | 0.135578 | -0.139731 | -0.167483 | -0.092888 | 0.099681 | 0.024856 | 0.300599 | 0.086282 | 0.115013 | 0.325172 | 0.376317 | 1.000000 | 0.067318 | 0.082123 | 0.074788 |
heartRate | -0.116913 | -0.012839 | -0.049580 | 0.062678 | 0.075257 | 0.015136 | -0.017674 | 0.146777 | 0.048986 | 0.090678 | 0.182084 | 0.181012 | 0.067318 | 1.000000 | 0.089386 | 0.022892 |
glucose | 0.005718 | 0.116951 | -0.034416 | -0.054062 | -0.056020 | 0.048876 | 0.018065 | 0.082757 | 0.605709 | 0.044710 | 0.134561 | 0.058499 | 0.082123 | 0.089386 | 1.000000 | 0.120451 |
TenYearCHD | 0.088374 | 0.225408 | -0.053002 | 0.019448 | 0.057646 | 0.086448 | 0.061823 | 0.177458 | 0.097344 | 0.081807 | 0.216374 | 0.145112 | 0.074788 | 0.022892 | 0.120451 | 1.000000 |
# Importing libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Correlation matrix of the numeric columns
heatmap_data = dataset.corr(numeric_only=True)
# Draw the matrix as an annotated heatmap on a large square canvas
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(heatmap_data, annot=True, cmap='BuPu', ax=ax)
<Axes: >
# Histogram of patient age (15 bins), drawn with Seaborn
age_axes = sns.histplot(dataset['age'], bins=15)
# Label the axes object returned by histplot directly
age_axes.set_title('Age in Year')
age_axes.set_ylabel('Frequency')
Text(0, 0.5, 'Frequency')
# Check distribution of Hypertension
sns.countplot(x = 'prevalentHyp', data = dataset, palette = 'hls')
# Title and x-label name the column that is actually plotted (prevalentHyp),
# not TenYearCHD
plt.title('Distribution of Prevalent Hypertension')
plt.ylabel('Count')
plt.xlabel('prevalentHyp')
Text(0.5, 0, 'TenYearCHD')
# Check distribution of Education
sns.countplot(x = 'education', data = dataset, palette = 'hls')
# The bars are plain counts per education level (no TenYearCHD split), so the
# title describes exactly that
plt.title('Distribution of Education')
plt.ylabel('Count')
Text(0, 0.5, 'Count')
from matplotlib.pyplot import style
style.use('classic')
# Count patients for every (BPMeds, TenYearCHD) combination;
# rows are BPMeds values, columns are TenYearCHD values
bpmeds_chd_counts = dataset.groupby(['BPMeds', 'TenYearCHD']).size().unstack()
# Per-group rows, totals and at-risk percentages
no_meds_counts = bpmeds_chd_counts.loc[0]
on_meds_counts = bpmeds_chd_counts.loc[1]
total_no_bpmeds = no_meds_counts.sum()
total_bpmeds = on_meds_counts.sum()
percentage_no_bpmeds_at_risk = (no_meds_counts.loc[1] / total_no_bpmeds) * 100
percentage_bpmeds_at_risk = (on_meds_counts.loc[1] / total_bpmeds) * 100
# One pie per group, side by side: (subplot position, counts, at-risk %, title)
pie_panels = [
    (1, no_meds_counts, percentage_no_bpmeds_at_risk, 'Distribution of TenYearCHD Versus No BPMeds'),
    (2, on_meds_counts, percentage_bpmeds_at_risk, 'Distribution of TenYearCHD Versus BPMeds'),
]
plt.figure(figsize=(12, 6))
for position, counts, at_risk_pct, panel_title in pie_panels:
    plt.subplot(1, 2, position)
    plt.pie(counts, labels=['Not at Risk', f'At Risk ({round(at_risk_pct, 2)}%)'], autopct='%1.1f%%', colors=['blue', 'orange'])
    plt.title(panel_title)
plt.tight_layout()
plt.show()
# Grouping data by currentSmoker and TenYearCHD
smoker_chd_counts = dataset.groupby(['currentSmoker', 'TenYearCHD']).size().unstack()
# Plotting the grouped bar chart.
# DataFrame.plot opens its own figure, so the size is passed to it directly;
# a separate plt.figure(figsize=...) call would only leave an empty figure
# behind and the chart would keep the default size.
ax = smoker_chd_counts.plot(kind='bar', stacked=False, color=['purple', 'navy'], figsize=(8, 6))
plt.title('Distribution of Current Smokers by TenYearCHD')
plt.xlabel('Current Smoker')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['Non-Smoker', 'Smoker'], rotation=0)
plt.legend(['Not at Risk', 'At Risk'], loc='upper right')
<matplotlib.legend.Legend at 0x1d8c0ff1950>
<Figure size 640x480 with 0 Axes>
# Defining age brackets in 10-year intervals: 20, 30, ..., 100
age_bins = list(range(20, 101, 10))
# Creating age brackets and group by TenYearCHD
dataset['age_group'] = pd.cut(dataset['age'], bins=age_bins)
# observed=False keeps every bracket (including empty ones) and spells out the
# behaviour that pandas otherwise warns about for categorical group keys
age_chd_counts = dataset.groupby(['age_group', 'TenYearCHD'], observed=False).size().unstack()
# Plotting the grouped bar chart.
# The size is given to DataFrame.plot itself because it creates its own figure;
# a preceding plt.figure(figsize=...) would be left empty.
ax = age_chd_counts.plot(kind='bar', stacked=False, color=['navy', 'turquoise'], figsize=(12, 8))
plt.title('Distribution of Age by TenYearCHD')
plt.xlabel('Age Group (years)')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(['Not at Risk', 'At Risk'], loc='upper right')
<matplotlib.legend.Legend at 0x1d8c0f25950>
<Figure size 960x640 with 0 Axes>
# List the column names (now including the derived 'age_group' column)
dataset.columns
Index(['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD', 'age_group'], dtype='object')
# Predictor columns used as model inputs; the target 'TenYearCHD' and the
# derived 'age_group' column are left out
features = ['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
'diaBP', 'BMI', 'heartRate', 'glucose']
features
['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
# Feature matrix: the selected predictor columns
X = dataset[features]
X.head()
sex | age | education | currentSmoker | cigsPerDay | BPMeds | prevalentStroke | prevalentHyp | diabetes | totChol | sysBP | diaBP | BMI | heartRate | glucose | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 39 | 4.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 195.0 | 106.0 | 70.0 | 26.97 | 80.0 | 77.0 |
1 | 0 | 46 | 2.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 250.0 | 121.0 | 81.0 | 28.73 | 95.0 | 76.0 |
2 | 1 | 48 | 1.0 | 1 | 20.0 | 0.0 | 0 | 0 | 0 | 245.0 | 127.5 | 80.0 | 25.34 | 75.0 | 70.0 |
3 | 0 | 61 | 3.0 | 1 | 30.0 | 0.0 | 0 | 1 | 0 | 225.0 | 150.0 | 95.0 | 28.58 | 65.0 | 103.0 |
4 | 0 | 46 | 3.0 | 1 | 23.0 | 0.0 | 0 | 0 | 0 | 285.0 | 130.0 | 84.0 | 23.10 | 85.0 | 85.0 |
# Identify the target column (the value the models predict), later stored in y
target = ['TenYearCHD']
target
['TenYearCHD']
# Selecting with a list keeps y as a one-column DataFrame (not a 1-D Series)
y = dataset[target]
y.head()
TenYearCHD | |
---|---|
0 | 0 |
1 | 0 |
2 | 0 |
3 | 1 |
4 | 0 |
from sklearn.model_selection import train_test_split
# Splitting the data into training and testing sets (80% train, 20% test);
# random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# Shape of the training features
X_train.shape
(3392, 15)
# Shape of the training target
y_train.shape
(3392, 1)
# Shape of the test features
X_test.shape
(848, 15)
# Shape of the held-out test target, to pair with X_test.shape
y_test.shape
(3392, 1)
from sklearn.tree import DecisionTreeClassifier
# Initialize the Decision Tree Model.
# random_state is fixed (same seed as the train/test split and the random
# forest) so the fitted tree and its score are reproducible between runs.
tree_model = DecisionTreeClassifier(random_state=42)
# Fitting the model with the training data
tree_model.fit(X_train, y_train)
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
# Mean accuracy of the decision tree on the test set
accuracy_score = tree_model.score(X_test, y_test)
accuracy_score
0.7547169811320755
# Suppress/ignore warnings
# NOTE(review): this filter hides every warning raised after this point,
# including any emitted by the model fits below
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
# The features are not scaled, so allow more solver iterations than the
# default of 100 to give the optimizer room to converge
logistic_model = LogisticRegression(max_iter=1000)
# Fitting the model with the training data.
# np.ravel flattens the one-column target to the 1-D array scikit-learn
# expects (it also accepts an already 1-D target unchanged).
logistic_model.fit(X_train, np.ravel(y_train))
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
# Mean accuracy of the logistic regression model on the test set
accuracy_score = logistic_model.score(X_test, y_test)
accuracy_score
0.8573113207547169
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier voting over the 5 closest training samples
knn_model = KNeighborsClassifier(n_neighbors=5)
# Fitting the model with the training data.
# np.ravel flattens the one-column target to the 1-D array scikit-learn
# expects (it also accepts an already 1-D target unchanged).
knn_model.fit(X_train, np.ravel(y_train))
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier()
# Mean accuracy of the k-nearest-neighbours model on the test set
accuracy_score = knn_model.score(X_test, y_test)
accuracy_score
0.8419811320754716
from sklearn.svm import SVC
# Support vector classifier with a linear kernel
svm_model = SVC(kernel='linear')
# Fitting the model with training data.
# np.ravel flattens the one-column target to the 1-D array scikit-learn
# expects (it also accepts an already 1-D target unchanged).
svm_model.fit(X_train, np.ravel(y_train))
SVC(kernel='linear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(kernel='linear')
# Mean accuracy of the linear SVM on the test set
accuracy_score = svm_model.score(X_test, y_test)
accuracy_score
0.8549528301886793
from sklearn.ensemble import RandomForestClassifier
# Forest of 100 trees; random_state fixes the result between runs
forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
# Fitting the model with the training data.
# np.ravel flattens the one-column target to the 1-D array scikit-learn
# expects (it also accepts an already 1-D target unchanged).
forest_model.fit(X_train, np.ravel(y_train))
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=42)
# Mean accuracy of the random forest on the test set
accuracy_score = forest_model.score(X_test, y_test)
accuracy_score
0.8549528301886793
# Show the feature order again; the sample values below follow this order
features
['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
# Define a sample list of feature values, in the same order as `features`
random_features = [0,40,4,1,20,0,0,1,0,200,162,107,28,80,77]
# Convert the sample into a one-row dataframe labelled with the training
# feature names, so the model receives the same named columns it was fitted on
# instead of anonymous integer-labelled ones
feature_df = pd.DataFrame([random_features], columns=features)
# Prediction using the Logistic Regression Model
prediction = logistic_model.predict(feature_df)
prediction
array([0], dtype=int64)