In [62]:
# Import the required libraries
import pandas as pd
import numpy as np
In [63]:
# load dataset
dataset = pd.read_csv('MGH_PredictionDataSet.csv')
In [64]:
dataset.head()
Out[64]:
sex age education currentSmoker cigsPerDay BPMeds prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose TenYearCHD
0 1 39 4.0 0 0.0 0.0 0 0 0 195.0 106.0 70.0 26.97 80.0 77.0 0
1 0 46 2.0 0 0.0 0.0 0 0 0 250.0 121.0 81.0 28.73 95.0 76.0 0
2 1 48 1.0 1 20.0 0.0 0 0 0 245.0 127.5 80.0 25.34 75.0 70.0 0
3 0 61 3.0 1 30.0 0.0 0 1 0 225.0 150.0 95.0 28.58 65.0 103.0 1
4 0 46 3.0 1 23.0 0.0 0 0 0 285.0 130.0 84.0 23.10 85.0 85.0 0
In [65]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sex              4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB

DATA CLEANING¶

In [66]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sex              4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB
In [67]:
# View sum of nulls in each column
dataset.isnull().sum()
Out[67]:
sex                  0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64
In [68]:
# Replace the nulls

# Categorical data - replace with the mode
dataset['education'] = dataset['education'].fillna(dataset['education'].mode()[0])
dataset['BPMeds'] = dataset['BPMeds'].fillna(dataset['BPMeds'].mode()[0])


# Continuous data - replace with the mean
dataset['cigsPerDay'] = dataset['cigsPerDay'].fillna(dataset['cigsPerDay'].mean())
dataset['totChol'] = dataset['totChol'].fillna(dataset['totChol'].mean())
dataset['BMI'] = dataset['BMI'].fillna(dataset['BMI'].mean())
dataset['heartRate'] = dataset['heartRate'].fillna(dataset['heartRate'].mean())
dataset['glucose'] = dataset['glucose'].fillna(dataset['glucose'].mean())
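The seven fillna calls above can also be written as a short loop; a minimal equivalent sketch, assuming the same split between mode-imputed (categorical) and mean-imputed (continuous) columns:

# Same imputation expressed as loops over the two column groups
mode_cols = ['education', 'BPMeds']
mean_cols = ['cigsPerDay', 'totChol', 'BMI', 'heartRate', 'glucose']

for col in mode_cols:
    dataset[col] = dataset[col].fillna(dataset[col].mode()[0])
for col in mean_cols:
    dataset[col] = dataset[col].fillna(dataset[col].mean())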
In [69]:
# View sum of nulls in each column
dataset.isnull().sum()
Out[69]:
sex                0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

EXPLORATORY DATA ANALYSIS (EDA)¶

Exploring the dataset to understand its structure in order to develop our hypotheses¶

In [70]:
# Summarizing the data 
dataset.describe()
Out[70]:
sex age education currentSmoker cigsPerDay BPMeds prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose TenYearCHD
count 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000 4240.000000
mean 0.429245 49.580189 1.955189 0.494104 9.005937 0.029245 0.005896 0.310613 0.025708 236.699523 132.354599 82.897759 25.800801 75.878981 81.963655 0.151887
std 0.495027 8.572942 1.018522 0.500024 11.881610 0.168513 0.076569 0.462799 0.158280 44.327521 22.033300 11.910394 4.070687 12.023929 22.831748 0.358953
min 0.000000 32.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 107.000000 83.500000 48.000000 15.540000 44.000000 40.000000 0.000000
25% 0.000000 42.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 206.000000 117.000000 75.000000 23.077500 68.000000 72.000000 0.000000
50% 0.000000 49.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 234.000000 128.000000 82.000000 25.410000 75.000000 80.000000 0.000000
75% 1.000000 56.000000 3.000000 1.000000 20.000000 0.000000 0.000000 1.000000 0.000000 262.000000 144.000000 90.000000 28.032500 83.000000 85.000000 0.000000
max 1.000000 70.000000 4.000000 1.000000 70.000000 1.000000 1.000000 1.000000 1.000000 696.000000 295.000000 142.500000 56.800000 143.000000 394.000000 1.000000
In [71]:
# Check for data correlation between the variables
dataset.corr(numeric_only = True)
Out[71]:
sex age education currentSmoker cigsPerDay BPMeds prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose TenYearCHD
sex 1.000000 -0.029014 0.013361 0.197026 0.316023 -0.051544 -0.004550 0.005853 0.015693 -0.070064 -0.035879 0.058199 0.081705 -0.116913 0.005718 0.088374
age -0.029014 1.000000 -0.165283 -0.213662 -0.192534 0.121011 0.057679 0.306799 0.101314 0.260691 0.394053 0.205586 0.135578 -0.012839 0.116951 0.225408
education 0.013361 -0.165283 1.000000 0.019399 0.010217 -0.010231 -0.032910 -0.078565 -0.038215 -0.024025 -0.126062 -0.062334 -0.139731 -0.049580 -0.034416 -0.053002
currentSmoker 0.197026 -0.213662 0.019399 1.000000 0.767055 -0.048348 -0.032980 -0.103710 -0.044285 -0.046211 -0.130281 -0.107933 -0.167483 0.062678 -0.054062 0.019448
cigsPerDay 0.316023 -0.192534 0.010217 0.767055 1.000000 -0.045684 -0.032711 -0.066444 -0.037086 -0.026182 -0.088523 -0.056473 -0.092888 0.075257 -0.056020 0.057646
BPMeds -0.051544 0.121011 -0.010231 -0.048348 -0.045684 1.000000 0.114614 0.258580 0.051407 0.078775 0.251479 0.192254 0.099681 0.015136 0.048876 0.086448
prevalentStroke -0.004550 0.057679 -0.032910 -0.032980 -0.032711 0.114614 1.000000 0.074791 0.006955 0.000105 0.057000 0.045153 0.024856 -0.017674 0.018065 0.061823
prevalentHyp 0.005853 0.306799 -0.078565 -0.103710 -0.066444 0.258580 0.074791 1.000000 0.077752 0.162683 0.696656 0.615840 0.300599 0.146777 0.082757 0.177458
diabetes 0.015693 0.101314 -0.038215 -0.044285 -0.037086 0.051407 0.006955 0.077752 1.000000 0.040161 0.111265 0.050260 0.086282 0.048986 0.605709 0.097344
totChol -0.070064 0.260691 -0.024025 -0.046211 -0.026182 0.078775 0.000105 0.162683 0.040161 1.000000 0.207436 0.163423 0.115013 0.090678 0.044710 0.081807
sysBP -0.035879 0.394053 -0.126062 -0.130281 -0.088523 0.251479 0.057000 0.696656 0.111265 0.207436 1.000000 0.783952 0.325172 0.182084 0.134561 0.216374
diaBP 0.058199 0.205586 -0.062334 -0.107933 -0.056473 0.192254 0.045153 0.615840 0.050260 0.163423 0.783952 1.000000 0.376317 0.181012 0.058499 0.145112
BMI 0.081705 0.135578 -0.139731 -0.167483 -0.092888 0.099681 0.024856 0.300599 0.086282 0.115013 0.325172 0.376317 1.000000 0.067318 0.082123 0.074788
heartRate -0.116913 -0.012839 -0.049580 0.062678 0.075257 0.015136 -0.017674 0.146777 0.048986 0.090678 0.182084 0.181012 0.067318 1.000000 0.089386 0.022892
glucose 0.005718 0.116951 -0.034416 -0.054062 -0.056020 0.048876 0.018065 0.082757 0.605709 0.044710 0.134561 0.058499 0.082123 0.089386 1.000000 0.120451
TenYearCHD 0.088374 0.225408 -0.053002 0.019448 0.057646 0.086448 0.061823 0.177458 0.097344 0.081807 0.216374 0.145112 0.074788 0.022892 0.120451 1.000000
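Before selecting features it can help to rank them by how strongly they move with the target; a quick sketch that reuses the correlation matrix just computed (TenYearCHD is dropped from its own ranking):

# Features ranked by absolute correlation with TenYearCHD
corr_with_target = dataset.corr(numeric_only = True)['TenYearCHD'].drop('TenYearCHD')
corr_with_target.abs().sort_values(ascending = False)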
In [72]:
# Importing libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
In [73]:
heatmap_data = dataset.corr(numeric_only = True)
In [74]:
# Represent correlation matrix as a heatmap
fig, ax = plt.subplots(figsize =(15,15))
sns.heatmap(heatmap_data, annot = True,cmap = 'BuPu') 
Out[74]:
<Axes: >
In [75]:
# Histogram Using Seaborn
sns.histplot(dataset['age'], bins = 15)
plt.title('Age in Years')
plt.ylabel('Frequency')
Out[75]:
Text(0, 0.5, 'Frequency')
In [76]:
# Check distribution of prevalent hypertension
sns.countplot(x = 'prevalentHyp', data = dataset, palette = 'hls') 
plt.title('Distribution of Prevalent Hypertension')
plt.ylabel('Count')
plt.xlabel('prevalentHyp')
Out[76]:
Text(0.5, 0, 'prevalentHyp')
In [77]:
# Check distribution of education
sns.countplot(x = 'education', data = dataset, palette = 'hls') 
plt.title('Distribution of Education')
plt.ylabel('Count')
Out[77]:
Text(0, 0.5, 'Count')
In [78]:
from matplotlib.pyplot import style
style.use('classic')
# Grouping data by BPMeds and TenYearCHD
bpmeds_chd_counts = dataset.groupby(['BPMeds', 'TenYearCHD']).size().unstack()

# Calculating percentages
total_no_bpmeds = bpmeds_chd_counts.loc[0].sum()
total_bpmeds = bpmeds_chd_counts.loc[1].sum()

percentage_no_bpmeds_at_risk = (bpmeds_chd_counts.loc[0, 1] / total_no_bpmeds) * 100
percentage_bpmeds_at_risk = (bpmeds_chd_counts.loc[1, 1] / total_bpmeds) * 100

# Plotting pie chart for No BPMeds
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.pie(bpmeds_chd_counts.loc[0], labels=['Not at Risk', f'At Risk ({round(percentage_no_bpmeds_at_risk, 2)}%)'], autopct='%1.1f%%', colors=['blue', 'orange'])
plt.title('Distribution of TenYearCHD Versus No BPMeds')

# Plotting pie chart for BPMeds
plt.subplot(1, 2, 2)
plt.pie(bpmeds_chd_counts.loc[1], labels=['Not at Risk', f'At Risk ({round(percentage_bpmeds_at_risk, 2)}%)'], autopct='%1.1f%%', colors=['blue', 'orange'])
plt.title('Distribution of TenYearCHD Versus BPMeds')

plt.tight_layout()
plt.show()
In [79]:
# Grouping data by currentSmoker and TenYearCHD
smoker_chd_counts = dataset.groupby(['currentSmoker', 'TenYearCHD']).size().unstack()

# Plotting the grouped bar chart (figsize is passed to plot so no empty extra figure is created)
ax = smoker_chd_counts.plot(kind='bar', stacked=False, figsize=(8, 6), color=['purple', 'navy'])
plt.title('Distribution of Current Smokers by TenYearCHD')
plt.xlabel('Current Smoker')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['Non-Smoker', 'Smoker'], rotation=0)
plt.legend(['Not at Risk', 'At Risk'], loc='upper right')
Out[79]:
<matplotlib.legend.Legend at 0x1d8c0ff1950>
In [80]:
# Defining age brackets in 10-year intervals
age_bins = [i for i in range(20, 101, 10)]

# Creating age brackets and grouping by TenYearCHD
dataset['age_group'] = pd.cut(dataset['age'], bins=age_bins)
age_chd_counts = dataset.groupby(['age_group', 'TenYearCHD']).size().unstack()

# Plotting the grouped bar chart (figsize is passed to plot so no empty extra figure is created)
ax = age_chd_counts.plot(kind='bar', stacked=False, figsize=(12, 8), color=['navy', 'turquoise'])
plt.title('Distribution of Age by TenYearCHD')
plt.xlabel('Age Group (years)')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(['Not at Risk', 'At Risk'], loc='upper right')
Out[80]:
<matplotlib.legend.Legend at 0x1d8c0f25950>

FEATURE SELECTION¶

In [81]:
dataset.columns
Out[81]:
Index(['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD', 'age_group'],
      dtype='object')
In [82]:
features = ['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose']
In [83]:
features
Out[83]:
['sex',
 'age',
 'education',
 'currentSmoker',
 'cigsPerDay',
 'BPMeds',
 'prevalentStroke',
 'prevalentHyp',
 'diabetes',
 'totChol',
 'sysBP',
 'diaBP',
 'BMI',
 'heartRate',
 'glucose']
In [84]:
X = dataset[features]
In [85]:
X.head()
Out[85]:
sex age education currentSmoker cigsPerDay BPMeds prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose
0 1 39 4.0 0 0.0 0.0 0 0 0 195.0 106.0 70.0 26.97 80.0 77.0
1 0 46 2.0 0 0.0 0.0 0 0 0 250.0 121.0 81.0 28.73 95.0 76.0
2 1 48 1.0 1 20.0 0.0 0 0 0 245.0 127.5 80.0 25.34 75.0 70.0
3 0 61 3.0 1 30.0 0.0 0 1 0 225.0 150.0 95.0 28.58 65.0 103.0
4 0 46 3.0 1 23.0 0.0 0 0 0 285.0 130.0 84.0 23.10 85.0 85.0
In [86]:
# Identify the target variable, denoted by y
target = ['TenYearCHD']
In [87]:
target
Out[87]:
['TenYearCHD']
In [88]:
y = dataset[target]
In [89]:
y.head()
Out[89]:
TenYearCHD
0 0
1 0
2 0
3 1
4 0

SPLITTING THE DATASET¶

In [90]:
from sklearn.model_selection import train_test_split

# Splitting the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
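Only about 15% of records are positive (the TenYearCHD mean of 0.152 in the summary above), so a stratified split would keep that ratio in both subsets. A minimal alternative sketch, not the split used for the scores below:

# Alternative: stratified split preserving the ~15% positive rate in train and test
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
print(y_tr['TenYearCHD'].mean(), y_te['TenYearCHD'].mean())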
In [91]:
X_train.shape
Out[91]:
(3392, 15)
In [92]:
y_train.shape
Out[92]:
(3392, 1)
In [93]:
X_test.shape
Out[93]:
(848, 15)
In [94]:
y_train.shape
Out[94]:
(3392, 1)

MODEL SELECTION AND TRAINING¶

1. Decision Tree Classifier¶

In [95]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Model
tree_model = DecisionTreeClassifier()
In [96]:
# Fitting the model with the training data
tree_model.fit(X_train, y_train)
Out[96]:
DecisionTreeClassifier()
In [97]:
# Accuracy of the decision tree on the held-out test set
tree_accuracy = tree_model.score(X_test, y_test)
tree_accuracy
Out[97]:
0.7547169811320755
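An unconstrained decision tree can fit the training data almost perfectly and then generalize poorly, which is one likely reason it trails the other models here. A hedged sketch of the usual quick check, where max_depth=5 is an arbitrary illustrative value rather than a tuned setting:

# Shallower tree as a quick overfitting check (depth chosen arbitrarily)
pruned_tree = DecisionTreeClassifier(max_depth = 5, random_state = 42)
pruned_tree.fit(X_train, y_train)
pruned_tree.score(X_test, y_test)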

2. Logistic Regression¶

In [98]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
In [99]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression()
In [100]:
# Fitting the model with the training data
logistic_model.fit(X_train,y_train)
Out[100]:
LogisticRegression()
In [101]:
# Accuracy of logistic regression on the held-out test set
logistic_accuracy = logistic_model.score(X_test, y_test)
logistic_accuracy
Out[101]:
0.8573113207547169

3. KNN (K-Nearest Neighbors)¶

In [102]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors =5)
In [103]:
# Fitting the model with the training data
knn_model.fit(X_train,y_train)
Out[103]:
KNeighborsClassifier()
In [104]:
# Accuracy of KNN on the held-out test set
knn_accuracy = knn_model.score(X_test, y_test)
knn_accuracy
Out[104]:
0.8419811320754716

4. Support Vector Machine (SVM)¶

In [105]:
from sklearn.svm import SVC
svm_model = SVC(kernel = 'linear')
In [106]:
# Fitting the model with training data
svm_model.fit(X_train,y_train)
Out[106]:
SVC(kernel='linear')
In [107]:
# Accuracy of the linear SVM on the held-out test set
svm_accuracy = svm_model.score(X_test, y_test)
svm_accuracy
Out[107]:
0.8549528301886793
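KNN and SVM compare raw feature distances, so columns on large scales (totChol, sysBP, glucose) dominate the binary ones. Standardizing first is a common refinement; a minimal sketch using a pipeline, shown as an aside rather than what was run above:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale the features before the linear SVM; ravel() flattens y to a 1-D array
scaled_svm = make_pipeline(StandardScaler(), SVC(kernel = 'linear'))
scaled_svm.fit(X_train, y_train.values.ravel())
scaled_svm.score(X_test, y_test)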

5. Random Forest¶

In [108]:
from sklearn.ensemble import RandomForestClassifier
forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
In [109]:
# Fitting the model with the training data
forest_model.fit(X_train,y_train)
Out[109]:
RandomForestClassifier(random_state=42)
In [110]:
# Accuracy of the random forest on the held-out test set
forest_accuracy = forest_model.score(X_test, y_test)
forest_accuracy
Out[110]:
0.8549528301886793

Logistic regression is chosen as the best model for this task, as it gives the highest test accuracy of about 86%¶
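Because only about 15% of patients are positive, accuracy alone is a weak yardstick: predicting "not at risk" for everyone would already score roughly 85%. Precision and recall for the positive class give a fuller picture of the chosen model; a short sketch:

from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the logistic regression model beyond plain accuracy
y_pred = logistic_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))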

PREDICTION¶

  • Testing the model using a random sample of feature values
In [111]:
features
Out[111]:
['sex',
 'age',
 'education',
 'currentSmoker',
 'cigsPerDay',
 'BPMeds',
 'prevalentStroke',
 'prevalentHyp',
 'diabetes',
 'totChol',
 'sysBP',
 'diaBP',
 'BMI',
 'heartRate',
 'glucose']
In [112]:
# Define a sample list of feature values
random_features = [0,40,4,1,20,0,0,1,0,200,162,107,28,80,77]
In [113]:
# Convert the feature values into a one-row DataFrame with the original column names
feature_df = pd.DataFrame([random_features], columns = features)
In [114]:
# Prediction using the Logistic Regression Model
prediction = logistic_model.predict(feature_df)
In [115]:
prediction
Out[115]:
array([0], dtype=int64)
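predict() returns only the hard class label (0 = not at risk here). When a risk probability is more useful than a label, predict_proba on the same input provides it; a short sketch:

# Estimated probability of the positive class (TenYearCHD = 1) for the sample
logistic_model.predict_proba(feature_df)[0, 1]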

DEPLOYMENT¶
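A common next step, sketched here under the assumption that the model is shipped as a saved artifact (the filename is illustrative), is to persist the trained estimator with joblib so a script or web service can load it without retraining:

import joblib

# Save the trained model to disk
joblib.dump(logistic_model, 'chd_logistic_model.joblib')

# Later, e.g. inside an API endpoint, reload and predict
loaded_model = joblib.load('chd_logistic_model.joblib')
loaded_model.predict(feature_df)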