Skip to content
Snippets Groups Projects
Commit d9dbc0b4 authored by Taha Silat (tas574)'s avatar Taha Silat (tas574)
Browse files

Code for generating figures for final results report

parent d5e564b5
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, brier_score_loss
```
%% Cell type:code id: tags:
``` python
df = pd.read_csv("NGS_data.csv")
```
%% Cell type:code id: tags:
``` python
# Filter out the colums we are interested in studying
columns_of_interest = ['STL_160E', 'FATHEDP', 'MOTHEDP']
df = df[columns_of_interest]
```
%% Cell type:code id: tags:
``` python
# Filter out all the entries where the respondents did not answer any of the questions
df = df[(df['FATHEDP'] != 99) & (df['MOTHEDP'] != 99) & (df['STL_160E'] != 9)]
df = df.reset_index(drop=True)
```
%% Cell type:code id: tags:
``` python
# Data preprocessing to make it easier for logistic regression
df['STL_160E'] = df['STL_160E'].replace(2, 0)
```
%% Cell type:code id: tags:
``` python
x = df[['MOTHEDP', 'FATHEDP']]
y = df['STL_160E']
```
%% Cell type:code id: tags:
``` python
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)
```
%% Cell type:code id: tags:
``` python
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
```
%% Cell type:code id: tags:
``` python
model = LogisticRegression(random_state=42, solver='lbfgs')
model.fit(x_train, y_train)
```
%% Output
LogisticRegression(random_state=42)
%% Cell type:code id: tags:
``` python
num_features = x_train.shape[1]
degFreedom = num_features - 1
chi2_stats = model.coef_[0] ** 2
p_values = 1 - chi2.cdf(chi2_stats, degFreedom)
```
%% Cell type:code id: tags:
``` python
coefficients = model.coef_[0]
# Create a DataFrame to display coefficients and p-values
coefficients_df = pd.DataFrame({'Feature': x.columns, 'Coefficient': coefficients, 'P-Value': p_values})
# Sort the coefficients
coefficients_df = coefficients_df.sort_values(by='P-Value')
coefficients_df
```
%% Output
Feature Coefficient P-Value
0 MOTHEDP 0.236758 0.812844
1 FATHEDP 0.082363 0.934358
%% Cell type:code id: tags:
``` python
# TODO STEP 6: make your prediction on the test dataset
y_pred = model.predict(x_test)
y_pred
```
%% Output
array([False, False, False, ..., False, False, False])
%% Cell type:code id: tags:
``` python
classification_rep = classification_report(y_test, y_pred, zero_division=1)
print(classification_rep)
```
%% Output
precision recall f1-score support
False 0.63 1.00 0.77 2208
True 1.00 0.00 0.00 1302
accuracy 0.63 3510
macro avg 0.81 0.50 0.39 3510
weighted avg 0.77 0.63 0.49 3510
%% Cell type:code id: tags:
``` python
brier_score = brier_score_loss(y_test, y_pred)
brier_score
```
%% Output
0.37094017094017095
%% Cell type:code id: tags:
``` python
x_min, x_max = 0, 7
y_min, y_max = 0, 1
```
%% Cell type:code id: tags:
``` python
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
```
%% Cell type:code id: tags:
``` python
father = x['FATHEDP']
father_array = father.to_numpy()
father_array
```
%% Output
array([6, 2, 6, ..., 3, 1, 2], dtype=int64)
%% Cell type:code id: tags:
``` python
mother = x['MOTHEDP']
mother_array = mother.to_numpy()
mother_array
```
%% Output
array([6, 2, 6, ..., 4, 5, 5], dtype=int64)
%% Cell type:code id: tags:
``` python
plt.contourf(xx, yy, Z, alpha=0.8)
colors = ['blue' if label == 0 else 'orange' for label in y]
scatter = plt.scatter(father_array, mother_array, c=y, edgecolors='k', marker='o', s=80, linewidth=0.8)
legend_labels = ['Did not use scholarships', 'Used scholarships']
plt.legend(handles=scatter.legend_elements()[0] , labels=legend_labels, title='Legend', loc=3, fontsize='x-small')
plt.title('Logistic Regression Decision Boundary')
plt.xlabel('Mother\'s education level')
plt.ylabel('Father\'s education level')
plt.show()
```
%% Output
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment