From d5e564b5948587e6d3ea8a6a7f01f245753ce67b Mon Sep 17 00:00:00 2001
From: "Taha Silat (tas574)" <taha.silat@usask.ca>
Date: Mon, 11 Dec 2023 00:17:37 -0600
Subject: [PATCH] Code for generating figures for methods report

---
 methods_report.ipynb | 542 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 542 insertions(+)
 create mode 100644 methods_report.ipynb

diff --git a/methods_report.ipynb b/methods_report.ipynb
new file mode 100644
index 0000000..232ec35
--- /dev/null
+++ b/methods_report.ipynb
@@ -0,0 +1,542 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: statsmodels in c:\\users\\tahas\\anaconda3\\lib\\site-packages (0.13.2)\n",
+      "Requirement already satisfied: numpy>=1.17 in c:\\users\\tahas\\anaconda3\\lib\\site-packages (from statsmodels) (1.21.5)\n",
+      "Requirement already satisfied: scipy>=1.3 in c:\\users\\tahas\\anaconda3\\lib\\site-packages (from statsmodels) (1.7.3)\n",
+      "Requirement already satisfied: pandas>=0.25 in c:\\users\\tahas\\anaconda3\\lib\\site-packages (from statsmodels) (1.4.2)\n",
+      "Requirement already satisfied: patsy>=0.5.2 in c:\\users\\tahas\\anaconda3\\lib\\site-packages (from statsmodels) (0.5.2)\n",
+      "Requirement already satisfied: packaging>=21.3 in c:\\users\\tahas\\anaconda3\\lib\\site-packages (from statsmodels) (21.3)\n",
+      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in c:\\users\\tahas\\anaconda3\\lib\\site-packages (from packaging>=21.3->statsmodels) (3.0.4)\n",
+      "Requirement already satisfied: pytz>=2020.1 in c:\\users\\tahas\\anaconda3\\lib\\site-packages (from pandas>=0.25->statsmodels) (2021.3)\n",
+      "Requirement already satisfied: python-dateutil>=2.8.1 in c:\\users\\tahas\\anaconda3\\lib\\site-packages (from pandas>=0.25->statsmodels) (2.8.2)\n",
+      "Requirement already satisfied: six in c:\\users\\tahas\\anaconda3\\lib\\site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install statsmodels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
+    "from scipy.stats import chi2\n",
+    "import statsmodels.api as sm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"NGS_data.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>PUMFID</th>\n",
+       "      <th>PGM_P405</th>\n",
+       "      <th>FATHEDP</th>\n",
+       "      <th>MOTHEDP</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>28111</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>28113</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>28114</td>\n",
+       "      <td>3</td>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>28116</td>\n",
+       "      <td>2</td>\n",
+       "      <td>4</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>28117</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19559</th>\n",
+       "      <td>63858</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19560</th>\n",
+       "      <td>63864</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19561</th>\n",
+       "      <td>63865</td>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19562</th>\n",
+       "      <td>63866</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19563</th>\n",
+       "      <td>63868</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>19564 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       PUMFID  PGM_P405  FATHEDP  MOTHEDP\n",
+       "0       28111         1        6        6\n",
+       "1       28113         2        2        2\n",
+       "2       28114         3        6        6\n",
+       "3       28116         2        4        5\n",
+       "4       28117         1        4        4\n",
+       "...       ...       ...      ...      ...\n",
+       "19559   63858         1        6        6\n",
+       "19560   63864         1        5        5\n",
+       "19561   63865         2        3        4\n",
+       "19562   63866         1        1        5\n",
+       "19563   63868         1        2        5\n",
+       "\n",
+       "[19564 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "columns_of_interest = ['PUMFID', 'PGM_P405', 'FATHEDP', 'MOTHEDP']\n",
+    "df = df[columns_of_interest]\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>PUMFID</th>\n",
+       "      <th>grade_average</th>\n",
+       "      <th>FATHEDP</th>\n",
+       "      <th>MOTHEDP</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>28111</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>28113</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>28114</td>\n",
+       "      <td>3</td>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>28116</td>\n",
+       "      <td>2</td>\n",
+       "      <td>4</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>28117</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17582</th>\n",
+       "      <td>63858</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17583</th>\n",
+       "      <td>63864</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17584</th>\n",
+       "      <td>63865</td>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17585</th>\n",
+       "      <td>63866</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17586</th>\n",
+       "      <td>63868</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>17587 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       PUMFID  grade_average  FATHEDP  MOTHEDP\n",
+       "0       28111              1        6        6\n",
+       "1       28113              2        2        2\n",
+       "2       28114              3        6        6\n",
+       "3       28116              2        4        5\n",
+       "4       28117              1        4        4\n",
+       "...       ...            ...      ...      ...\n",
+       "17582   63858              1        6        6\n",
+       "17583   63864              1        5        5\n",
+       "17584   63865              2        3        4\n",
+       "17585   63866              1        1        5\n",
+       "17586   63868              1        2        5\n",
+       "\n",
+       "[17587 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Drop rows where either parent's education is coded 99 or the grade is coded 9\n",
+    "# (presumably the survey's missing/not-stated codes; confirm against the codebook)\n",
+    "valid_rows = df['FATHEDP'].ne(99) & df['MOTHEDP'].ne(99) & df['PGM_P405'].ne(9)\n",
+    "\n",
+    "# Keep the valid rows, rename 'PGM_P405' to 'grade_average', and renumber the\n",
+    "# index from 0 so it is contiguous again after filtering\n",
+    "df = (df.loc[valid_rows]\n",
+    "        .rename(columns={'PGM_P405': 'grade_average'})\n",
+    "        .reset_index(drop=True))\n",
+    "\n",
+    "# Display the resulting dataframe\n",
+    "df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 720x432 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Grouped bar chart of father's vs. mother's education level\n",
+    "# Define the mapping for substitutions\n",
+    "education_mapping = {\n",
+    "    1: '< High School',\n",
+    "    2: 'High school',\n",
+    "    3: 'Trade certificate/diploma',\n",
+    "    4: 'College/other non-university certificate',\n",
+    "    5: 'University <= Bachelor',\n",
+    "    6: 'University > Bachelor'\n",
+    "}\n",
+    "\n",
+    "education_df = df.copy()\n",
+    "\n",
+    "# Replace values in the 'FATHEDP' and 'MOTHEDP' columns using the mapping\n",
+    "education_df['FATHEDP'] = education_df['FATHEDP'].map(education_mapping)\n",
+    "education_df['MOTHEDP'] = education_df['MOTHEDP'].map(education_mapping)\n",
+    "\n",
+    "# Count each level and put both series in mapping order: value_counts() sorts each\n",
+    "# series by its own frequency, which would pair mother bars with the wrong tick labels\n",
+    "education_levels = list(education_mapping.values())\n",
+    "fath_edu_counts = education_df['FATHEDP'].value_counts().reindex(education_levels, fill_value=0)\n",
+    "moth_edu_counts = education_df['MOTHEDP'].value_counts().reindex(education_levels, fill_value=0)\n",
+    "\n",
+    "# Plotting the bar chart\n",
+    "fig, ax = plt.subplots(figsize=(10, 6))\n",
+    "\n",
+    "bar_width = 0.35\n",
+    "bar_positions = range(len(education_levels))\n",
+    "\n",
+    "ax.bar(bar_positions, fath_edu_counts, width=bar_width, label='Father')\n",
+    "ax.bar([pos + bar_width for pos in bar_positions], moth_edu_counts, width=bar_width, label='Mother')\n",
+    "\n",
+    "ax.set_xticks([pos + bar_width/2 for pos in bar_positions])\n",
+    "ax.set_xticklabels(education_levels, rotation=45, ha='right')\n",
+    "\n",
+    "ax.set_ylabel('Number of Graduates')\n",
+    "ax.set_title('Education Level of Father and Mother')\n",
+    "ax.legend()\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "grade_mapping = {\n",
+    "    1: 'A',\n",
+    "    2: 'B',\n",
+    "    3: 'C - D',\n",
+    "    4: 'No grade assigned',\n",
+    "}\n",
+    "# Map from df's numeric codes so re-running this cell does not turn the labels into NaN\n",
+    "education_df['grade_average'] = df['grade_average'].map(grade_mapping)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Optimization terminated successfully.\n",
+      "         Current function value: 0.690982\n",
+      "         Iterations 4\n",
+      "                           Logit Regression Results                           \n",
+      "==============================================================================\n",
+      "Dep. Variable:           grade_binary   No. Observations:                14069\n",
+      "Model:                          Logit   Df Residuals:                    14063\n",
+      "Method:                           MLE   Df Model:                            5\n",
+      "Date:                Tue, 21 Nov 2023   Pseudo R-squ.:                0.003039\n",
+      "Time:                        18:09:29   Log-Likelihood:                -9721.4\n",
+      "converged:                       True   LL-Null:                       -9751.1\n",
+      "Covariance Type:            nonrobust   LLR p-value:                 1.720e-11\n",
+      "=======================================================================================================================\n",
+      "                                                          coef    std err          z      P>|z|      [0.025      0.975]\n",
+      "-----------------------------------------------------------------------------------------------------------------------\n",
+      "const                                                  -0.1134      0.053     -2.128      0.033      -0.218      -0.009\n",
+      "MOTHER_EDU_College/other non-university certificate     0.0083      0.066      0.127      0.899      -0.120       0.137\n",
+      "MOTHER_EDU_High school                                  0.1146      0.063      1.807      0.071      -0.010       0.239\n",
+      "MOTHER_EDU_Trade certificate/diploma                    0.0532      0.082      0.648      0.517      -0.108       0.214\n",
+      "MOTHER_EDU_University <= Bachelor                       0.1861      0.062      2.985      0.003       0.064       0.308\n",
+      "MOTHER_EDU_University > Bachelor                        0.4358      0.073      5.987      0.000       0.293       0.578\n",
+      "=======================================================================================================================\n"
+     ]
+    }
+   ],
+   "source": [
+    "# One-hot encode mother's education into a separate frame so the cell can be re-run;\n",
+    "# dtype=int because newer pandas returns bool dummies, which statsmodels can reject\n",
+    "model_df = pd.get_dummies(education_df, columns=['MOTHEDP'], prefix='MOTHER_EDU', dtype=int)\n",
+    "\n",
+    "# Map grade to binary outcome (1 if A, 0 otherwise)\n",
+    "model_df['grade_binary'] = (model_df['grade_average'] == 'A').astype(int)\n",
+    "\n",
+    "# Predictors: the dummies selected by name; '< High School' is dropped as the reference level\n",
+    "X = model_df.filter(like='MOTHER_EDU_').drop(columns='MOTHER_EDU_< High School')\n",
+    "y = model_df['grade_binary']  # Dependent variable (grade)\n",
+    "\n",
+    "# Add a constant to the independent variables for the logistic regression\n",
+    "X = sm.add_constant(X)\n",
+    "\n",
+    "# Split the data into training and testing sets\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+    "\n",
+    "# Fit logistic regression model on the training set\n",
+    "logit_model = sm.Logit(y_train, X_train)\n",
+    "result = logit_model.fit()\n",
+    "\n",
+    "# Display logistic regression summary\n",
+    "print(result.summary())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 576x432 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Plot the observed training outcome against the model's fitted values\n",
+    "sns.set(style=\"whitegrid\")\n",
+    "plt.figure(figsize=(8, 6))\n",
+    "reg_ax = sns.regplot(x=result.fittedvalues, y=y_train, logistic=True, scatter_kws={'s': 10})\n",
+    "reg_ax.set_title('Logistic Regression - Fitted Values vs. Observed')\n",
+    "reg_ax.set_xlabel('Fitted Values')\n",
+    "reg_ax.set_ylabel('Observed (Actual)')\n",
+    "plt.show()\n",
+    "\n",
+    "# Predict on the test set\n",
+    "y_pred = result.predict(X_test)\n",
+    "\n",
+    "# y_pred is not used further in this cell; it is kept for later evaluation"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
-- 
GitLab