commit 9135e7794252a08f79b0ed321dd482ca07f1bb78 Author: wuhanstudio Date: Mon Jan 3 09:49:45 2022 +0000 Initial Commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..87620ac --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.ipynb_checkpoints/ diff --git a/2_type_with_ct_score.ipynb b/2_type_with_ct_score.ipynb new file mode 100644 index 0000000..70fc1f7 --- /dev/null +++ b/2_type_with_ct_score.ipynb @@ -0,0 +1,2724 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Interpretable ML - COVID19\n", + "> Interpretable ML Research for COVID19\n", + "- toc:true\n", + "- branch: master\n", + "- badges: true\n", + "- comments: true\n", + "- author: Han Wu\n", + "- categories: [jupyter]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "np.set_printoptions(suppress=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "covid = pd.read_csv(\"dataset/covid.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AVG age for severity 0: 36.833333333333336\n", + "AVG age for severity 1: 47.45283018867924\n", + "AVG age for severity 2: 54.3125\n", + "AVG age for severity 3: 69.4\n" + ] + } + ], + "source": [ + "print(\"AVG age for severity 0:\", np.mean(covid[covid.Severity03 == 0].Age.to_numpy()))\n", + "print(\"AVG age for severity 1:\", np.mean(covid[covid.Severity03 == 1].Age.to_numpy()))\n", + "print(\"AVG age for severity 2:\", 
np.mean(covid[covid.Severity03 == 2].Age.to_numpy()))\n", + "print(\"AVG age for severity 3:\", np.mean(covid[covid.Severity03 == 3].Age.to_numpy()))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(92, 74)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "covid.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Data Wash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove features that have NULL values" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "remove_columns = ['MedNum', 'LVEF', 'SO2', 'PO2', 'YHZS', 'RML', 'RUL', 'RLL', 'LUL', 'LLL']" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid.drop(remove_columns, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove features that record time rather than biomarkers" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "remove_columns = ['Onset2Admi', 'Onset2CT1', 'Onset2CTPositive1', 'Onset2CTPeak']" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid.drop(remove_columns, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove Patients that have no records" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid[covid.Weight != \" \"]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid[covid.cTnI != \" \"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "String to Float" + 
] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "covid['Weight'] = covid['Weight'].astype(np.float64)\n", + "covid['Height'] = covid['Height'].astype(np.float64)\n", + "covid['cTnITimes'] = covid['cTnITimes'].astype(np.float64)\n", + "covid['cTnI'] = covid['cTnI'].astype(np.float64)\n", + "covid['NTproBNP'] = covid['NTproBNP'].astype(np.float64)\n", + "covid['Cr'] = covid['Cr'].astype(np.float64)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Train Test Split" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "y = covid.Severity01.to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "# Use Both\n", + "covid = covid.drop([\"Severity01\", \"Severity03\"], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# Use None\n", + "# covid = covid.drop([\"Severity01\", \"Severity03\", \"CTScore\", \"AIVolumeP\"], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Patient No. 
is irrelevant" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid.drop([\"No\"], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "X = covid\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.9, random_state = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 57), (9, 57))" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Sex', 'Age', 'AgeG1', 'Height', 'Weight', 'BMI', 'Temp', 'CTScore',\n", + " 'AIVolumeP', 'cTnITimes', 'cTnI', 'cTnICKMBOrdinal1',\n", + " 'cTnICKMBOrdinal2', 'AST', 'LDH', 'CK', 'CKMB', 'HBDH', 'HiCKMB',\n", + " 'NTproBNP', 'Cr', 'PCT1', 'WBC1', 'NEU1', 'LYM1', 'N2L1', 'CRP1',\n", + " 'ALB1', 'PCT2', 'WBC2', 'NEU2', 'LYM2', 'N2L2', 'CRP2', 'ALB2',\n", + " 'Sympton', 'Fever', 'Cough', 'Phlegm', 'Hemoptysis', 'SoreThroat',\n", + " 'Catarrh', 'Headache', 'ChestPain', 'Fatigue', 'SoreMuscle',\n", + " 'Stomachache', 'Diarrhea', 'PoorAppetite', 'NauseaNVomit',\n", + " 'Hypertention', 'Hyperlipedia', 'DM', 'Lung', 'CAD', 'Arrythmia',\n", + " 'Cancer'],\n", + " dtype='object')" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 0, 0, 1, 1, 0, 1, 0], dtype=int64)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## 3. Feature Selection" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.1 Basic Methods" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.1.1 Drop constant and Quasi-constant features" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_selection import VarianceThreshold" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "def drop_features(X_train, X_test, threshhold):\n", + " sel = VarianceThreshold(threshold=threshhold)\n", + " sel.fit(X_train)\n", + " print(\"No. of constant features:\",\n", + " len([\n", + " x for x in X_train.columns\n", + " if x not in X_train.columns[sel.get_support()]\n", + " ])\n", + " )\n", + " constant_features = [x for x in X_train.columns if x not in X_train.columns[sel.get_support()]]\n", + "\n", + " print(constant_features)\n", + " X_train.drop(labels=constant_features, axis=1, inplace=True)\n", + " X_test.drop(labels=constant_features, axis=1, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop constant and quasi-constant features" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No. 
of constant features: 2\n", + "['PCT2', 'Stomachache']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " return super().drop(\n" + ] + } + ], + "source": [ + "drop_features(X_train, X_test, 0.01)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 55), (9, 55))" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.1.2 Drop Duplicated Features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Maybe some symptoms are correlated" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No. of Duplicated Features: 1\n", + "['Arrythmia']\n" + ] + } + ], + "source": [ + "covid_t = covid.T\n", + "print(\"No. 
of Duplicated Features:\", covid_t.duplicated().sum())\n", + "print(covid_t[covid_t.duplicated()].index.values)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print out duplicated features" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CAD\n", + "Arrythmia\n" + ] + } + ], + "source": [ + "duplicated_feat = []\n", + "for i in range(0, len(X_train.columns)):\n", + " col_1 = X_train.columns[i]\n", + " for col_2 in X_train.columns[i + 1 : ]:\n", + " if X_train[col_1].equals(X_train[col_2]):\n", + " print(col_1)\n", + " print(col_2) \n", + " duplicated_feat.append(col_2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop duplicated features" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "# covid_unique = covid_t.drop_duplicates(keep='first').T" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "X_train.drop(labels=covid_t[covid_t.duplicated()].index.values, axis=1, inplace=True)\n", + "X_test.drop(labels=covid_t[covid_t.duplicated()].index.values, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 54), (9, 54))" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2 Correlations" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "categorical_features = ['Sex', 'AgeG1', \n", + " 'Fever', 'Cough', 'Phlegm', 'Hemoptysis', 'SoreThroat', 'Catarrh', 'Headache', 'ChestPain', 'Fatigue', 'SoreMuscle', # 'Stomachache', \n", + " 
'Diarrhea', 'PoorAppetite', 'NauseaNVomit', \n", + " 'Hypertention', 'Hyperlipedia', 'DM', 'Lung', #'CAD', 'Arrythmia', \n", + " 'Cancer']" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "numerical_features = ['Age', 'Height', 'Weight', 'BMI', 'Temp', 'cTnITimes', 'cTnI', 'cTnICKMBOrdinal1', 'cTnICKMBOrdinal2', 'AST',\n", + " 'LDH', 'CK', 'CKMB', 'HBDH', 'HiCKMB', 'NTproBNP', 'Cr', 'PCT1', 'WBC1',\n", + " 'NEU1', 'LYM1', 'N2L1', 'CRP1', 'ALB1', 'WBC2', 'NEU2', 'LYM2', 'N2L2',\n", + " 'CRP2', 'ALB2']" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "# numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", + "# numerical_vars = list(covid.select_dtypes(include=numerics).columns)\n", + "# data = covid[numerical_vars]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "corrmat = X_train.corr()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "fig.set_size_inches(11, 11)\n", + "sns.heatmap(corrmat)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature1feature2corr
0LDHHBDH0.958191
1HBDHLDH0.958191
2HeightPoorAppetite0.911704
3PoorAppetiteHeight0.911704
4WBC2NEU20.911419
5NEU2WBC20.911419
6WBC1NEU10.903520
7NEU1WBC10.903520
8AgeG1Age0.893413
9AgeAgeG10.893413
10AIVolumePCTScore0.874107
11CTScoreAIVolumeP0.874107
12cTnICKMBOrdinal1cTnICKMBOrdinal20.853741
13cTnICKMBOrdinal2cTnICKMBOrdinal10.853741
14LYM1LYM20.842688
15LYM2LYM10.842688
16BMIWeight0.842409
17WeightBMI0.842409
18NTproBNPN2L20.808767
19N2L2NTproBNP0.808767
\n", + "
" + ], + "text/plain": [ + " feature1 feature2 corr\n", + "0 LDH HBDH 0.958191\n", + "1 HBDH LDH 0.958191\n", + "2 Height PoorAppetite 0.911704\n", + "3 PoorAppetite Height 0.911704\n", + "4 WBC2 NEU2 0.911419\n", + "5 NEU2 WBC2 0.911419\n", + "6 WBC1 NEU1 0.903520\n", + "7 NEU1 WBC1 0.903520\n", + "8 AgeG1 Age 0.893413\n", + "9 Age AgeG1 0.893413\n", + "10 AIVolumeP CTScore 0.874107\n", + "11 CTScore AIVolumeP 0.874107\n", + "12 cTnICKMBOrdinal1 cTnICKMBOrdinal2 0.853741\n", + "13 cTnICKMBOrdinal2 cTnICKMBOrdinal1 0.853741\n", + "14 LYM1 LYM2 0.842688\n", + "15 LYM2 LYM1 0.842688\n", + "16 BMI Weight 0.842409\n", + "17 Weight BMI 0.842409\n", + "18 NTproBNP N2L2 0.808767\n", + "19 N2L2 NTproBNP 0.808767" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "corrmat = X_train.corr()\n", + "corrmat = corrmat.abs().unstack()\n", + "corrmat = corrmat.sort_values(ascending=False)\n", + "corrmat = corrmat[corrmat >= 0.8]\n", + "corrmat = corrmat[corrmat < 1]\n", + "corrmat = pd.DataFrame(corrmat).reset_index()\n", + "corrmat.columns = ['feature1', 'feature2', 'corr']\n", + "corrmat" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "found 10 correlated groups\n", + "out of 54 total features\n" + ] + } + ], + "source": [ + "# find groups of correlated features\n", + "\n", + "grouped_feature_ls = []\n", + "correlated_groups = []\n", + "\n", + "for feature in corrmat.feature1.unique():\n", + " if feature not in grouped_feature_ls:\n", + "\n", + " # find all features correlated to a single feature\n", + " correlated_block = corrmat[corrmat.feature1 == feature]\n", + " grouped_feature_ls = grouped_feature_ls + list(\n", + " correlated_block.feature2.unique()) + [feature]\n", + "\n", + " # append the block of features to the list\n", + " correlated_groups.append(correlated_block)\n", + "\n", + "print('found {} 
correlated groups'.format(len(correlated_groups)))\n", + "print('out of {} total features'.format(X_train.shape[1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " feature1 feature2 corr\n", + "0 LDH HBDH 0.958191\n", + "\n", + " feature1 feature2 corr\n", + "2 Height PoorAppetite 0.911704\n", + "\n", + " feature1 feature2 corr\n", + "4 WBC2 NEU2 0.911419\n", + "\n", + " feature1 feature2 corr\n", + "6 WBC1 NEU1 0.90352\n", + "\n", + " feature1 feature2 corr\n", + "8 AgeG1 Age 0.893413\n", + "\n", + " feature1 feature2 corr\n", + "10 AIVolumeP CTScore 0.874107\n", + "\n", + " feature1 feature2 corr\n", + "12 cTnICKMBOrdinal1 cTnICKMBOrdinal2 0.853741\n", + "\n", + " feature1 feature2 corr\n", + "14 LYM1 LYM2 0.842688\n", + "\n", + " feature1 feature2 corr\n", + "16 BMI Weight 0.842409\n", + "\n", + " feature1 feature2 corr\n", + "18 NTproBNP N2L2 0.808767\n", + "\n" + ] + } + ], + "source": [ + "# now we can visualise each group. 
We see that some groups contain\n", + "# only 2 correlated features, some other groups present several features \n", + "# that are correlated among themselves.\n", + "\n", + "for group in correlated_groups:\n", + " print(group)\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "def correlation(dataset, threshold):\n", + " col_corr = set()\n", + " corr_matrix = dataset.corr()\n", + " for i in range(len(corr_matrix.columns)):\n", + " for j in range(i):\n", + " if abs(corr_matrix.iloc[i, j] >= threshold):\n", + " colname = corr_matrix.columns[i]\n", + " col_corr.add(colname)\n", + " return col_corr" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['LYM2', 'NEU1', 'NEU2', 'BMI', 'N2L2', 'AgeG1', 'AIVolumeP', 'cTnICKMBOrdinal2', 'HBDH']\n" + ] + } + ], + "source": [ + "corr_features = list((correlation(X_train, 0.8)))\n", + "print(corr_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "for i in corr_features:\n", + " if i in categorical_features:\n", + " corr_features.remove(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "for i in corr_features:\n", + " if i in numerical_features:\n", + " numerical_features.remove(i)\n", + "\n", + "for i in corr_features:\n", + " if i in categorical_features:\n", + " categorical_features.remove(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['LYM2',\n", + " 'NEU1',\n", + " 'NEU2',\n", + " 'BMI',\n", + " 'N2L2',\n", + " 'AIVolumeP',\n", + " 'cTnICKMBOrdinal2',\n", + " 'HBDH']" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "corr_features" + ] + }, + { + 
"cell_type": "code", + "execution_count": 62, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " return super().drop(\n" + ] + } + ], + "source": [ + "X_train.drop(labels=corr_features, axis=1, inplace=True)\n", + "X_test.drop(labels=corr_features, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 46), (9, 46))" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.3 Statistical Methods" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.3.1 Mutual Information" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_selection import mutual_info_classif, mutual_info_regression\n", + "from sklearn.feature_selection import SelectKBest, SelectPercentile" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "mi = mutual_info_classif(X_train, y_train)\n", + "mi = pd.Series(mi)\n", + "mi.index = X_train.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Features on the left side have more mutual information with y" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 66, + "metadata": {}, + 
"output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "mi.sort_values(ascending=False).plot.bar(figsize=(20, 8))" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "sel_ = SelectKBest(mutual_info_classif, k = 40).fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "mi_features = list(X_train.columns[ ~ sel_.get_support()].values)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "for i in mi_features:\n", + " if i in categorical_features:\n", + " mi_features.remove(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "for i in mi_features:\n", + " if i in numerical_features:\n", + " numerical_features.remove(i)\n", + "\n", + "for i in mi_features:\n", + " if i in categorical_features:\n", + " categorical_features.remove(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Height', 'CK', 'HiCKMB', 'Cr', 'PCT1']" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mi_features" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " return super().drop(\n" + ] + } + ], + "source": [ + "X_train.drop(labels=mi_features, axis=1, 
inplace=True)\n", + "X_test.drop(labels=mi_features, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 41), (9, 41))" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.3.2 Fisher Score" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_selection import chi2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "categorical features" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "f_score = chi2(X_train[categorical_features], y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The smaller ones have more correlations" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Cancer 0.887949\n", + "SoreThroat 0.842057\n", + "Cough 0.703238\n", + "Headache 0.638344\n", + "Hemoptysis 0.594525\n", + "NauseaNVomit 0.356552\n", + "ChestPain 0.356552\n", + "Diarrhea 0.333947\n", + "Sex 0.302537\n", + "Fever 0.202574\n", + "Catarrh 0.159040\n", + "Hypertention 0.154388\n", + "SoreMuscle 0.105717\n", + "Hyperlipedia 0.099153\n", + "Lung 0.062605\n", + "PoorAppetite 0.060289\n", + "Phlegm 0.046410\n", + "AgeG1 0.037459\n", + "DM 0.008457\n", + "Fatigue 0.000049\n", + "dtype: float64" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p_values = pd.Series(f_score[1])\n", + "p_values.index = X_train[categorical_features].columns\n", + "p_values.sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { 
+ "data": { + "text/plain": [ + "array(['AgeG1', 'Phlegm', 'Fatigue', 'DM'], dtype=object)" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p_values[p_values<0.05].index.values" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "# for c in categorical_features:\n", + "# if c not in p_values[p_values<0.05].index.values:\n", + "# categorical_features.remove(c)\n", + "# print(c)\n", + "# X_train.drop(labels=c, axis=1, inplace=True)\n", + "# X_test.drop(labels=c, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 41), (9, 41))" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.3.3 Univariate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Non-categorical features" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_selection import f_classif, f_regression" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "univariate = f_classif(X_train[numerical_features], y_train)\n", + "univariate = pd.Series(univariate[1])\n", + "univariate.index = X_train[numerical_features].columns\n", + "univariate.sort_values(ascending=False, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "univariate.sort_values(ascending=False).plot.bar(figsize=(20, 8))" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Temp 0.437049\n", + "WBC1 0.430778\n", + "AST 0.376614\n", + "CKMB 0.351257\n", + "WBC2 0.207957\n", + "Weight 0.191531\n", + "dtype: float64" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "univariate[univariate > 0.05]" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Weight\n", + "AST\n", + "CKMB\n", + "WBC1\n", + "WBC2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " return super().drop(\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " return super().drop(\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " return super().drop(\n", + 
"d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " return super().drop(\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " return super().drop(\n" + ] + } + ], + "source": [ + "for n in numerical_features:\n", + " if n in univariate[univariate > 0.05].index.values:\n", + " numerical_features.remove(n)\n", + " print(n)\n", + " X_train.drop(labels=n, axis=1, inplace=True)\n", + " X_test.drop(labels=n, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(77, 36)" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.3.4 ROC-AUC" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor\n", + "from sklearn.metrics import roc_auc_score, mean_squared_error" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "# loop to build a tree, make predictions and get the roc-auc\n", + "# for each feature of the train set\n", + "\n", + "roc_values = []\n", + "for feature in X_train.columns:\n", + " clf = DecisionTreeClassifier()\n", + " 
clf.fit(X_train[feature].fillna(0).to_frame(), y_train)\n", + " y_scored = clf.predict_proba(X_test[feature].fillna(0).to_frame())\n", + " roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "cTnITimes 0.775\n", + "Fever 0.675\n", + "CRP2 0.675\n", + "SoreThroat 0.650\n", + "NTproBNP 0.650\n", + "ALB2 0.625\n", + "Headache 0.625\n", + "LYM1 0.625\n", + "N2L1 0.625\n", + "CTScore 0.600\n", + "AgeG1 0.550\n", + "ChestPain 0.500\n", + "Diarrhea 0.500\n", + "Hemoptysis 0.500\n", + "Phlegm 0.500\n", + "Age 0.500\n", + "NauseaNVomit 0.500\n", + "Lung 0.500\n", + "PoorAppetite 0.500\n", + "SoreMuscle 0.500\n", + "CRP1 0.500\n", + "cTnICKMBOrdinal1 0.500\n", + "cTnI 0.500\n", + "CAD 0.500\n", + "Fatigue 0.500\n", + "LDH 0.500\n", + "Sex 0.450\n", + "ALB1 0.450\n", + "Hypertention 0.425\n", + "DM 0.400\n", + "Hyperlipedia 0.400\n", + "Cancer 0.400\n", + "Catarrh 0.375\n", + "Sympton 0.375\n", + "Cough 0.325\n", + "Temp 0.175\n", + "dtype: float64" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's add the variable names and order it for clearer visualisation\n", + "roc_values = pd.Series(roc_values)\n", + "roc_values.index = X_train.columns\n", + "roc_values.sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# and now let's plot\n", + "roc_values.sort_values(ascending=False).plot.bar(figsize=(20, 8))" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# a roc auc value of 0.5 indicates random decision\n", + "# let's check how many features show a roc-auc value\n", + "# higher than random\n", + "\n", + "len(roc_values[roc_values > 0.5])" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Sex 0.450\n", + "Temp 0.175\n", + "LDH 0.500\n", + "ALB1 0.450\n", + "Sympton 0.375\n", + "Cough 0.325\n", + "Catarrh 0.375\n", + "Hypertention 0.425\n", + "Hyperlipedia 0.400\n", + "DM 0.400\n", + "Cancer 0.400\n", + "dtype: float64" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roc_values[roc_values < 0.5]" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "roc_features = roc_values[roc_values < 0.5].index.values" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "for i in roc_features:\n", + " if i in numerical_features:\n", + " numerical_features.remove(i)\n", + "\n", + "for i in roc_features:\n", + " if i in categorical_features:\n", + " categorical_features.remove(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Sex', 'Temp', 'LDH', 'ALB1', 'Sympton', 'Cough', 'Catarrh',\n", + " 'Hypertention', 'Hyperlipedia', 'DM', 'Cancer'], dtype=object)" + ] + }, + "execution_count": 
94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roc_features" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "# X_train.drop(labels=roc_features, axis=1, inplace=True)\n", + "# X_test.drop(labels=roc_features, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 36), (9, 36))" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Sex', 'Age', 'AgeG1', 'Temp', 'CTScore', 'cTnITimes', 'cTnI',\n", + " 'cTnICKMBOrdinal1', 'LDH', 'NTproBNP', 'LYM1', 'N2L1', 'CRP1', 'ALB1',\n", + " 'CRP2', 'ALB2', 'Sympton', 'Fever', 'Cough', 'Phlegm', 'Hemoptysis',\n", + " 'SoreThroat', 'Catarrh', 'Headache', 'ChestPain', 'Fatigue',\n", + " 'SoreMuscle', 'Diarrhea', 'PoorAppetite', 'NauseaNVomit',\n", + " 'Hypertention', 'Hyperlipedia', 'DM', 'Lung', 'CAD', 'Cancer'],\n", + " dtype='object')" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "import sklearn.ensemble\n", + "import sklearn.metrics\n", + "import xgboost as xgb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Cross Validation**" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import cross_val_score" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "def cv_score(classifier, X, y, scoring):\n", + " return cross_val_score(classifier, X, y, cv=5, scoring=scoring)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Decision Tree**" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DecisionTreeClassifier()" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dt = sklearn.tree.DecisionTreeClassifier()\n", + "\n", + "dt_f1 = cv_score(dt, X_train, y_train, 'f1')\n", + "\n", + "dt.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5642857142857143\n" + ] + } + ], + "source": [ + "print(np.mean(dt_f1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Random Forest**" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestClassifier()" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100)\n", + "\n", + "rf_f1 = cv_score(rf, X_train, y_train, 'f1')\n", + "\n", + "rf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.66\n" + ] + } + ], + "source": [ + "print(np.mean(rf_f1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**SVM**" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVC" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SVC(probability=True)" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "svc = SVC(probability=True)\n", + "\n", + "svc_f1 = cv_score(svc, X_train, y_train, 'f1')\n", + "\n", + "svc.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6733333333333333\n" + ] + } + ], + "source": [ + "print(np.mean(svc_f1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**XGBoost**" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[21:07:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. 
Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[21:07:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n", + "[21:07:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. 
Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[21:07:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. 
Explicitly set eval_metric if you'd like to restore the old behavior.\n", + "[21:07:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n", + "[21:07:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 
0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,\n", + " colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,\n", + " importance_type='gain', interaction_constraints='',\n", + " learning_rate=0.300000012, max_delta_step=0, max_depth=4,\n", + " min_child_weight=0, missing=nan, monotone_constraints='()',\n", + " n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,\n", + " reg_alpha=4, reg_lambda=1, scale_pos_weight=9, subsample=0.8,\n", + " tree_method='exact', validate_parameters=1, verbosity=None)" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create a model\n", + "# Params from: https://www.kaggle.com/aharless/swetha-s-xgboost-revised\n", + "xgbc = xgb.XGBClassifier(\n", + " max_depth = 4,\n", + " subsample = 0.8,\n", + " colsample_bytree = 0.7,\n", + " colsample_bylevel = 0.7,\n", + " scale_pos_weight = 9,\n", + " min_child_weight = 0,\n", + " reg_alpha = 4,\n", + " objective = 'binary:logistic'\n", + ")\n", + "\n", + "xgbc_f1 = cv_score(xgbc, X_train, y_train, 'f1')\n", + "\n", + "# Fit the models\n", + "xgbc.fit(np.array(X_train), np.array(y_train))" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6714285714285715\n" + ] + } + ], + "source": [ + "print(np.mean(xgbc_f1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. 
Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save Models" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"2-type-model-4-ct.pkl\", 'wb') as f:\n", + " pickle.dump([dt, rf, svc, xgbc], f)\n", + "with open(\"dataset/2-type-dataset-ct.pkl\", 'wb') as f:\n", + " pickle.dump([X_train, X_test, y_train, y_test], f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load Models" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"2-type-model-4-ct.pkl\", 'rb') as f:\n", + " [dt, rf, svc, xgbc] = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"dataset/2-type-dataset-ct.pkl\", 'rb') as f:\n", + " [X_train, X_test, y_train, y_test] = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Prediction**" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "dt_pred = dt.predict(X_test)\n", + "rf_pred = rf.predict(X_test)\n", + "svc_pred = svc.predict(X_test)\n", + "xgbc_pred = xgbc.predict(np.array(X_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [], + "source": [ + "import math" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Decision Tree\n", + "Precision: 0.7777777777777778\n", + "Recal: 0.75\n", + "F1: 0.75\n", + "CI: 0.27161661029914536\n", + "\n", + "Random Forest\n", + "Precision: 0.6666666666666666\n", + "Recal: 0.5\n", + "F1: 0.5714285714285715\n", + "CI: 0.3079842869168074\n", + "\n", + "SVC\n", + "Precision: 0.6666666666666666\n", + "Recal: 0.25\n", + "F1: 0.4\n", + "CI: 0.3079842869168074\n", + "\n", + "XGBoost\n", + "Precision: 0.7777777777777778\n", + "Recal: 1.0\n", + "F1: 0.8\n", + "CI: 0.27161661029914536\n" + ] + } + ], + "source": [ + "print(\"Decision Tree\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, dt_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, dt_pred))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, dt_pred))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, dt_pred)) * (sklearn.metrics.accuracy_score(y_test, dt_pred))) / len(X_test)))\n", + "\n", + "print()\n", + "\n", + "print(\"Random Forest\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, rf_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, rf_pred))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, rf_pred))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, rf_pred)) * (sklearn.metrics.accuracy_score(y_test, rf_pred))) / len(X_test)))\n", + "\n", + "print()\n", + "\n", + "print(\"SVC\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, svc_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, svc_pred))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, svc_pred))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, svc_pred)) * (sklearn.metrics.accuracy_score(y_test, svc_pred))) / len(X_test)))\n", + "\n", + "print()\n", + "\n", + "print(\"XGBoost\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, xgbc_pred))\n", + 
"print(\"Recal: \", sklearn.metrics.recall_score(y_test, xgbc_pred))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, xgbc_pred))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, xgbc_pred)) * (sklearn.metrics.accuracy_score(y_test, xgbc_pred))) / len(X_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7 Interpretation" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [], + "source": [ + "class_names = ['normal', 'severe']" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[27]" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Patient No. 21 --> 27 after shuffling\n", + "[i for i, x in enumerate(X_train['LDH']==254) if x]" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[59]" + ] + }, + "execution_count": 124, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Patient No. 36 --> 59 after shuffling\n", + "[i for i, x in enumerate(X_train['NTproBNP']==384) if x]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Patient No. 
21 (27) & 36 (59)**" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [], + "source": [ + "from lime import lime_tabular\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Sex', 'Age', 'AgeG1', 'Temp', 'CTScore', 'cTnITimes', 'cTnI',\n", + " 'cTnICKMBOrdinal1', 'LDH', 'NTproBNP', 'LYM1', 'N2L1', 'CRP1', 'ALB1',\n", + " 'CRP2', 'ALB2', 'Sympton', 'Fever', 'Cough', 'Phlegm', 'Hemoptysis',\n", + " 'SoreThroat', 'Catarrh', 'Headache', 'ChestPain', 'Fatigue',\n", + " 'SoreMuscle', 'Diarrhea', 'PoorAppetite', 'NauseaNVomit',\n", + " 'Hypertention', 'Hyperlipedia', 'DM', 'Lung', 'CAD', 'Cancer'],\n", + " dtype='object')" + ] + }, + "execution_count": 126, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [], + "source": [ + "categorical_features = [0, 2, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]\n", + "categorical_names = {}\n", + "for c in categorical_features:\n", + " categorical_names[c] = [\"False\", \"True\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Decision Tree**" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patient id: 59\n", + "Probability(normal) = 0.0\n", + "True class: severe\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "idx = 59\n", + "class_names = ['normal', 'severe']\n", + "\n", + "print('Patient id: %d' % idx)\n", + "print('Probability(normal) =', dt.predict_proba(np.array(X_train)[idx, :].reshape(1, -1))[0][0])\n", + "print('True class: %s' % class_names[y_train[idx]])\n", + "\n", + "explainer = lime_tabular.LimeTabularExplainer(np.array(X_train), \n", + " feature_names= X_train.columns, class_names = class_names, \n", + " categorical_features = categorical_features, categorical_names = categorical_names,\n", + " discretize_continuous=True)\n", + "exp = explainer.explain_instance(np.array(X_train)[idx, :], predict_fn = dt.predict_proba, num_features = 10)\n", + "\n", + "%matplotlib inline\n", + "fig = exp.as_pyplot_figure()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Random Forest**" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patient id: 59\n", + "Probability(normal) = 0.26\n", + "True class: severe\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "idx = 59\n", + "class_names = ['normal', 'severe']\n", + "\n", + "print('Patient id: %d' % idx)\n", + "print('Probability(normal) =', rf.predict_proba(np.array(X_train)[idx, :].reshape(1, -1))[0][0])\n", + "print('True class: %s' % class_names[y_train[idx]])\n", + "explainer = lime_tabular.LimeTabularExplainer(np.array(X_train), \n", + " feature_names= X_train.columns, class_names = class_names, \n", + " categorical_features = categorical_features, categorical_names = categorical_names,\n", + " discretize_continuous=True)\n", + "exp = explainer.explain_instance(np.array(X_train)[idx, :], predict_fn = rf.predict_proba, num_features = 10)\n", + "\n", + "%matplotlib inline\n", + "fig = exp.as_pyplot_figure()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**SVM**" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patient id: 59\n", + "Probability(normal) = 0.8825628851481154\n", + "True class: severe\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "idx = 59\n", + "class_names = ['normal', 'severe']\n", + "\n", + "print('Patient id: %d' % idx)\n", + "print('Probability(normal) =', svc.predict_proba(np.array(X_train)[idx, :].reshape(1, -1))[0][0])\n", + "print('True class: %s' % class_names[y_train[idx]])\n", + "explainer = lime_tabular.LimeTabularExplainer(np.array(X_train), \n", + " feature_names= X_train.columns, class_names = class_names, \n", + " categorical_features = categorical_features, categorical_names = categorical_names,\n", + " discretize_continuous=True)\n", + "exp = explainer.explain_instance(np.array(X_train)[idx, :], predict_fn = svc.predict_proba, num_features = 10)\n", + "\n", + "%matplotlib inline\n", + "fig = exp.as_pyplot_figure()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Gradient Boosted Trees**" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "def predict_fn_xbg(X):\n", + " X_data = pd.DataFrame(data=X, columns=xgbc.get_booster().feature_names)\n", + " return xgbc.predict_proba(X_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patient id: 59\n", + "Probability(normal) = 0.047768414\n", + "True class: severe\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "idx = 59\n", + "print('Patient id: %d' % idx)\n", + "print('Probability(normal) =', xgbc.predict_proba(X_train[idx:idx+1])[0][0])\n", + "print('True class: %s' % class_names[y_train[idx]])\n", + "explainer = lime_tabular.LimeTabularExplainer(np.array(X_train), \n", + " feature_names= X_train.columns, class_names = ['normal', 'severe'], \n", + " categorical_features = categorical_features, categorical_names = categorical_names,\n", + " discretize_continuous=True)\n", + "exp = explainer.explain_instance(np.array(X_train)[idx, :], predict_fn = predict_fn_xbg, num_features = 10)\n", + "\n", + "%matplotlib inline\n", + "fig = exp.as_pyplot_figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/2_type_without_ct_score.ipynb b/2_type_without_ct_score.ipynb new file mode 100644 index 0000000..92b659f --- /dev/null +++ b/2_type_without_ct_score.ipynb @@ -0,0 +1,2712 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Interpretable ML - COVID19\n", + "> Interpretable ML Research for COVID19\n", + "- toc:true\n", + "- branch: master\n", + "- badges: true\n", + "- comments: true\n", + "- author: Han Wu\n", + "- categories: [jupyter]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. 
Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "metadata": {}, + "outputs": [], + "source": [ + "np.set_printoptions(suppress=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 213, + "metadata": {}, + "outputs": [], + "source": [ + "covid = pd.read_csv(\"dataset/covid.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 214, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AVG age for severity 0: 36.833333333333336\n", + "AVG age for severity 1: 47.45283018867924\n", + "AVG age for severity 2: 54.3125\n", + "AVG age for severity 3: 69.4\n" + ] + } + ], + "source": [ + "print(\"AVG age for severity 0:\", np.mean(covid[covid.Severity03 == 0].Age.to_numpy()))\n", + "print(\"AVG age for severity 1:\", np.mean(covid[covid.Severity03 == 1].Age.to_numpy()))\n", + "print(\"AVG age for severity 2:\", np.mean(covid[covid.Severity03 == 2].Age.to_numpy()))\n", + "print(\"AVG age for severity 3:\", np.mean(covid[covid.Severity03 == 3].Age.to_numpy()))" + ] + }, + { + "cell_type": "code", + "execution_count": 215, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(92, 74)" + ] + }, + "execution_count": 215, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "covid.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Wash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove Features that have NULL values" + ] + }, + { + "cell_type": "code", + "execution_count": 216, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "remove_columns = ['MedNum', 'LVEF', 'SO2', 'PO2', 'YHZS', 'RML', 'RUL', 'RLL', 'LUL', 'LLL']" + ] + }, + { + "cell_type": "code", + "execution_count": 217, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid.drop(remove_columns, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove Features that record time rather than biomarkers" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "metadata": {}, + "outputs": [], + "source": [ + "remove_columns = ['Onset2Admi', 'Onset2CT1', 'Onset2CTPositive1', 'Onset2CTPeak']" + ] + }, + { + "cell_type": "code", + "execution_count": 219, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid.drop(remove_columns, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove Patients that have no records" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid[covid.Weight != \" \"]" + ] + }, + { + "cell_type": "code", + "execution_count": 221, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid[covid.cTnI != \" \"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "String to Float" + ] + }, + { + "cell_type": "code", + "execution_count": 222, + "metadata": {}, + "outputs": [], + "source": [ + "covid['Weight'] = covid['Weight'].astype(np.float64)\n", + "covid['Height'] = covid['Height'].astype(np.float64)\n", + "covid['cTnITimes'] = covid['cTnITimes'].astype(np.float64)\n", + "covid['cTnI'] = covid['cTnI'].astype(np.float64)\n", + "covid['NTproBNP'] = covid['NTproBNP'].astype(np.float64)\n", + "covid['Cr'] = covid['Cr'].astype(np.float64)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## 2. Train Test Split" + ] + }, + { + "cell_type": "code", + "execution_count": 223, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 224, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 3\n", + "2 1\n", + "3 1\n", + "5 2\n", + "7 0\n", + " ..\n", + "87 2\n", + "88 1\n", + "89 2\n", + "90 1\n", + "91 0\n", + "Name: Severity03, Length: 86, dtype: int64" + ] + }, + "execution_count": 224, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "covid.Severity03" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "y = covid.Severity01.to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Use Both\n", + "# covid = covid.drop([\"Severity01\", \"Severity03\"], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Use None\n", + "covid = covid.drop([\"Severity01\", \"Severity03\", \"CTScore\", \"AIVolumeP\"], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Patient No. 
is irrrelevant" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid.drop([\"No\"], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "X = covid\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.9, random_state = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 55), (9, 55))" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Sex', 'Age', 'AgeG1', 'Height', 'Weight', 'BMI', 'Temp', 'cTnITimes',\n", + " 'cTnI', 'cTnICKMBOrdinal1', 'cTnICKMBOrdinal2', 'AST', 'LDH', 'CK',\n", + " 'CKMB', 'HBDH', 'HiCKMB', 'NTproBNP', 'Cr', 'PCT1', 'WBC1', 'NEU1',\n", + " 'LYM1', 'N2L1', 'CRP1', 'ALB1', 'PCT2', 'WBC2', 'NEU2', 'LYM2', 'N2L2',\n", + " 'CRP2', 'ALB2', 'Sympton', 'Fever', 'Cough', 'Phlegm', 'Hemoptysis',\n", + " 'SoreThroat', 'Catarrh', 'Headache', 'ChestPain', 'Fatigue',\n", + " 'SoreMuscle', 'Stomachache', 'Diarrhea', 'PoorAppetite', 'NauseaNVomit',\n", + " 'Hypertention', 'Hyperlipedia', 'DM', 'Lung', 'CAD', 'Arrythmia',\n", + " 'Cancer'],\n", + " dtype='object')" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 0, 0, 1, 1, 0, 1, 0], dtype=int64)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 
def drop_features(X_train, X_test, threshhold):
    """Drop constant / quasi-constant columns from both splits, in place.

    A ``VarianceThreshold`` selector is fitted on ``X_train`` only. Every
    column the selector rejects is reported on stdout and then removed from
    ``X_train`` and ``X_test`` alike, so both frames keep the same columns.

    Args:
        X_train: training DataFrame; used to fit the selector and modified
            in place.
        X_test: test DataFrame; the same columns are removed, in place.
        threshhold: variance cut-off forwarded to ``VarianceThreshold``.

    Returns:
        None. Both frames are mutated.
    """
    selector = VarianceThreshold(threshold=threshhold)
    selector.fit(X_train)

    # Columns that survive the variance filter; everything else is dropped.
    retained = X_train.columns[selector.get_support()]
    constant_features = [
        name for name in X_train.columns if name not in retained
    ]

    print("No. of constant features:", len(constant_features))
    print(constant_features)

    for frame in (X_train, X_test):
        frame.drop(labels=constant_features, axis=1, inplace=True)
of constant features: 2\n", + "['PCT2', 'Stomachache']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " return super().drop(\n" + ] + } + ], + "source": [ + "drop_features(X_train, X_test, 0.01)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 53), (9, 53))" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.1.2 Drop Duplicated Features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Maybe some symptoms are correlated" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No. of Duplicated Features: 1\n", + "['Arrythmia']\n" + ] + } + ], + "source": [ + "covid_t = covid.T\n", + "print(\"No. 
of Duplicated Features:\", covid_t.duplicated().sum())\n", + "print(covid_t[covid_t.duplicated()].index.values)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print out duplicated features" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CAD\n", + "Arrythmia\n" + ] + } + ], + "source": [ + "duplicated_feat = []\n", + "for i in range(0, len(X_train.columns)):\n", + " col_1 = X_train.columns[i]\n", + " for col_2 in X_train.columns[i + 1 : ]:\n", + " if X_train[col_1].equals(X_train[col_2]):\n", + " print(col_1)\n", + " print(col_2) \n", + " duplicated_feat.append(col_2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop duplicated features" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# covid_unique = covid_t.drop_duplicates(keep='first').T" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "X_train.drop(labels=covid_t[covid_t.duplicated()].index.values, axis=1, inplace=True)\n", + "X_test.drop(labels=covid_t[covid_t.duplicated()].index.values, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 52), (9, 52))" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2 Correlations" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "categorical_features = ['Sex', 'AgeG1', \n", + " 'Fever', 'Cough', 'Phlegm', 'Hemoptysis', 'SoreThroat', 'Catarrh', 'Headache', 'ChestPain', 'Fatigue', 'SoreMuscle', # 'Stomachache', \n", + " 
'Diarrhea', 'PoorAppetite', 'NauseaNVomit', \n", + " 'Hypertention', 'Hyperlipedia', 'DM', 'Lung', #'CAD', 'Arrythmia', \n", + " 'Cancer']" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "numerical_features = ['Age', 'Height', 'Weight', 'BMI', 'Temp', 'cTnITimes', 'cTnI', 'cTnICKMBOrdinal1', 'cTnICKMBOrdinal2', 'AST',\n", + " 'LDH', 'CK', 'CKMB', 'HBDH', 'HiCKMB', 'NTproBNP', 'Cr', 'PCT1', 'WBC1',\n", + " 'NEU1', 'LYM1', 'N2L1', 'CRP1', 'ALB1', 'WBC2', 'NEU2', 'LYM2', 'N2L2',\n", + " 'CRP2', 'ALB2']" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "# numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", + "# numerical_vars = list(covid.select_dtypes(include=numerics).columns)\n", + "# data = covid[numerical_vars]" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "corrmat = X_train.corr()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "fig.set_size_inches(11, 11)\n", + "sns.heatmap(corrmat)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature1feature2corr
0LDHHBDH0.958191
1HBDHLDH0.958191
2HeightPoorAppetite0.911704
3PoorAppetiteHeight0.911704
4WBC2NEU20.911419
5NEU2WBC20.911419
6WBC1NEU10.903520
7NEU1WBC10.903520
8AgeG1Age0.893413
9AgeAgeG10.893413
10cTnICKMBOrdinal2cTnICKMBOrdinal10.853741
11cTnICKMBOrdinal1cTnICKMBOrdinal20.853741
12LYM1LYM20.842688
13LYM2LYM10.842688
14WeightBMI0.842409
15BMIWeight0.842409
16N2L2NTproBNP0.808767
17NTproBNPN2L20.808767
\n", + "
" + ], + "text/plain": [ + " feature1 feature2 corr\n", + "0 LDH HBDH 0.958191\n", + "1 HBDH LDH 0.958191\n", + "2 Height PoorAppetite 0.911704\n", + "3 PoorAppetite Height 0.911704\n", + "4 WBC2 NEU2 0.911419\n", + "5 NEU2 WBC2 0.911419\n", + "6 WBC1 NEU1 0.903520\n", + "7 NEU1 WBC1 0.903520\n", + "8 AgeG1 Age 0.893413\n", + "9 Age AgeG1 0.893413\n", + "10 cTnICKMBOrdinal2 cTnICKMBOrdinal1 0.853741\n", + "11 cTnICKMBOrdinal1 cTnICKMBOrdinal2 0.853741\n", + "12 LYM1 LYM2 0.842688\n", + "13 LYM2 LYM1 0.842688\n", + "14 Weight BMI 0.842409\n", + "15 BMI Weight 0.842409\n", + "16 N2L2 NTproBNP 0.808767\n", + "17 NTproBNP N2L2 0.808767" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "corrmat = X_train.corr()\n", + "corrmat = corrmat.abs().unstack()\n", + "corrmat = corrmat.sort_values(ascending=False)\n", + "corrmat = corrmat[corrmat >= 0.8]\n", + "corrmat = corrmat[corrmat < 1]\n", + "corrmat = pd.DataFrame(corrmat).reset_index()\n", + "corrmat.columns = ['feature1', 'feature2', 'corr']\n", + "corrmat" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "found 9 correlated groups\n", + "out of 52 total features\n" + ] + } + ], + "source": [ + "# find groups of correlated features\n", + "\n", + "grouped_feature_ls = []\n", + "correlated_groups = []\n", + "\n", + "for feature in corrmat.feature1.unique():\n", + " if feature not in grouped_feature_ls:\n", + "\n", + " # find all features correlated to a single feature\n", + " correlated_block = corrmat[corrmat.feature1 == feature]\n", + " grouped_feature_ls = grouped_feature_ls + list(\n", + " correlated_block.feature2.unique()) + [feature]\n", + "\n", + " # append the block of features to the list\n", + " correlated_groups.append(correlated_block)\n", + "\n", + "print('found {} correlated groups'.format(len(correlated_groups)))\n", + "print('out of {} 
def correlation(dataset, threshold):
    """Return the columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``DataFrame.corr()``. For every pair of columns whose *absolute*
    correlation is at least ``threshold``, the column that appears later in
    ``dataset`` is selected, so the first column of each correlated pair is
    kept.

    Args:
        dataset: DataFrame whose columns are compared pairwise.
        threshold: minimum absolute correlation for a pair to count as
            correlated.

    Returns:
        set: names of the columns to drop.
    """
    # Take the magnitude of the coefficients *before* comparing: the previous
    # form, abs(value >= threshold), applied abs() to the boolean result and
    # therefore never flagged strongly negative correlations.
    corr_matrix = dataset.corr().abs()
    columns = corr_matrix.columns

    col_corr = set()
    for i in range(1, len(columns)):
        # Only look at the strictly-lower triangle (columns before i) so each
        # pair is examined once and a column is never compared with itself.
        if (corr_matrix.iloc[i, :i] >= threshold).any():
            col_corr.add(columns[i])
    return col_corr
# Correlation-based pruning is applied to numerical features only, so every
# categorical feature is taken out of the drop list first.
# The filtered list is built in one pass: calling corr_features.remove()
# while iterating corr_features skips the element that follows each removal,
# so two adjacent categorical features would not both be filtered out.
# Slice assignment keeps the same list object alive.
corr_features[:] = [
    name for name in corr_features if name not in categorical_features
]

# Keep the bookkeeping lists in sync with the columns about to be dropped.
for i in corr_features:
    if i in numerical_features:
        numerical_features.remove(i)

for i in corr_features:
    if i in categorical_features:
        categorical_features.remove(i)

corr_features

X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)
3.3.1 Mutual Information" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_selection import mutual_info_classif, mutual_info_regression\n", + "from sklearn.feature_selection import SelectKBest, SelectPercentile" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "mi = mutual_info_classif(X_train, y_train)\n", + "mi = pd.Series(mi)\n", + "mi.index = X_train.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Features on the left side have more mutual information with y" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
# Keep the 40 features with the highest mutual information with the target.
# NOTE(review): mutual_info_classif takes a random_state argument; it is not
# set here, so the selected set may differ between runs - confirm whether a
# fixed seed is wanted.
sel_ = SelectKBest(mutual_info_classif, k = 40).fit(X_train, y_train)

# Everything the selector did not keep is a candidate for removal.
mi_features = list(X_train.columns[ ~ sel_.get_support()].values)

# Categorical features are never dropped at this stage. Build the filtered
# list in one pass: calling mi_features.remove() while iterating mi_features
# skips the element that follows each removal, so adjacent categorical
# features would not all be filtered out.
mi_features = [
    name for name in mi_features if name not in categorical_features
]

# Keep the bookkeeping lists in sync with the columns about to be dropped.
for i in mi_features:
    if i in numerical_features:
        numerical_features.remove(i)

for i in mi_features:
    if i in categorical_features:
        categorical_features.remove(i)

mi_features

X_train.drop(labels=mi_features, axis=1, inplace=True)
"X_test.drop(labels=mi_features, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 41), (9, 41))" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.3.2 Fisher Score" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_selection import chi2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "categorical features" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "f_score = chi2(X_train[categorical_features], y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The smaller ones have more correlations" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Cancer 0.887949\n", + "SoreThroat 0.842057\n", + "Cough 0.703238\n", + "Headache 0.638344\n", + "Hemoptysis 0.594525\n", + "NauseaNVomit 0.356552\n", + "ChestPain 0.356552\n", + "Diarrhea 0.333947\n", + "Sex 0.302537\n", + "Fever 0.202574\n", + "Catarrh 0.159040\n", + "Hypertention 0.154388\n", + "SoreMuscle 0.105717\n", + "Hyperlipedia 0.099153\n", + "Lung 0.062605\n", + "PoorAppetite 0.060289\n", + "Phlegm 0.046410\n", + "AgeG1 0.037459\n", + "DM 0.008457\n", + "Fatigue 0.000049\n", + "dtype: float64" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p_values = pd.Series(f_score[1])\n", + "p_values.index = X_train[categorical_features].columns\n", + "p_values.sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "array(['AgeG1', 'Phlegm', 'Fatigue', 'DM'], dtype=object)" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p_values[p_values<0.05].index.values" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "# for c in categorical_features:\n", + "# if c not in p_values[p_values<0.05].index.values:\n", + "# categorical_features.remove(c)\n", + "# print(c)\n", + "# X_train.drop(labels=c, axis=1, inplace=True)\n", + "# X_test.drop(labels=c, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 41), (9, 41))" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.3.3 Univariate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Non-categorical features" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_selection import f_classif, f_regression" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "univariate = f_classif(X_train[numerical_features], y_train)\n", + "univariate = pd.Series(univariate[1])\n", + "univariate.index = X_train[numerical_features].columns\n", + "univariate.sort_values(ascending=False, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "univariate.sort_values(ascending=False).plot.bar(figsize=(20, 8))" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PCT1 0.802511\n", + "WBC1 0.430778\n", + "AST 0.376614\n", + "CKMB 0.351257\n", + "WBC2 0.207957\n", + "Weight 0.191531\n", + "Cr 0.062507\n", + "dtype: float64" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "univariate[univariate > 0.05]" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Weight\n", + "AST\n", + "CKMB\n", + "Cr\n", + "WBC1\n", + "WBC2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " return super().drop(\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " return super().drop(\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + 
# Drop every numerical feature whose ANOVA F-test p-value exceeds 0.05.
#
# The names are collected first and removed afterwards. Removing from
# numerical_features while iterating over it skips the element that follows
# each removal, so a feature listed directly after a removed one (PCT1,
# which follows Cr) was never tested and stayed in the data.
insignificant = set(univariate[univariate > 0.05].index.values)
to_drop = [n for n in numerical_features if n in insignificant]

for n in to_drop:
    print(n)

# Slice assignment keeps the same list object alive.
numerical_features[:] = [
    n for n in numerical_features if n not in insignificant
]

# One drop per frame instead of one per feature.
X_train.drop(labels=to_drop, axis=1, inplace=True)
X_test.drop(labels=to_drop, axis=1, inplace=True)
# Score every remaining feature on its own: fit a decision tree on that one
# column of the train set and measure the ROC-AUC of its predicted
# probabilities on the test set.

def _single_feature_auc(column):
    """Return the test ROC-AUC of a decision tree trained on one column."""
    train_frame = X_train[column].fillna(0).to_frame()
    test_frame = X_test[column].fillna(0).to_frame()

    tree = DecisionTreeClassifier()
    tree.fit(train_frame, y_train)

    # Column 1 of predict_proba is passed to roc_auc_score as the score.
    positive_proba = tree.predict_proba(test_frame)[:, 1]
    return roc_auc_score(y_test, positive_proba)


roc_values = [_single_feature_auc(feature) for feature in X_train.columns]

# Attach the feature names and show the scores from best to worst.
roc_values = pd.Series(roc_values)
roc_values.index = X_train.columns
roc_values.sort_values(ascending=False)
{ + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# and now let's plot\n", + "roc_values.sort_values(ascending=False).plot.bar(figsize=(20, 8))" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# a roc auc value of 0.5 indicates random decision\n", + "# let's check how many features show a roc-auc value\n", + "# higher than random\n", + "\n", + "len(roc_values[roc_values > 0.5])" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Sex 0.450\n", + "LDH 0.500\n", + "ALB1 0.450\n", + "Sympton 0.375\n", + "Cough 0.325\n", + "Catarrh 0.375\n", + "Hypertention 0.425\n", + "Hyperlipedia 0.400\n", + "DM 0.400\n", + "Cancer 0.400\n", + "dtype: float64" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roc_values[roc_values < 0.5]" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "roc_features = roc_values[roc_values < 0.5].index.values" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "for i in roc_features:\n", + " if i in numerical_features:\n", + " numerical_features.remove(i)\n", + "\n", + "for i in roc_features:\n", + " if i in categorical_features:\n", + " categorical_features.remove(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Sex', 'LDH', 'ALB1', 'Sympton', 'Cough', 'Catarrh',\n", + " 'Hypertention', 'Hyperlipedia', 'DM', 'Cancer'], dtype=object)" + ] + }, + "execution_count": 76, + "metadata": {}, + 
def cv_score(classifier, X, y, scoring, cv=5):
    """Cross-validate ``classifier`` and return the per-fold scores.

    Thin wrapper around ``sklearn.model_selection.cross_val_score``.

    Args:
        classifier: estimator to evaluate.
        X: feature matrix.
        y: target labels.
        scoring: scoring name or callable forwarded to ``cross_val_score``
            (the notebook passes ``'f1'``).
        cv: cross-validation setting forwarded to ``cross_val_score``.
            Defaults to 5, the value that used to be hard-coded, so
            existing calls behave exactly as before.

    Returns:
        Whatever ``cross_val_score`` returns for these arguments (the
        notebook averages it with ``np.mean``).
    """
    return cross_val_score(classifier, X, y, cv=cv, scoring=scoring)
sklearn.tree.DecisionTreeClassifier()\n", + "\n", + "dt_f1 = cv_score(dt, X_train, y_train, 'f1')\n", + "\n", + "dt.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5476190476190477\n" + ] + } + ], + "source": [ + "print(np.mean(dt_f1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Random Forest**" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestClassifier()" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100)\n", + "\n", + "rf_f1 = cv_score(rf, X_train, y_train, 'f1')\n", + "\n", + "rf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6457142857142858\n" + ] + } + ], + "source": [ + "print(np.mean(rf_f1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**SVM**" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVC" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SVC(probability=True)" + ] + }, + "execution_count": 199, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "svc = SVC(probability=True)\n", + "\n", + "svc_f1 = cv_score(svc, X_train, y_train, 'f1')\n", + "\n", + "svc.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6733333333333333\n" + ] + } + ], + "source": [ + "print(np.mean(svc_f1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**XGBoost**" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[20:15:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n", + "[20:15:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 
0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[20:15:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n", + "[20:15:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n", + "[20:15:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. 
To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[20:15:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. 
Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "data": { + "text/plain": [ + "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,\n", + " colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,\n", + " importance_type='gain', interaction_constraints='',\n", + " learning_rate=0.300000012, max_delta_step=0, max_depth=4,\n", + " min_child_weight=0, missing=nan, monotone_constraints='()',\n", + " n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,\n", + " reg_alpha=4, reg_lambda=1, scale_pos_weight=9, subsample=0.8,\n", + " tree_method='exact', validate_parameters=1, verbosity=None)" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create a model\n", + "# Params from: https://www.kaggle.com/aharless/swetha-s-xgboost-revised\n", + "xgbc = xgb.XGBClassifier(\n", + " max_depth = 4,\n", + " subsample = 0.8,\n", + " colsample_bytree = 0.7,\n", + " colsample_bylevel = 0.7,\n", + " scale_pos_weight = 9,\n", + " min_child_weight = 0,\n", + " reg_alpha = 4,\n", + " objective = 'binary:logistic'\n", + ")\n", + "\n", + "xgbc_f1 = cv_score(xgbc, X_train, y_train, 'f1')\n", + "\n", + "# Fit the models\n", + "xgbc.fit(np.array(X_train), np.array(y_train))" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.698095238095238\n" + ] + } + ], + "source": [ + "print(np.mean(xgbc_f1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. 
Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save Models" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 226, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"2-type-model-4.pkl\", 'wb') as f:\n", + " pickle.dump([dt, rf, svc, xgbc], f)\n", + "with open(\"dataset/2-type-dataset.pkl\", 'wb') as f:\n", + " pickle.dump([X_train, X_test, y_train, y_test], f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load Models" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"2-type-model-4.pkl\", 'rb') as f:\n", + " [dt, rf, svc, xgbc] = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 204, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"dataset/2-type-dataset.pkl\", 'rb') as f:\n", + " [X_train, X_test, y_train, y_test] = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Prediction**" + ] + }, + { + "cell_type": "code", + "execution_count": 205, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": 206, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "dt_pred = dt.predict(X_test)\n", + "rf_pred = rf.predict(X_test)\n", + "svc_pred = svc.predict(X_test)\n", + "xgbc_pred = xgbc.predict(np.array(X_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 207, + "metadata": {}, + "outputs": [], + "source": [ + "import math" + ] + }, + { + "cell_type": "code", + "execution_count": 208, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "Decision Tree\n", + "Precision: 0.8888888888888888\n", + "Recal: 0.75\n", + "F1: 0.8571428571428571\n", + "CI: 0.20532285794453828\n", + "\n", + "Random Forest\n", + "Precision: 0.7777777777777778\n", + "Recal: 0.75\n", + "F1: 0.75\n", + "CI: 0.27161661029914536\n", + "\n", + "SVC\n", + "Precision: 0.6666666666666666\n", + "Recal: 0.25\n", + "F1: 0.4\n", + "CI: 0.3079842869168074\n", + "\n", + "XGBoost\n", + "Precision: 0.7777777777777778\n", + "Recal: 1.0\n", + "F1: 0.8\n", + "CI: 0.27161661029914536\n" + ] + } + ], + "source": [ + "print(\"Decision Tree\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, dt_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, dt_pred))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, dt_pred))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, dt_pred)) * (sklearn.metrics.accuracy_score(y_test, dt_pred))) / len(X_test)))\n", + "\n", + "print()\n", + "\n", + "print(\"Random Forest\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, rf_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, rf_pred))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, rf_pred))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, rf_pred)) * (sklearn.metrics.accuracy_score(y_test, rf_pred))) / len(X_test)))\n", + "\n", + "print()\n", + "\n", + "print(\"SVC\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, svc_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, svc_pred))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, svc_pred))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, svc_pred)) * (sklearn.metrics.accuracy_score(y_test, svc_pred))) / len(X_test)))\n", + "\n", + "print()\n", + "\n", + "print(\"XGBoost\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, xgbc_pred))\n", + 
"print(\"Recal: \", sklearn.metrics.recall_score(y_test, xgbc_pred))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, xgbc_pred))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, xgbc_pred)) * (sklearn.metrics.accuracy_score(y_test, xgbc_pred))) / len(X_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Interpretation" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [], + "source": [ + "class_names = ['normal', 'severe']" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[27]" + ] + }, + "execution_count": 184, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Patient No. 21 --> 27 after shuffling\n", + "[i for i, x in enumerate(X_train['LDH']==254) if x]" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[59]" + ] + }, + "execution_count": 187, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Patient No. 36 --> 59 after shuffling\n", + "[i for i, x in enumerate(X_train['NTproBNP']==384) if x]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Patient No. 
21 (27) & 36 (59)**" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [], + "source": [ + "from lime import lime_tabular\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Sex', 'Age', 'AgeG1', 'cTnITimes', 'cTnI', 'cTnICKMBOrdinal1', 'LDH',\n", + " 'NTproBNP', 'PCT1', 'LYM1', 'N2L1', 'CRP1', 'ALB1', 'CRP2', 'ALB2',\n", + " 'Sympton', 'Fever', 'Cough', 'Phlegm', 'Hemoptysis', 'SoreThroat',\n", + " 'Catarrh', 'Headache', 'ChestPain', 'Fatigue', 'SoreMuscle', 'Diarrhea',\n", + " 'PoorAppetite', 'NauseaNVomit', 'Hypertention', 'Hyperlipedia', 'DM',\n", + " 'Lung', 'CAD', 'Cancer'],\n", + " dtype='object')" + ] + }, + "execution_count": 188, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 189, + "metadata": {}, + "outputs": [], + "source": [ + "categorical_features = [0, 2, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]\n", + "categorical_names = {}\n", + "for c in categorical_features:\n", + " categorical_names[c] = [\"False\", \"True\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Decision Tree**" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patient id: 59\n", + "Probability(normal) = 0.0\n", + "True class: severe\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "idx = 59\n", + "class_names = ['normal', 'severe']\n", + "\n", + "print('Patient id: %d' % idx)\n", + "print('Probability(normal) =', dt.predict_proba(np.array(X_train)[idx, :].reshape(1, -1))[0][0])\n", + "print('True class: %s' % class_names[y_train[idx]])\n", + "\n", + "explainer = lime_tabular.LimeTabularExplainer(np.array(X_train), \n", + " feature_names= X_train.columns, class_names = class_names, \n", + " categorical_features = categorical_features, categorical_names = categorical_names,\n", + " discretize_continuous=True)\n", + "exp = explainer.explain_instance(np.array(X_train)[idx, :], predict_fn = dt.predict_proba, num_features = 10)\n", + "\n", + "%matplotlib inline\n", + "fig = exp.as_pyplot_figure()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Random Forest**" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patient id: 59\n", + "Probability(normal) = 0.23\n", + "True class: severe\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "idx = 59\n", + "class_names = ['normal', 'severe']\n", + "\n", + "print('Patient id: %d' % idx)\n", + "print('Probability(normal) =', rf.predict_proba(np.array(X_train)[idx, :].reshape(1, -1))[0][0])\n", + "print('True class: %s' % class_names[y_train[idx]])\n", + "explainer = lime_tabular.LimeTabularExplainer(np.array(X_train), \n", + " feature_names= X_train.columns, class_names = class_names, \n", + " categorical_features = categorical_features, categorical_names = categorical_names,\n", + " discretize_continuous=True)\n", + "exp = explainer.explain_instance(np.array(X_train)[idx, :], predict_fn = rf.predict_proba, num_features = 10)\n", + "\n", + "%matplotlib inline\n", + "fig = exp.as_pyplot_figure()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**SVM**" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patient id: 59\n", + "Probability(normal) = 0.8801943355780342\n", + "True class: severe\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "idx = 59\n", + "class_names = ['normal', 'severe']\n", + "\n", + "print('Patient id: %d' % idx)\n", + "print('Probability(normal) =', svc.predict_proba(np.array(X_train)[idx, :].reshape(1, -1))[0][0])\n", + "print('True class: %s' % class_names[y_train[idx]])\n", + "explainer = lime_tabular.LimeTabularExplainer(np.array(X_train), \n", + " feature_names= X_train.columns, class_names = class_names, \n", + " categorical_features = categorical_features, categorical_names = categorical_names,\n", + " discretize_continuous=True)\n", + "exp = explainer.explain_instance(np.array(X_train)[idx, :], predict_fn = svc.predict_proba, num_features = 10)\n", + "\n", + "%matplotlib inline\n", + "fig = exp.as_pyplot_figure()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Gradient Boosted Trees**" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": {}, + "outputs": [], + "source": [ + "def predict_fn_xbg(X):\n", + " X_data = pd.DataFrame(data=X, columns=xgbc.get_booster().feature_names)\n", + " return xgbc.predict_proba(X_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patient id: 59\n", + "Probability(normal) = 0.05998808\n", + "True class: severe\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "idx = 59\n", + "print('Patient id: %d' % idx)\n", + "print('Probability(normal) =', xgbc.predict_proba(X_train[idx:idx+1])[0][0])\n", + "print('True class: %s' % class_names[y_train[idx]])\n", + "explainer = lime_tabular.LimeTabularExplainer(np.array(X_train), \n", + " feature_names= X_train.columns, class_names = ['normal', 'severe'], \n", + " categorical_features = categorical_features, categorical_names = categorical_names,\n", + " discretize_continuous=True)\n", + "exp = explainer.explain_instance(np.array(X_train)[idx, :], predict_fn = predict_fn_xbg, num_features = 10)\n", + "\n", + "%matplotlib inline\n", + "fig = exp.as_pyplot_figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/4_type_with_ct_score.ipynb b/4_type_with_ct_score.ipynb new file mode 100644 index 0000000..7b8269a --- /dev/null +++ b/4_type_with_ct_score.ipynb @@ -0,0 +1,1183 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Interpretable ML - COVID19\n", + "> Interpretable ML Research for COVID19\n", + "- toc:true\n", + "- branch: master\n", + "- badges: true\n", + "- comments: true\n", + "- author: Han Wu\n", + "- categories: [jupyter]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. 
Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 493, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 494, + "metadata": {}, + "outputs": [], + "source": [ + "np.set_printoptions(suppress=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 495, + "metadata": {}, + "outputs": [], + "source": [ + "covid = pd.read_csv(\"dataset/covid.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 496, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AVG age for severity 0: 36.833333333333336\n", + "AVG age for severity 1: 47.45283018867924\n", + "AVG age for severity 2: 54.3125\n", + "AVG age for severity 3: 69.4\n" + ] + } + ], + "source": [ + "print(\"AVG age for severity 0:\", np.mean(covid[covid.Severity03 == 0].Age.to_numpy()))\n", + "print(\"AVG age for severity 1:\", np.mean(covid[covid.Severity03 == 1].Age.to_numpy()))\n", + "print(\"AVG age for severity 2:\", np.mean(covid[covid.Severity03 == 2].Age.to_numpy()))\n", + "print(\"AVG age for severity 3:\", np.mean(covid[covid.Severity03 == 3].Age.to_numpy()))" + ] + }, + { + "cell_type": "code", + "execution_count": 497, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(92, 74)" + ] + }, + "execution_count": 497, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "covid.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Wash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove Features that have NULL values" + ] + }, + { + "cell_type": "code", + "execution_count": 498, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "remove_columns = ['MedNum', 'LVEF', 'SO2', 'PO2', 'YHZS', 'RML', 'RUL', 'RLL', 'LUL', 'LLL']" + ] + }, + { + "cell_type": "code", + "execution_count": 499, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid.drop(remove_columns, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove Features that record time rather than biomarkers" + ] + }, + { + "cell_type": "code", + "execution_count": 500, + "metadata": {}, + "outputs": [], + "source": [ + "remove_columns = ['Onset2Admi', 'Onset2CT1', 'Onset2CTPositive1', 'Onset2CTPeak']" + ] + }, + { + "cell_type": "code", + "execution_count": 501, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid.drop(remove_columns, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove Patients that have no records" + ] + }, + { + "cell_type": "code", + "execution_count": 502, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid[covid.Weight != \" \"]" + ] + }, + { + "cell_type": "code", + "execution_count": 503, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid[covid.cTnI != \" \"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "String to Float" + ] + }, + { + "cell_type": "code", + "execution_count": 504, + "metadata": {}, + "outputs": [], + "source": [ + "covid['Weight'] = covid['Weight'].astype(np.float64)\n", + "covid['Height'] = covid['Height'].astype(np.float64)\n", + "covid['cTnITimes'] = covid['cTnITimes'].astype(np.float64)\n", + "covid['cTnI'] = covid['cTnI'].astype(np.float64)\n", + "covid['NTproBNP'] = covid['NTproBNP'].astype(np.float64)\n", + "covid['Cr'] = covid['Cr'].astype(np.float64)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## 2. Train Test Split" + ] + }, + { + "cell_type": "code", + "execution_count": 505, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 506, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import OneHotEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": 507, + "metadata": {}, + "outputs": [], + "source": [ + "ohe = OneHotEncoder(drop='if_binary').fit(covid.Severity03.to_numpy().reshape(-1, 1))" + ] + }, + { + "cell_type": "code", + "execution_count": 508, + "metadata": {}, + "outputs": [], + "source": [ + "y = covid.Severity03.to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 509, + "metadata": {}, + "outputs": [], + "source": [ + "y = ohe.transform(y.reshape(-1, 1)).toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 510, + "metadata": {}, + "outputs": [], + "source": [ + "# Use Both\n", + "covid = covid.drop([\"Severity01\", \"Severity03\"], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 511, + "metadata": {}, + "outputs": [], + "source": [ + "# Use None\n", + "# covid = covid.drop([\"Severity01\", \"Severity03\", \"CTScore\", \"AIVolumeP\"], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Patient No. 
is irrelevant" + ] + }, + { + "cell_type": "code", + "execution_count": 512, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid.drop([\"No\"], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 513, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "X = covid\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.9, random_state = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 514, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 57), (9, 57))" + ] + }, + "execution_count": 514, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 515, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Sex', 'Age', 'AgeG1', 'Height', 'Weight', 'BMI', 'Temp', 'CTScore',\n", + " 'AIVolumeP', 'cTnITimes', 'cTnI', 'cTnICKMBOrdinal1',\n", + " 'cTnICKMBOrdinal2', 'AST', 'LDH', 'CK', 'CKMB', 'HBDH', 'HiCKMB',\n", + " 'NTproBNP', 'Cr', 'PCT1', 'WBC1', 'NEU1', 'LYM1', 'N2L1', 'CRP1',\n", + " 'ALB1', 'PCT2', 'WBC2', 'NEU2', 'LYM2', 'N2L2', 'CRP2', 'ALB2',\n", + " 'Sympton', 'Fever', 'Cough', 'Phlegm', 'Hemoptysis', 'SoreThroat',\n", + " 'Catarrh', 'Headache', 'ChestPain', 'Fatigue', 'SoreMuscle',\n", + " 'Stomachache', 'Diarrhea', 'PoorAppetite', 'NauseaNVomit',\n", + " 'Hypertention', 'Hyperlipedia', 'DM', 'Lung', 'CAD', 'Arrythmia',\n", + " 'Cancer'],\n", + " dtype='object')" + ] + }, + "execution_count": 515, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 384, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 1., 0., 0.],\n", + " [0., 0., 1., 0.],\n", + " [0., 1., 0., 0.],\n", + " [0., 1., 0., 0.],\n", + " [0., 0., 1., 0.],\n", + " [0., 0., 1., 0.],\n", + " [0., 1., 0., 0.],\n", + " [0., 0., 1., 
0.],\n", + " [1., 0., 0., 0.]])" + ] + }, + "execution_count": 384, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Feature Selection (Same as 2-type)" + ] + }, + { + "cell_type": "code", + "execution_count": 385, + "metadata": {}, + "outputs": [], + "source": [ + "cor_features = ['PCT2', 'Stomachache', 'Arrythmia', \n", + " 'LYM2',\n", + " 'NEU1',\n", + " 'NEU2',\n", + " 'BMI',\n", + " 'N2L2',\n", + "# 'AIVolumeP',\n", + " 'cTnICKMBOrdinal2',\n", + " 'HBDH',\n", + " 'Height', 'CK', 'HiCKMB', 'Cr', 'PCT1',\n", + " 'Weight', 'AST', 'CKMB', 'WBC1', 'WBC2']" + ] + }, + { + "cell_type": "code", + "execution_count": 386, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\pandas\\core\\frame.py:4305: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " return super().drop(\n" + ] + } + ], + "source": [ + "# X_train.drop(labels=cor_features, axis=1, inplace=True)\n", + "# X_test.drop(labels=cor_features, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 387, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 37), (9, 37))" + ] + }, + "execution_count": 387, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 388, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Sex', 'Age', 'AgeG1', 'Temp', 'CTScore', 'AIVolumeP', 'cTnITimes',\n", + " 'cTnI', 'cTnICKMBOrdinal1', 'LDH', 'NTproBNP', 'LYM1', 'N2L1', 'CRP1',\n", + " 'ALB1', 'CRP2', 'ALB2', 
'Sympton', 'Fever', 'Cough', 'Phlegm',\n", + " 'Hemoptysis', 'SoreThroat', 'Catarrh', 'Headache', 'ChestPain',\n", + " 'Fatigue', 'SoreMuscle', 'Diarrhea', 'PoorAppetite', 'NauseaNVomit',\n", + " 'Hypertention', 'Hyperlipedia', 'DM', 'Lung', 'CAD', 'Cancer'],\n", + " dtype='object')" + ] + }, + "execution_count": 388, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 516, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "import sklearn.ensemble\n", + "import sklearn.metrics\n", + "import xgboost as xgb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Cross Validation**" + ] + }, + { + "cell_type": "code", + "execution_count": 517, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import cross_val_score" + ] + }, + { + "cell_type": "code", + "execution_count": 518, + "metadata": {}, + "outputs": [], + "source": [ + "def cv_score(classifier, X, y, scoring):\n", + " return cross_val_score(classifier, X, y, cv=5, scoring=scoring)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Decision Tree**" + ] + }, + { + "cell_type": "code", + "execution_count": 519, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DecisionTreeClassifier()" + ] + }, + "execution_count": 519, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dt = sklearn.tree.DecisionTreeClassifier()\n", + "\n", + "dt_f1 = cv_score(dt, X_train, y_train, 'f1_micro')\n", + "\n", + "dt.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 520, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7116666666666667\n" + ] + } + ], + "source": [ + "print(np.mean(dt_f1))" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "**Random Forest**" + ] + }, + { + "cell_type": "code", + "execution_count": 573, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 574, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestClassifier()" + ] + }, + "execution_count": 574, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100)\n", + "\n", + "rf_f1 = cv_score(rf, X_train, y_train, 'f1_micro')\n", + "\n", + "rf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 575, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7682480737653151\n" + ] + } + ], + "source": [ + "print(np.mean(rf_f1))" + ] + }, + { + "cell_type": "code", + "execution_count": 576, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Random Forest\n", + "Precision: 0.5555555555555556\n", + "Recal: 0.5555555555555556\n", + "F1: 0.6666666666666667\n", + "CI: 0.32464394339996944\n" + ] + } + ], + "source": [ + "rf_pred = rf.predict(X_test)\n", + "print(\"Random Forest\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, rf_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, rf_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, rf_pred, average='micro'))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, rf_pred)) * (sklearn.metrics.accuracy_score(y_test, rf_pred))) / len(X_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**SVM**" + ] + }, + { + "cell_type": "code", + "execution_count": 525, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVC" + ] + }, + { + "cell_type": "code", + 
"execution_count": 526, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "data": { + "text/plain": [ + "SVC(decision_function_shape='ovo', probability=True)" + ] + }, + "execution_count": 526, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "svc = SVC(probability=True, decision_function_shape='ovo')\n", + "\n", + "svc_f1 = cv_score(svc, X_train, ohe.inverse_transform(y_train), 'f1_micro')\n", + "\n", + "svc.fit(X_train, ohe.inverse_transform(y_train))" + ] + }, + { + "cell_type": "code", + "execution_count": 527, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6375\n" + ] + } + ], + "source": [ + "print(np.mean(svc_f1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**XGBoost**" + ] + }, + { + "cell_type": "code", + "execution_count": 538, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[22:29:23] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:541: \n", + "Parameters: { scale_pos_weight } might not be used.\n", + "\n", + " This may not be accurate due to some parameters are only used in language bindings but\n", + " passed down to XGBoost core. Or some parameters are not used but slip through this\n", + " verification. Please open an issue if you find above cases.\n", + "\n", + "\n", + "[22:29:23] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[22:29:23] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:541: \n", + "Parameters: { scale_pos_weight } might not be used.\n", + "\n", + " This may not be accurate due to some parameters are only used in language bindings but\n", + " passed down to XGBoost core. Or some parameters are not used but slip through this\n", + " verification. Please open an issue if you find above cases.\n", + "\n", + "\n", + "[22:29:23] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[22:29:23] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:541: \n", + "Parameters: { scale_pos_weight } might not be used.\n", + "\n", + " This may not be accurate due to some parameters are only used in language bindings but\n", + " passed down to XGBoost core. Or some parameters are not used but slip through this\n", + " verification. Please open an issue if you find above cases.\n", + "\n", + "\n", + "[22:29:23] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[22:29:24] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:541: \n", + "Parameters: { scale_pos_weight } might not be used.\n", + "\n", + " This may not be accurate due to some parameters are only used in language bindings but\n", + " passed down to XGBoost core. Or some parameters are not used but slip through this\n", + " verification. Please open an issue if you find above cases.\n", + "\n", + "\n", + "[22:29:24] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[22:29:24] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:541: \n", + "Parameters: { scale_pos_weight } might not be used.\n", + "\n", + " This may not be accurate due to some parameters are only used in language bindings but\n", + " passed down to XGBoost core. Or some parameters are not used but slip through this\n", + " verification. Please open an issue if you find above cases.\n", + "\n", + "\n", + "[22:29:24] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[22:29:24] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:541: \n", + "Parameters: { scale_pos_weight } might not be used.\n", + "\n", + " This may not be accurate due to some parameters are only used in language bindings but\n", + " passed down to XGBoost core. Or some parameters are not used but slip through this\n", + " verification. Please open an issue if you find above cases.\n", + "\n", + "\n", + "[22:29:24] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "data": { + "text/plain": [ + "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,\n", + " colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,\n", + " importance_type='gain', interaction_constraints='',\n", + " learning_rate=0.300000012, max_delta_step=0, max_depth=4,\n", + " min_child_weight=1, missing=nan, monotone_constraints='()',\n", + " n_estimators=100, n_jobs=8, num_parallel_tree=1,\n", + " objective='multi:softprob', random_state=0, reg_alpha=4,\n", + " reg_lambda=1, scale_pos_weight=9, subsample=0.8,\n", + " tree_method='exact', validate_parameters=1, verbosity=None)" + ] + }, + "execution_count": 538, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create a model\n", + "# Params from: https://www.kaggle.com/aharless/swetha-s-xgboost-revised\n", + "xgbc = xgb.XGBClassifier(\n", + "# booster = 'gbtree',\n", + " max_depth = 4,\n", + "# eta = 0.3,\n", + " subsample = 0.8,\n", + " colsample_bytree = 0.7,\n", + " colsample_bylevel = 0.7,\n", + " 
scale_pos_weight = 9,\n", + " min_child_weight = 1,\n", + " reg_alpha = 4,\n", + " objective = 'multi:softmax'\n", + ")\n", + "\n", + "xgbc_f1 = cv_score(xgbc, X_train, ohe.inverse_transform(y_train), 'f1_micro')\n", + "\n", + "# Fit the models\n", + "xgbc.fit(np.array(X_train), np.array(ohe.inverse_transform(y_train)))" + ] + }, + { + "cell_type": "code", + "execution_count": 539, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.8441666666666666\n" + ] + } + ], + "source": [ + "print(np.mean(xgbc_f1))" + ] + }, + { + "cell_type": "code", + "execution_count": 540, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "XGBoost\n", + "Precision: 0.7777777777777778\n", + "Recal: 0.7777777777777778\n", + "F1: 0.7777777777777778\n", + "CI: 0.27161661029914536\n" + ] + } + ], + "source": [ + "xgbc_pred = ohe.transform(xgbc.predict(np.array(X_test)).reshape(-1, 1)).toarray()\n", + "print(\"XGBoost\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, xgbc_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, xgbc_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, xgbc_pred, average='micro'))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, xgbc_pred)) * (sklearn.metrics.accuracy_score(y_test, xgbc_pred))) / len(X_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. 
Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save Models" + ] + }, + { + "cell_type": "code", + "execution_count": 485, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 577, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"4-type-model-4-ct.pkl\", 'wb') as f:\n", + " pickle.dump([dt, rf, svc, xgbc], f)\n", + "with open(\"dataset/4-type-dataset-ct.pkl\", 'wb') as f:\n", + " pickle.dump([X_train, X_test, y_train, y_test], f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load Models" + ] + }, + { + "cell_type": "code", + "execution_count": 487, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"4-type-model-4-ct.pkl\", 'rb') as f:\n", + " [dt, rf, svc, xgbc] = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 488, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"dataset/4-type-dataset-ct.pkl\", 'rb') as f:\n", + " [X_train, X_test, y_train, y_test] = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Prediction**" + ] + }, + { + "cell_type": "code", + "execution_count": 531, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": 532, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "dt_pred = dt.predict(X_test)\n", + "rf_pred = rf.predict(X_test)\n", + "svc_pred = ohe.transform(svc.predict(X_test).reshape(-1, 1)).toarray()\n", + "xgbc_pred = ohe.transform(xgbc.predict(np.array(X_test)).reshape(-1, 1)).toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 533, + "metadata": {}, + "outputs": [], + "source": [ + "import math" + ] + }, + { + "cell_type": "code", + "execution_count": 534, + "metadata": { + 
"scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Decision Tree\n", + "Precision: 0.5555555555555556\n", + "Recal: 0.5555555555555556\n", + "F1: 0.5555555555555556\n", + "CI: 0.32464394339996944\n", + "\n", + "Random Forest\n", + "Precision: 0.5555555555555556\n", + "Recal: 0.5555555555555556\n", + "F1: 0.6666666666666667\n", + "CI: 0.32464394339996944\n", + "\n", + "SVC\n", + "Precision: 0.5555555555555556\n", + "Recal: 0.5555555555555556\n", + "F1: 0.5555555555555556\n", + "CI: 0.32464394339996944\n", + "\n", + "XGBoost\n", + "Precision: 0.7777777777777778\n", + "Recal: 0.7777777777777778\n", + "F1: 0.7777777777777778\n", + "CI: 0.27161661029914536\n" + ] + } + ], + "source": [ + "print(\"Decision Tree\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, dt_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, dt_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, dt_pred, average='micro'))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, dt_pred)) * (sklearn.metrics.accuracy_score(y_test, dt_pred))) / len(X_test)))\n", + "\n", + "print()\n", + "\n", + "print(\"Random Forest\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, rf_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, rf_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, rf_pred, average='micro'))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, rf_pred)) * (sklearn.metrics.accuracy_score(y_test, rf_pred))) / len(X_test)))\n", + "\n", + "print()\n", + "\n", + "print(\"SVC\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, svc_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, svc_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, svc_pred, average='micro'))\n", + "print(\"CI:\", 1.96 
* math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, svc_pred)) * (sklearn.metrics.accuracy_score(y_test, svc_pred))) / len(X_test)))\n", + "\n", + "print()\n", + "\n", + "print(\"XGBoost\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, xgbc_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, xgbc_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, xgbc_pred, average='micro'))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, xgbc_pred)) * (sklearn.metrics.accuracy_score(y_test, xgbc_pred))) / len(X_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/4_type_without_ct_score.ipynb b/4_type_without_ct_score.ipynb new file mode 100644 index 0000000..78cabf0 --- /dev/null +++ b/4_type_without_ct_score.ipynb @@ -0,0 +1,1200 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Interpretable ML - COVID19\n", + "> Interpretable ML Research for COVID19\n", + "- toc:true\n", + "- branch: master\n", + "- badges: true\n", + "- comments: true\n", + "- author: Han Wu\n", + "- categories: [jupyter]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. 
Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": {}, + "outputs": [], + "source": [ + "np.set_printoptions(suppress=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [], + "source": [ + "covid = pd.read_csv(\"dataset/covid.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AVG age for severity 0: 36.833333333333336\n", + "AVG age for severity 1: 47.45283018867924\n", + "AVG age for severity 2: 54.3125\n", + "AVG age for severity 3: 69.4\n" + ] + } + ], + "source": [ + "print(\"AVG age for severity 0:\", np.mean(covid[covid.Severity03 == 0].Age.to_numpy()))\n", + "print(\"AVG age for severity 1:\", np.mean(covid[covid.Severity03 == 1].Age.to_numpy()))\n", + "print(\"AVG age for severity 2:\", np.mean(covid[covid.Severity03 == 2].Age.to_numpy()))\n", + "print(\"AVG age for severity 3:\", np.mean(covid[covid.Severity03 == 3].Age.to_numpy()))" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(92, 74)" + ] + }, + "execution_count": 151, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "covid.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Wash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove Features that have NULL values" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "remove_columns = ['MedNum', 'LVEF', 'SO2', 'PO2', 'YHZS', 'RML', 'RUL', 'RLL', 'LUL', 'LLL']" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid.drop(remove_columns, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove Features that record time rather than biomarkers" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": {}, + "outputs": [], + "source": [ + "remove_columns = ['Onset2Admi', 'Onset2CT1', 'Onset2CTPositive1', 'Onset2CTPeak']" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid.drop(remove_columns, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove Patients that have no records" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid[covid.Weight != \" \"]" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid[covid.cTnI != \" \"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "String to Float" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": {}, + "outputs": [], + "source": [ + "covid['Weight'] = covid['Weight'].astype(np.float64)\n", + "covid['Height'] = covid['Height'].astype(np.float64)\n", + "covid['cTnITimes'] = covid['cTnITimes'].astype(np.float64)\n", + "covid['cTnI'] = covid['cTnI'].astype(np.float64)\n", + "covid['NTproBNP'] = covid['NTproBNP'].astype(np.float64)\n", + "covid['Cr'] = covid['Cr'].astype(np.float64)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## 2. Train Test Split" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import OneHotEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": {}, + "outputs": [], + "source": [ + "ohe = OneHotEncoder(drop='if_binary').fit(covid.Severity03.to_numpy().reshape(-1, 1))" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [], + "source": [ + "y = covid.Severity03.to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [], + "source": [ + "y = ohe.transform(y.reshape(-1, 1)).toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [], + "source": [ + "# Use Both\n", + "# covid = covid.drop([\"Severity01\", \"Severity03\"], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [], + "source": [ + "# Use None\n", + "covid = covid.drop([\"Severity01\", \"Severity03\", \"CTScore\", \"AIVolumeP\"], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Patient No. 
is irrelevant" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [], + "source": [ + "covid = covid.drop([\"No\"], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "X = covid\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.9, random_state = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((77, 55), (9, 55))" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Sex', 'Age', 'AgeG1', 'Height', 'Weight', 'BMI', 'Temp', 'cTnITimes',\n", + " 'cTnI', 'cTnICKMBOrdinal1', 'cTnICKMBOrdinal2', 'AST', 'LDH', 'CK',\n", + " 'CKMB', 'HBDH', 'HiCKMB', 'NTproBNP', 'Cr', 'PCT1', 'WBC1', 'NEU1',\n", + " 'LYM1', 'N2L1', 'CRP1', 'ALB1', 'PCT2', 'WBC2', 'NEU2', 'LYM2', 'N2L2',\n", + " 'CRP2', 'ALB2', 'Sympton', 'Fever', 'Cough', 'Phlegm', 'Hemoptysis',\n", + " 'SoreThroat', 'Catarrh', 'Headache', 'ChestPain', 'Fatigue',\n", + " 'SoreMuscle', 'Stomachache', 'Diarrhea', 'PoorAppetite', 'NauseaNVomit',\n", + " 'Hypertention', 'Hyperlipedia', 'DM', 'Lung', 'CAD', 'Arrythmia',\n", + " 'Cancer'],\n", + " dtype='object')" + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 1., 0., 0.],\n", + " [0., 0., 1., 0.],\n", + " [0., 1., 0., 0.],\n", + " [0., 1., 0., 0.],\n", + " [0., 0., 1., 0.],\n", + " [0., 0., 1., 0.],\n", + " [0., 1., 0., 0.],\n", + " [0., 0., 1., 0.],\n", + " [1., 0., 0., 0.]])" + ] 
+ }, + "execution_count": 170, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Feature Selection (Same as 2-type)" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [], + "source": [ + "cor_features = ['PCT2', 'Stomachache', 'Arrythmia', \n", + " 'LYM2',\n", + " 'NEU1',\n", + " 'NEU2',\n", + " 'BMI',\n", + " 'N2L2',\n", + " 'cTnICKMBOrdinal2',\n", + " 'HBDH',\n", + " 'Height', 'CK', 'HiCKMB', 'Cr', 'PCT1',\n", + " 'Weight', 'AST', 'CKMB', 'WBC1', 'WBC2']" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [], + "source": [ + "# X_train.drop(labels=cor_features, axis=1, inplace=True)\n", + "# X_test.drop(labels=cor_features, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Sex', 'Age', 'AgeG1', 'Height', 'Weight', 'BMI', 'Temp', 'cTnITimes',\n", + " 'cTnI', 'cTnICKMBOrdinal1', 'cTnICKMBOrdinal2', 'AST', 'LDH', 'CK',\n", + " 'CKMB', 'HBDH', 'HiCKMB', 'NTproBNP', 'Cr', 'PCT1', 'WBC1', 'NEU1',\n", + " 'LYM1', 'N2L1', 'CRP1', 'ALB1', 'PCT2', 'WBC2', 'NEU2', 'LYM2', 'N2L2',\n", + " 'CRP2', 'ALB2', 'Sympton', 'Fever', 'Cough', 'Phlegm', 'Hemoptysis',\n", + " 'SoreThroat', 'Catarrh', 'Headache', 'ChestPain', 'Fatigue',\n", + " 'SoreMuscle', 'Stomachache', 'Diarrhea', 'PoorAppetite', 'NauseaNVomit',\n", + " 'Hypertention', 'Hyperlipedia', 'DM', 'Lung', 'CAD', 'Arrythmia',\n", + " 'Cancer'],\n", + " dtype='object')" + ] + }, + "execution_count": 174, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "## 4. Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "import sklearn.ensemble\n", + "import sklearn.metrics\n", + "import xgboost as xgb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Cross Validation**" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import cross_val_score" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": {}, + "outputs": [], + "source": [ + "def cv_score(classifier, X, y, scoring):\n", + " return cross_val_score(classifier, X, y, cv=5, scoring=scoring)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Decision Tree**" + ] + }, + { + "cell_type": "code", + "execution_count": 213, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DecisionTreeClassifier()" + ] + }, + "execution_count": 213, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dt = sklearn.tree.DecisionTreeClassifier()\n", + "\n", + "dt_f1 = cv_score(dt, X_train, y_train, 'f1_micro')\n", + "\n", + "dt.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 214, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.4525\n" + ] + } + ], + "source": [ + "print(np.mean(dt_f1))" + ] + }, + { + "cell_type": "code", + "execution_count": 215, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Decision Tree\n", + "Precision: 0.3333333333333333\n", + "Recal: 0.3333333333333333\n", + "F1: 0.3333333333333333\n", + "CI: 0.3079842869168074\n" + ] + } + ], + "source": [ + "dt_pred = dt.predict(X_test)\n", + "print(\"Decision Tree\")\n", + "print(\"Precision: \", 
sklearn.metrics.accuracy_score(y_test, dt_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, dt_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, dt_pred, average='micro'))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, dt_pred)) * (sklearn.metrics.accuracy_score(y_test, dt_pred))) / len(X_test)))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Random Forest**" + ] + }, + { + "cell_type": "code", + "execution_count": 216, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 217, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestClassifier()" + ] + }, + "execution_count": 217, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100)\n", + "\n", + "rf_f1 = cv_score(rf, X_train, y_train, 'f1_micro')\n", + "\n", + "rf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.526854897337656\n" + ] + } + ], + "source": [ + "print(np.mean(rf_f1))" + ] + }, + { + "cell_type": "code", + "execution_count": 219, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Random Forest\n", + "Precision: 0.3333333333333333\n", + "Recal: 0.3333333333333333\n", + "F1: 0.46153846153846156\n", + "CI: 0.3079842869168074\n" + ] + } + ], + "source": [ + "rf_pred = rf.predict(X_test)\n", + "print(\"Random Forest\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, rf_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, rf_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, rf_pred, average='micro'))\n", + "print(\"CI:\", 
1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, rf_pred)) * (sklearn.metrics.accuracy_score(y_test, rf_pred))) / len(X_test)))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**SVM**" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVC" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "data": { + "text/plain": [ + "SVC(decision_function_shape='ovo', probability=True)" + ] + }, + "execution_count": 184, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "svc = SVC(probability=True, decision_function_shape='ovo')\n", + "\n", + "svc_f1 = cv_score(svc, X_train, ohe.inverse_transform(y_train), 'f1_micro')\n", + "\n", + "svc.fit(X_train, ohe.inverse_transform(y_train))" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6375\n" + ] + } + ], + "source": [ + "print(np.mean(svc_f1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**XGBoost**" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[22:31:29] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:541: \n", + "Parameters: { scale_pos_weight } might not be used.\n", + "\n", + " This may not be accurate due to some parameters are only used in language bindings but\n", + " passed down to XGBoost core. Or some parameters are not used but slip through this\n", + " verification. Please open an issue if you find above cases.\n", + "\n", + "\n", + "[22:31:29] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[22:31:29] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:541: \n", + "Parameters: { scale_pos_weight } might not be used.\n", + "\n", + " This may not be accurate due to some parameters are only used in language bindings but\n", + " passed down to XGBoost core. Or some parameters are not used but slip through this\n", + " verification. Please open an issue if you find above cases.\n", + "\n", + "\n", + "[22:31:29] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[22:31:29] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:541: \n", + "Parameters: { scale_pos_weight } might not be used.\n", + "\n", + " This may not be accurate due to some parameters are only used in language bindings but\n", + " passed down to XGBoost core. Or some parameters are not used but slip through this\n", + " verification. Please open an issue if you find above cases.\n", + "\n", + "\n", + "[22:31:29] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[22:31:30] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:541: \n", + "Parameters: { scale_pos_weight } might not be used.\n", + "\n", + " This may not be accurate due to some parameters are only used in language bindings but\n", + " passed down to XGBoost core. Or some parameters are not used but slip through this\n", + " verification. Please open an issue if you find above cases.\n", + "\n", + "\n", + "[22:31:30] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[22:31:30] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:541: \n", + "Parameters: { scale_pos_weight } might not be used.\n", + "\n", + " This may not be accurate due to some parameters are only used in language bindings but\n", + " passed down to XGBoost core. Or some parameters are not used but slip through this\n", + " verification. Please open an issue if you find above cases.\n", + "\n", + "\n", + "[22:31:30] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\xgboost\\sklearn.py:892: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", + " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n", + "d:\\anaconda3\\envs\\covid-19\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " return f(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[22:31:30] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:541: \n", + "Parameters: { scale_pos_weight } might not be used.\n", + "\n", + " This may not be accurate due to some parameters are only used in language bindings but\n", + " passed down to XGBoost core. Or some parameters are not used but slip through this\n", + " verification. Please open an issue if you find above cases.\n", + "\n", + "\n", + "[22:31:30] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" + ] + }, + { + "data": { + "text/plain": [ + "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,\n", + " colsample_bynode=1, colsample_bytree=0.7, eta=0.3, gamma=0,\n", + " gpu_id=-1, importance_type='gain', interaction_constraints='',\n", + " learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n", + " min_child_weight=1, missing=nan, monotone_constraints='()',\n", + " n_estimators=100, n_jobs=8, num_parallel_tree=1,\n", + " objective='multi:softprob', random_state=0, reg_alpha=4,\n", + " reg_lambda=1, scale_pos_weight=9, subsample=0.8,\n", + " tree_method='exact', validate_parameters=1, verbosity=None)" + ] + }, + "execution_count": 186, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create a model\n", + "# Params from: https://www.kaggle.com/aharless/swetha-s-xgboost-revised\n", + "xgbc = xgb.XGBClassifier(\n", + " booster = 'gbtree',\n", + " max_depth = 6,\n", + " eta = 0.3,\n", + " subsample = 0.8,\n", + " colsample_bytree = 0.7,\n", + " colsample_bylevel = 0.7,\n", + " 
scale_pos_weight = 9,\n", + " min_child_weight = 1,\n", + " reg_alpha = 4,\n", + " objective = 'multi:softmax'\n", + ")\n", + "\n", + "xgbc_f1 = cv_score(xgbc, X_train, ohe.inverse_transform(y_train), 'f1_micro')\n", + "\n", + "# Fit the models\n", + "xgbc.fit(np.array(X_train), np.array(ohe.inverse_transform(y_train)))" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6258333333333332\n" + ] + } + ], + "source": [ + "print(np.mean(xgbc_f1))" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "XGBoost\n", + "Precision: 0.6666666666666666\n", + "Recal: 0.6666666666666666\n", + "F1: 0.6666666666666666\n", + "CI: 0.3079842869168074\n" + ] + } + ], + "source": [ + "xgbc_pred = ohe.transform(xgbc.predict(np.array(X_test)).reshape(-1, 1)).toarray()\n", + "print(\"XGBoost\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, xgbc_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, xgbc_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, xgbc_pred, average='micro'))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, xgbc_pred)) * (sklearn.metrics.accuracy_score(y_test, xgbc_pred))) / len(X_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. 
Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save Models" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 223, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"4-type-model-4.pkl\", 'wb') as f:\n", + " pickle.dump([dt, rf, svc, xgbc], f)\n", + "with open(\"dataset/4-type-dataset.pkl\", 'wb') as f:\n", + " pickle.dump([X_train, X_test, y_train, y_test], f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load Models" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"4-type-model-4.pkl\", 'rb') as f:\n", + " [dt, rf, svc, xgbc] = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"dataset/4-type-dataset-ct.pkl\", 'rb') as f:\n", + " [X_train, X_test, y_train, y_test] = pickle.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Prediction**" + ] + }, + { + "cell_type": "code", + "execution_count": 189, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": {}, + "outputs": [], + "source": [ + "svc_pred = ohe.transform(svc.predict(X_test).reshape(-1, 1))" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "dt_pred = dt.predict(X_test)\n", + "rf_pred = rf.predict(X_test)\n", + "svc_pred = ohe.transform(svc.predict(X_test).reshape(-1, 1)).toarray()\n", + "xgbc_pred = ohe.transform(xgbc.predict(np.array(X_test)).reshape(-1, 1)).toarray()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 221, + "metadata": {}, + "outputs": [], + "source": [ + "import math" + ] + }, + { + "cell_type": "code", + "execution_count": 222, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Decision Tree\n", + "Precision: 0.3333333333333333\n", + "Recal: 0.3333333333333333\n", + "F1: 0.3333333333333333\n", + "CI: 0.3079842869168074\n", + "\n", + "Random Forest\n", + "Precision: 0.3333333333333333\n", + "Recal: 0.3333333333333333\n", + "F1: 0.46153846153846156\n", + "CI: 0.3079842869168074\n", + "\n", + "SVC\n", + "Precision: 0.5555555555555556\n", + "Recal: 0.5555555555555556\n", + "F1: 0.5555555555555556\n", + "CI: 0.32464394339996944\n", + "\n", + "XGBoost\n", + "Precision: 0.6666666666666666\n", + "Recal: 0.6666666666666666\n", + "F1: 0.6666666666666666\n", + "CI: 0.3079842869168074\n" + ] + } + ], + "source": [ + "print(\"Decision Tree\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, dt_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, dt_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, dt_pred, average='micro'))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, dt_pred)) * (sklearn.metrics.accuracy_score(y_test, dt_pred))) / len(X_test)))\n", + "\n", + "print()\n", + "\n", + "print(\"Random Forest\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, rf_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, rf_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, rf_pred, average='micro'))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, rf_pred)) * (sklearn.metrics.accuracy_score(y_test, rf_pred))) / len(X_test)))\n", + "\n", + "print()\n", + "\n", + "print(\"SVC\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, svc_pred))\n", + "print(\"Recal: \", 
sklearn.metrics.recall_score(y_test, svc_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, svc_pred, average='micro'))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, svc_pred)) * (sklearn.metrics.accuracy_score(y_test, svc_pred))) / len(X_test)))\n", + "\n", + "print()\n", + "\n", + "print(\"XGBoost\")\n", + "print(\"Precision: \", sklearn.metrics.accuracy_score(y_test, xgbc_pred))\n", + "print(\"Recal: \", sklearn.metrics.recall_score(y_test, xgbc_pred, average='micro'))\n", + "print(\"F1: \", sklearn.metrics.f1_score(y_test, xgbc_pred, average='micro'))\n", + "print(\"CI:\", 1.96 * math.sqrt( ((1 - sklearn.metrics.accuracy_score(y_test, xgbc_pred)) * (sklearn.metrics.accuracy_score(y_test, xgbc_pred))) / len(X_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/dataset/2-type-dataset-ct.pkl b/dataset/2-type-dataset-ct.pkl new file mode 100644 index 0000000..23fdbef Binary files /dev/null and b/dataset/2-type-dataset-ct.pkl differ diff --git a/dataset/2-type-dataset.pkl b/dataset/2-type-dataset.pkl new file mode 100644 index 0000000..70fd336 Binary files /dev/null and b/dataset/2-type-dataset.pkl differ diff --git a/dataset/4-type-dataset-ct.pkl b/dataset/4-type-dataset-ct.pkl new file mode 100644 index 0000000..3520dbd Binary files /dev/null and b/dataset/4-type-dataset-ct.pkl differ diff --git a/dataset/4-type-dataset.pkl b/dataset/4-type-dataset.pkl new file mode 100644 index 0000000..a3f6812 
Binary files /dev/null and b/dataset/4-type-dataset.pkl differ diff --git a/dataset/covid.csv b/dataset/covid.csv new file mode 100644 index 0000000..e4c21db --- /dev/null +++ b/dataset/covid.csv @@ -0,0 +1,93 @@ +MedNum,No,Sex,Age,AgeG1,Height,Weight,BMI,Temp,Severity03,Severity01,CTScore,AIVolumeP,cTnITimes,cTnI,cTnICKMBOrdinal1,cTnICKMBOrdinal2,AST,LDH,CK,CKMB,HBDH,HiCKMB,NTproBNP,Cr,LVEF,PCT1,WBC1,NEU1,LYM1,N2L1,CRP1,ALB1,PCT2,WBC2,NEU2,LYM2,N2L2,CRP2,ALB2,SO2,PO2,YHZS,Sympton,Fever,Cough,Phlegm,Hemoptysis,SoreThroat,Catarrh,Headache,ChestPain,Fatigue,SoreMuscle,Stomachache,Diarrhea,PoorAppetite,NauseaNVomit,Hypertention,Hyperlipedia,DM,Lung,CAD,Arrythmia,Cancer,Onset2Admi,Onset2CT1,Onset2CTPositive1,Onset2CTPeak,RUL,RML,RLL,LUL,LLL +346136,2,0,75,1,150,57,25.3,38.1,3,1,9,.337,11,.15,1,1,17,188,55,4.9,149,10.7,4480,89,57,.12,5.61,3.95,.95,4.16,26,35.9,.12,7.95,7.51,.31,24.23,127.5,30.2, , , ,1,1,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,7,7,7,11, , , , , +346167,3,0,48,0,155,52,21.6,36.9,1,0,1,.007, , ,0,0,14,139,27,8.3,113,17.5, , , ,0,2.52,.95,1.52,.63,.07,38.68,0,3.49,1.76,1.18,1.49,20,33.3, , , ,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,10, , , , , +346256,4,0,66,1,164,53,19.7,39,1,0,7,.097,7,.01,0,0,27,308,39,18.4,244,19.4,697,78, ,0,3.11,2.09,.7,2.99,5.2,34.1,0,4.05,3.4,.56,6.07,5.2,34.1, , , ,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,6,6,9, , , , , +346350,5,1,65,1,170,75,26,37.9,1,0,4,.003,6,.01,0,0,20,193,68,10.3,144,21,77,93.4, ,0,4.84,3.4,1.15,2.96,7,38.7,0,4.71,3.08,1.01,3.05,7,35.8, , , ,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,15,15,15, , , , , +346364,6,0,50,0,161,61.5,23.7,37.3,1,0,9,.137, , ,0,0,21,187,18,8.3,154,22.3, , , ,0,6.04,4.33,1.27,3.41,8.8,34.9,0,9.1,7.86,1.04,7.56,15.67,33.4, , , ,1,1,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,10,12,12,12, , , , , +346367,7,1,36,0,168,93,33,36.8,2,1,18,.709,7,.01,0,0,14.6,260,41,10,201,15.5,1660,59.6,67,0,6.2,4.3,1.2,3.58,47.4,41.1,0,9.29,8.64,.33,26.18,49.1,35.3, , , 
,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,4,4,4,11, , , , , +346368,8,0,38,0,163,75,28.2,37.2,0,0,0,0, , ,0,0,34,157,63,12.7,110,17.7, , , ,0,6.76,4.36,1.37,3.18,1.5,40.4,0,6.76,4.36,1.37,3.18,16.06,36.5, , , ,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2, , , , , , , +346476,9,0,63,1,160,72.5,28.3,36.5,0,0,0,.001,3,.01,0,0,26,238,108,10.8,161,24,20,90, ,0,3.82,2.32,1.12,2.07,7,42.8,0,4.03,2.82,.84,3.36,7,36.8, , , ,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1, , , , , , , +346446,10,1,56,1,172,70,23.7,40,1,0,4,.038,2,.01,0,0,18.9,143,49,14.3,98,15.1,51,91, ,0,4.87,3.54,1.17,3.03,7.3,42.9,0,5.42,3.88,1.05,3.7,11.41,36.3, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,5,6,6,9, , , , , +346498,12,1,69,1,173,60,20,38,2,1,13,.556,13,.01,0,0,21.8,343,109,14,252,18.9,2280,93.4,63,0,3.95,2.11,1.23,1.72,25.9,35.5,0,5.64,4.74,.45,10.53,128.17,37.9, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,3,4,4,9, , , , , +346500,13,0,70,1,160,75,29.3,36.7,2,1,10,.315,12,.056,1,1,17.2,219,128,13.6,169,15,4710,75,65,0,5.59,3.33,1.44,2.31,10.9,32.3,0,6.96,5.79,.66,8.77,45.79,32.4, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,3,4,4,9, , , , , +346572,14,0,58,1,162,65,24.8,37.4,2,1,9,.242,8,.01,0,0,18.6,208,73,21.4,164,21.4,1220,62.5, ,0,4.81,2.8,1.21,2.31,14.19,38.8,0,4.63,2.63,1.24,2.12,64.15,36.5, , , ,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,3,9, , , , , +346546,15,1,60,1,172,60,20.3,38.9,3,1,16,.905,23,.014,0,0,22.6,283,167,11.5,205,18,2820,65.9,63,0,4.7,2.8,.7,4,17.07,34.1,0,4.78,4.45,.27,16.48,59.28,33.6, , , ,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,5,7,7,9, , , , , +346615,16,0,33,0,152,47.5,20.6,37.5,1,0,3,.031,7,.01,0,0,33.4,155,29,13.7,121,22.7,248,55.8, ,0,5.47,3.23,1.41,2.29,.54,38.3,0,5.39,4.22,1,4.22,8.6,37.5, , , ,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,6,6,12, , , , , +346614,17,0,55,1,157,61,24.7,36.3,0,0,0,0,8,.01,0,0,29.8,185,91,14.2,141,14.2,71,71.8, ,.13,5.23,2.76,1.78,1.55,29.64,34.9,0,4.47,2.64,1.33,1.98,29.64,34.9, , , 
,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, , , , , , , +346617,18,1,32,0,158,50,20,36.8,1,0,3,.018,7,.01,0,0,24.7,153,60,16.8,126,16.8,27,86, ,0,4.4,1.88,2.09,.9,1.21,43.3,0,4.14,2.21,1.42,1.56,3.34,38.5, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,7, , , , , + ,19,0,59,1,155,63,26.2,37,1,0,1,0,4,.01,0,0,17.3,165,42,11.8,147,30.7,156,71.5, ,0,2.62,1.3,1.02,1.27,0,41.5,0,2.53,1.38,.84,1.64,.46,36.5, , , ,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, , , , , , , , , +346537,20,0,23,0,150,44,19.6,36.8,0,0,0,0,1,.01,0,0,18,131,68,6,101,15.2,26,65.3, ,0,7.53,4.86,2.1,2.31,0,43.2,0,8.81,6.83,1.13,6.04,41.87,38.1, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, , , , , , , +346514,21,1,59,1,168,63.5,22.5,36.4,0,0,0,.032,15,.51,1,1,32,254,82,23.7,177,23.6,1060,86, ,0,1.91,.65,.91,.71,17.2,32.6,0,1.91,.65,.71,.92,17.2,32.5, , , ,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,1,1,1, , , , , , , +297500,22,0,33,0,156,40,16.4,36.8,1,0,5,.014,7,.01,0,0,15.3,125,29,7.3,104,16.8,140,57.3, ,0,3.12,1.78,.9,1.98,2.12,36.5,0,2.26,1.27,.62,2.05,4.98,36.5, , , ,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,6,9, , , , , +346639,23,1,29,0,174,67,22.1,38,0,0,0,0,1,.01,0,0,14.3,124,92,21.6,103,26.8,20,76.7, ,0,7.45,4.29,2.41,1.78,0,45.2,0,4.92,3.11,1.43,2.17,.21,40.3, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,6,6, , , , , , , + ,24,0,56,1,145,55,26.2,37.2,1,0,5,.003,5,.01,0,0,12.9,184,60,11.9,144,23.2,117,58.2, ,0,6.61,3.93,2.16,1.82,.3,39.5,0,5.81,3.72,1.42,2.62,.53,37.7, , , ,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,18,19,19,19, , , , , +346,25,1,64,1,167,69,24.7,37.7,1,0,4,.028,10,.01,0,0,21,154,63,13.5,124,13.5,392,67.2,63,0,6.49,3.99,1.74,2.29,2.11,39.5,0,5.58,3.16,1.66,1.9,28.94,35.2, , , ,1,1,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,9,10,10,10, , , , , +346595,26,0,28,0,172,58,19.6,36.9,1,0,5,.092,9,.01,0,0,20.1,173,88,13.1,140,15.9,135,85.5, ,0,5.33,3.59,1.13,3.18,1.97,43.1,0,3.43,2.4,.82,2.93,31.02,36.6, , , 
,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7, , , , , +346568,28,1,44,0,173,75,25.1,36.6,0,0,0,0,8,.01,0,0,35.5,151,192,13.5,122,16.3,187,103.4, ,.12,5.75,2.33,2.09,1.11,2.56,42.1,0,5.75,2.33,2.09,1.11,27.85,37.5, , , ,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1, , , , , , , +346627,29,1,52,1,170,80,27.7,36.7,2,1,9,.131,16,.085,1,1,33.9,193,105,13.9,138,20.1,456,94, ,0,4.53,2.61,1.46,1.79,4.72,40.6,0,3.83,3.05,.7,4.36,33.25,40.1, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,12, , , , , +346609,30,1,36,0,177,80,25.5,36.8,2,1,11,.267,7,.01,0,0,26,288,377,13.4,227,29.3,932,106, ,0,4.45,2.57,1.54,1.67,17.78,37.7,0,6.36,5.51,.68,8.1,46.57,35.5, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,6,6,9, , , , , +346622,31,0,29,0,160,70,27.3,38.5,0,0,0,0,2,.01,0,0,13.5,168,52,13.4,133,17.8,43,61.3, ,.13,4.2,1.88,1.84,1.02,.82,37.4,0,4.6,2.5,1.67,1.5,1.07,37.4, , , ,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,4, , , , , , , +346654,32,0,56,1,168,70,24.8,37.8,1,0,6,.101,7,.01,0,0,47.5,215,68,19.7,167,19.7,217,70.4, ,0,3.23,1.75,1.09,1.61,8,38.8,0,3.32,2.22,.88,2.52,31.97,38.8, , , ,1,1,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,3,3,12, , , , , +346650,33,0,34,0,168,56,19.8,38.6,1,0,4,.023,1,.01,0,0,13,158,36,7.1,123,10.3,63,58.3, ,.15,4.38,1.12,2.8,.4,2.23,39.2,0,5.24,2.26,2.45,.92,2.23,37.6, , , ,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,7,7,7, , , , , +232494,34,1,42,0,170,72,24.9,37.5,1,0,10,.192,8,.01,0,0,28.4,263,77,17.9,190,19,475,72.9, ,0,3.76,2.33,1.08,2.16,36.49,37.6,0,4.13,2.81,.81,3.47,78.76,37.6, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,2,3,3,6, , , , , +346652,35,1,33,0,170,72,24.9,37.1,1,0,7,.102, , ,0,0,12.9,218,63,8.1,178,15.4, , , ,.14,4.1,1.96,1.61,1.22,6.95,38.6,0,3.85,3.09,.64,4.83,27.85,38, , , ,1,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,5,5,10, , , , , +346623,36,1,64,1,165,70,25.7,37.5,2,1,5,.026,12,.014,0,1,23.9,208,74,18,165,56.1,384,95.4, ,.13,6.64,3.66,2.03,1.8,6.36,39.3,0,6.6,4.07,1.42,2.87,16.46,36.2, , , 
,1,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12, , , , , +346611,37,1,19,0,173,78,26.1,36.6,1,0,2,0,3,.01,0,0,18.3,149,58,13.5,134,34.3,60,82.1, ,0,5.31,2.97,1.71,1.74,.87,45.4,0,5.32,3.09,1.46,2.12,.94,40.5, , , ,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1, , , , , +346672,38,1,38,0,173,70,23.4,35.9,2,1,8,.091,6,.017,0,0,17.9,137,40,4.4,127,34.9,51,105,65,.12,5.02,3.08,1.05,2.93,8.6,41.1,0,2.47,1.39,.64,2.17,19.52,33.6, , , ,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,9, , , , , +346697,39,0,43,0,163,55,20.7,37.1,1,0,3,.027,5,.01,0,0,13.4,140,42,14.7,108,14.7,76,58.9, ,.16,4.22,2.81,.82,3.43,1.17,39.4,0,4.22,2.81,.82,3.43,4.12,37, , , ,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5,6,6,9, , , , , +346699,40,1,69,1,168,73.5,26,36.5,1,0,1,.007,11,.01,0,0,19.1,262,60,15.7,158,46.5,810,171.8, ,5.75,12.52,9.14,1.61,5.68,128.31,35.7,0,12.55,9.14,1.61,5.68,128.31,34.3, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,6,16, , , , , +346701,41,0,41,0,158,45,18,37.5,1,0,3,.006,6,.01,0,0,16.2,154,36,12.6,114,15.2,54,57, ,0,5.51,3.4,1.57,2.17,4.1,40,0,7.08,5.24,1.31,4,4.1,35.6, , , ,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2, , , , , +346699,42,1,44,0,174,72,23.8,36.6,1,0,1,.003,8,.01,0,0,27.8,201,140,16,164,16,23,91.5, ,0,3.27,1.04,1.29,.81,.44,43.1,0,3.44,1.63,1.2,1.36,.63,40.2, , , ,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,5, , , , , +346698,43,1,15,0,172,67.5,22.8,36.5,1,0,1,0,1,.01,0,0,17.4,160,94,16.3,144,16.3,20,65.4, ,0,6.43,3.13,2.76,1.13,.6,42.3,0,5.93,3.42,1.81,1.89,.69,40.7, , , ,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,13,13,13, , , , , +346724,44,1,44,0,170,70,24.2,36.8,2,1,9,.161,9,.01,0,0,21.2,196,50,11.8,163,18.5,2780,80.8,67,.19,4.77,3.47,.88,3.94,33.8,40.8,0,2.77,2.21,.5,4.42,92.58,36.7, , , ,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,5, , , , , +346726,45,0,57,1,150,52,23.1,37.2,1,0,3,.016, , ,0,0,25.2,166,43,17.2,130,17.2, , , ,0,7.69,2.81,4.19,.67,11.01,38.7,0,5.01,2.4,2.17,1.11,11.01,38.7, , , 
,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,2,2,2, , , , , +346723,46,1,42,0,173,70,23.4,36.7,1,0,2,.004,8,.014,0,0,19.3,142,50,12.3,114,20.4,25,86.1, ,0,4.58,2.31,1.7,1.36,17.58,43.2,0,3.3,1.56,1.41,1.11,17.58,37, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,6,6, , , , , +346722,47,1,29,0,176,70,22.6,36.6,1,0,1,0,8,.01,0,0,21.2,187,88,16.2,161,16.2,67,102.4, ,.13,5.78,2.71,2.33,1.16,2.21,48,0,6.17,3.71,1.87,1.98,2.21,39.8, , , ,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,5,5, , , , , +346700,48,1,36,0,176,86,27.8,38,1,0,1,.015,4,.01,0,0,48.9,148,76,10.8,100,13.6,31,86.3, ,0,7.41,4.41,1.97,2.24,10.48,46.3,0,5.45,3.14,1.61,1.95,10.48,38.5, , , ,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,4,5,5,5, , , , , + ,49,1,66,1,176,60,19.4,36.7,1,0,2,.048,4,.01,0,0,29,293,70,12.2,206,15.6,90,71.8, ,0,8.49,5.23,2.47,2.12,3.77,37.4,0,6.68,5.07,1.08,4.69,3.77,35.8, , , ,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, , , , , , , , +346745,50,1,45,0,168,67.8,24,36.8,0,0,0,0,6,.01,0,0,12.9,102,52,7.9,86,12.3,55,88, ,.12,5.17,2.1,2.58,.81,.4,42.7,0,3.75,2.12,1.23,1.72,.98,37.7, , , ,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,6,2, , , , , , , +346671,51,0,29,0,158,49,19.6,37,0,0,0,0,1,.01,0,0,12.6,156,48,9.2,129,12.5,29,69.5, ,0,10.08,6.81,1.74,3.91,34.54,40.3,0,7.69,5.05,1.57,3.22,34.54,40.2, , , ,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2, , , , , , , +346720,52,0,36,0, , ,24.2,39.3,0,0,0,0,5,.01,0,0,16.1,143,64,15.7,120,21,94,62.5, ,0,3.72,2.23,1.11,2.01,9.73,40,0,3.19,1.97,.91,2.16,9.73,38.3, , , ,1,1,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2, , , , , , , +346765,53,1,37,0,184,82.5,24.4,36.8,1,0,3,.039,5,.01,0,0,17.8,190,54,20,160,20,119,74.6, ,0,5.2,2.59,2.01,1.29,4.1,36.6,0,4.38,2.11,1.78,1.19,4.11,36.6, , , ,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,6,6,8, , , , , +346721,54,0,63,1,157,60,24.3,37,2,1,6,.062,13,.01,0,0,20.2,256,35,13,211,31.6,882,81,64,0,3.3,1.18,1.59,.74,.8,36.3,0,1.99,1.16,.61,1.9,59.69,36, , , 
,1,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,13,13, , , , , +346725,56,1,38,0,173,80,26.7,37.2,0,0,0,0,4,.01,0,0,31.6,206,190,14.5,167,18,54,103.3, ,0,6.55,2.95,2.86,1.03,.53,46,0,4.63,2.53,1.5,1.69,11.84,38, , , ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, , , , , , , , , +346716,57,0,75,1,148,68,31,37.6,1,0,4,.022,4,.01,0,0,16.8,215,53,9.4,144,32.9,113,90, ,0,5.02,2.41,2.1,1.15,1.94,39,0,4.3,2.47,1.1,2.25,22.74,34.5, , , ,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,3,9, , , , , +346715,58,0,48,0,161,54,20.8,37.3,1,0,6,.056,2,.01,0,0,12.6,184,69,17.1,140,18.5,272,61.2, ,0,4.7,2.83,1.47,1.93,23.89,37.2,0,4.54,3.14,1.02,3.08,23.89,37.2, , , ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, , , , , , , , , +346719,59,0,58,1,148,50,22.8,36.7,1,0,9,.168,1,.01,0,0,16,213,212,10.1,165,14,26,53.2, ,0,4.35,2.43,1.69,1.44,44.35,36.1,0,3.46,1.74,1.44,1.21,63.84,36.1, , , ,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,9,9,11, , , , , +346762,60,1,64,1,175,61.5,20.1,37.3,2,1,17,.583,15,.015,0,0,28.4,203,91,13.8,170,23.5,2510,73.4,72,.2,3.34,2.32,.71,3.27,5.89,36.5,0,6.18,5.73,.39,14.69,69.19,33.4, , , ,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,3,3,15, , , , , +346,61,0,56,1,157,53,21.5,38.5,1,0,9,.162,4,.01,0,0,27.2,192,75,21.1,170,21.1,311,83.7, ,.1,4.25,2.89,.95,3.04,9.6,36.7,0,4.37,3.22,.81,3.98,57.78,35.5, , , ,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,5,5,5,8, , , , , +346760,62,1,29,0,176,93,30,36.3,1,0,5,.05,3,.01,0,0,34.6,182,55,12.9,132,12.9,20,69.9, ,0,4.54,2.71,1.41,1.92,4.99,41.1,0,3.37,1.63,1.3,1.25,6.05,41.1, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,4,7, , , , , +346761,63,0,80,1,0,61.5,23,37,3,1,10,.327,6,.01,0,0,16.4,258,66,16,195,16,1510,46,63,.3,3.09,1.75,.99,1.77,115.14,28.1,0,3.17,2.09,.57,3.67,115.14,28.1, , , ,1,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,1,1,0,0,0,0,6,6,6,6, , , , , +346747,64,0,29,0,155,54,22.5,36.5,1,0,1,.001,4,.01,0,0,12.1,108,41,8.8,92,11.2,75,69.3, ,0,4.09,1.87,1.7,1.1,.34,41.1,0,5.51,3.36,1.51,2.23,.34,36.6, , , 
,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,5,5,12, , , , , +346746,65,0,19,0,163,52,19.6,37,0,0,0,0,4,.01,0,0,30.4,149,58,16.9,114,20.6,54,75.9, ,0,6.95,3.17,3.29,.96,0,39.1,0,6.34,3.18,2.63,1.21,.5,35.2, , , ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, , , , , , , , , +346795,66,0,36,0,165,60,22,36,1,0,3,.057,2,.01,0,0,14.6,121,62,15.6,104,19.3,26,62.5, ,.15,3.96,2.46,1.01,2.44,2.78,41.5,0,3.96,2.46,1.01,2.44,10.12,33.4, , , ,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,3,3,7, , , , , +346796,67,0,62,1,160,62.5,24.4,36.9,1,0,7,.128,2,.01,0,0,20.8,207,38,14.3,170,16.3,83,56.3, ,0,3.09,1.85,.99,1.87,46,38.1,0,3.09,1.85,.99,1.87,56.02,33.2, , , ,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,3,3,3,3, , , , , +346785,68,0,53,1,158,64,25.6,36.2,1,0,2,.001,8,.01,0,1,17,125,31,9.7,101,58.5,119,61.4, ,0,3.89,2.22,1.36,1.63,.79,42.8,0,5,3.67,.98,3.74,1.74,37.9, , , ,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,4,8, , , , , +346801,69,0,66,1,158,59,23.6,36.7,1,0,6,.052,6,.01,0,0,14.4,122,46,30.4,95,30.4,359,67.4, ,0,3.74,1.73,1.48,1.17,3.57,38.7,0,5.22,3.77,.78,4.83,22.12,33.3, , , ,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,3,3,3,7, , , , , +346802,70,0,51,1,160,49,19.1,36.6,1,0,3,.038,3,.01,0,0,15.2,128,39,9,107,9,130,60.7, ,0,4.58,2.25,1.36,1.65,5.61,43.6,0,4.58,2.25,1.36,1.65,12.2,35.1, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,5,11, , , , , +346787,71,1,55,1,176,94,30.3,38,1,0,6,.026,6,.01,0,0,13.4,143,63,10.9,117,16.5,46,86.2, ,0,6.05,3.45,1.94,1.78,9.7,41.7,0,5.72,3.29,1.57,2.1,12.77,36.3, , , ,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3,4,4,4, , , , , +346794,72,1,35,0,175,82,26.8,37.1,1,0,1,.012,5,.01,0,0,14,144,94,16.2,101,24.3,144,88, ,0,3.74,1.77,1.46,1.21,5.84,41.9,0,3.14,1.5,1.15,1.3,5.84,38, , , ,1,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,3,3,3, , , , , +346707,73,0,22,0,160,40,15.6,37,0,0,0,0,6,.01,0,0,16.7,121,57,15.2,103,25.9,53,81, ,0,3.88,2.4,1.01,2.38,0,43.9,0,4.63,3.16,.89,3.55,5.59,37, , , 
,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,3, , , , , , , +346814,74,1,35,0,183,85,25.4,37.4,0,0,0,0,6,.01,0,0,20.8,139,81,6.4,110,23.5,62,88.2, ,0,3.79,2.11,.98,2.15,.46,43.7,0,3.79,2.11,.98,2.15,.46,40.8, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1, , , , , , , +346810,75,1,65,1,159,74,29.3,37.7,3,1,11,.23,13,2.6,1,1,37.6,280,45,13.6,223,51.1,3480,79.8,63,0,6.87,5.59,.81,6.9,39.03,41.2,0,6.78,5.11,.7,7.3,49.48,33.5, , , ,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,10,11,11,11, , , , , +346770,76,0,54,1,155,55,22.9,37.7,2,1,5,.059,1,.01,0,0,20.5,187,33,6.1,140,22.1,193,64.6, ,0,3.61,1.65,1.69,.98,26.8,34.7,0,4.24,2.36,1.5,1.57,40.8,35.5, , , ,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,7,7,7, , , , , +346813,77,0,64,1,165,75,27.5,37.6,1,0,3,.012,8,.01,0,0,15.1,180,58,13,153,15.5,338,59.3, ,.13,3.65,1.54,1.78,.87,5.14,36.3,0,3.49,1.66,1.47,1.13,5.14,37.8, , , ,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,11,11,15, , , , , + ,78,0,58,1,160,58,22.7,37.5,1,0,2,0,4,.01,0,0,15.5,185,68,17.7,135,17.8,34,70.8, ,0,7.14,4.3,2.23,1.93,.64,39.4,0,6.86,4.88,1.41,3.46,.93,38.8, , , ,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, , , , , , , , + ,79,1,39,0,170,70,24.2,37,1,0,5,.013,5,.01,0,0,21.8,159,92,11.1,130,14.3,65,95.2, ,0,4.72,2.28,1.95,1.17,1.16,44.5,0,4.42,2.56,1.36,1.88,1.16,38.8, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,11,11,11, , , , , +346832,81,0,32,0,165,60,22,38,2,1,9,.14,11,.01,0,0,21.3,265,104,8.8,204,13,251,64, ,0,2.63,1.49,.89,1.67,43.35,39.6,0,2.12,1.29,.53,2.43,111.02,36.7, , , ,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,4,4,6, , , , , +346855,82,0,64,1,160,60.1,23.5,36.6,2,1,4,.042,6,.01,0,0,19.7,141,33,8.3,116,11.9,230,72.6, ,0,3.08,1.78,1.05,1.7,.05,42.3,0,2.16,1.55,.46,3.37,22.2,35.3, , , ,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,10,11,11,20, , , , , +346871,83,0,59,1,159,52,20.6,36.3,1,0,3,.001,2,.01,0,0,21.5,169,47,15.7,144,15.7,108,55.4,71,0,3.76,2.94,.58,5.07,0,39.8,0,3.76,2.94,.58,5.07,3.17,39.8, , , 
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, , , , , , , , , +346872,84,0,32,0,157,52,21.1,36.3,1,0,1,.001,6,.01,0,0,12.7,137,45,13.7,115,14.5,32,62.1, ,.13,4.64,1.89,2.22,.85,.31,36.9,0,3.64,2.03,1.13,1.8,1.16,36.9, , , ,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,10,10,16, , , , , +346875,85,0,37,0,156,41,16.8,36.5,0,0,0,0,2,.01,0,0,10.4,123,45,7.7,99,16.8,70,53.5, ,0,4.45,1.47,1.47,1,0,39,0,4.45,1.47,1.47,1,.06,39, , , ,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,2,3, , , , , , , +346889,86,0,71,1,158,53,21.2,36.5,1,0,3,.006,4,.01,0,0,17.2,127,26,8.3,103,12.3,856,56.9, ,0,4.59,2.18,1.99,1.1,4.83,35.9,0,5.12,3.59,1.11,3.23,4.83,35.9, , , ,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,9,10,10,10, , , , , +346885,87,0,33,0,165,62.5,23,36.2,1,0,2,.003,4,.01,0,0,12,185,51,16.3,151,16.3,74,73.3, ,0,7.21,3.3,3.26,1.01,0,40.2,0,8.3,5.04,2.48,2.03,.25,39.2, , , ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, , , , , , , , , +346878,89,1,67,1,170,72,24.9,37.8,3,1,7,.109,16,.04,1,1,21.2,148,50,12.9,126,16.8,8650,506, ,.36,4.25,3.47,.37,9.38,32.01,36.7,0,7.5,7.07,.26,27.19,183.56,32.7, , , ,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,5, , , , , +346985,90,0,59,1,160,60,23.4,37.8,1,0,6,.086,6,.01,0,0,34.5,202,78,14.5,163,15.6,197,66.1, ,0,4.27,1.62,2.04,.79,5.49,38.3,0,5.29,3.2,1.5,2.13,5.49,38.2, , , ,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,16,16,16,16, , , , , +346924,91,0,36,0,160,75,29.3,37.3,1,0,3,.002,4,.01,0,0,15.5,127,60,11.1,99,18.8,54,78.2, ,0,8.1,4.77,2.54,1.88,4.1,41,0,7.99,5.61,1.64,3.42,6.8,39.3, , , ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, , , , , , , , , +347040,92,1,56,1,170,70,24.2,36.7,1,0,3,.018,6,.01,0,0,19.6,151,36,21.4,134,21.4,46,102.5, ,0,5.08,2.78,1.74,1.6,8.17,45.8,0,5.6,3.37,1.59,2.12,8.17,39.5, , , ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0, , , , , , , , , +347039,93,1,25,0,175,73,23.8,36.9,0,0,0,0,6,.01,0,0,31.5,173,108,12.2,127,13.1,77,86.8, ,0,7.49,3.28,3.37,.97,.56,46.4,0,7.08,3.42,2.62,1.31,1.34,43.3, , , 
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, , , , , , , , , +347042,94,1,63,1,173,90,30.1,36.4,2,1,6,.113,7,.01,0,0,25.7,220,100,10.3,160,14.3,433,75.4, ,0,7.15,4.79,1.53,3.13,22.69,39.2,0,8.22,5.75,1.46,3.94,22.69,36.5, , , ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, , , , , , , , , +347041,95,1,39,0,170,72.5,25.1,36.7,1,0,2,.01,5,.01,0,0,25.6,153,141,6.4,120,10.4,90,86.4, ,0,6.42,3.99,1.96,2.04,0,47.6,0,5.42,3.9,.97,4.02,1.44,43.9, , , ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, , , , , , , , , +347052,96,0,62,1,158,63,25.2,36.7,2,1,8,.093,3,.01,0,0,29.7,209,38,13.7,161,16.6,209,56.1, ,0,6.43,4.48,1.49,3.01,38.85,36,0,6.43,4.48,1.49,3.01,38.85,35.7, , , ,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,3,3,3,3, , , , , +347057,97,0,63,1,155,54,22.5,36.5,1,0,5,.009,5,.01,0,0,16.6,201,58,16.9,162,16.9,274,55.4, ,0,7.36,4.21,2.35,1.79,.4,38.5,0,6.17,3.75,1.6,2.34,.95,37.3, , , ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, , , , , , , , , +347084,98,0,37,0,163,52,19.6,36.7,0,0,0,0,3,.01,0,0,11.2,134,68,12,113,17.1,20,64, ,0,5.77,2.74,2.25,1.22,.96,38.8,0,5.22,2.22,2.09,1.06,.96,38.8, , , ,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4, , , , , , , diff --git a/lime.png b/lime.png new file mode 100644 index 0000000..9f53f32 Binary files /dev/null and b/lime.png differ diff --git a/model/2-type-model-4-ct.pkl b/model/2-type-model-4-ct.pkl new file mode 100644 index 0000000..8d8be0c Binary files /dev/null and b/model/2-type-model-4-ct.pkl differ diff --git a/model/2-type-model-4.pkl b/model/2-type-model-4.pkl new file mode 100644 index 0000000..8848446 Binary files /dev/null and b/model/2-type-model-4.pkl differ diff --git a/model/4-type-model-4-ct.pkl b/model/4-type-model-4-ct.pkl new file mode 100644 index 0000000..abd36d3 Binary files /dev/null and b/model/4-type-model-4-ct.pkl differ diff --git a/model/4-type-model-4.pkl b/model/4-type-model-4.pkl new file mode 100644 index 0000000..5b1af26 Binary files /dev/null and b/model/4-type-model-4.pkl differ