{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prerequisites"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!pip install numpy matplotlib pandas sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -V"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.set_printoptions(suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "X = pickle.load( open( \"res.pickle\", \"rb\" ) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 493)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "layer_1 = np.linspace(10, 250, 100, endpoint=True)\n",
    "layer_2 = np.linspace(0.5, 10, 100, endpoint=True)\n",
    "layer_3 = np.linspace(710, 1700, 100, endpoint=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = np.zeros((X.shape[0], 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "i = 0\n",
    "for l3 in layer_3:\n",
    "    for l2 in layer_2:\n",
    "        for l1 in layer_1:\n",
    "            y[i] = [l3, l2, l1]\n",
    "            i = i + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 3)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## HDF5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Write to HDF5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_X = pd.DataFrame(X)\n",
    "df_y = pd.DataFrame(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:MainThread:numexpr.utils:NumExpr defaulting to 8 threads.\n"
     ]
    }
   ],
   "source": [
    "store = pd.HDFStore('res.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "store['X'] = df_X\n",
    "store['y'] = df_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "store.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read from HDF5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = pd.read_hdf('res.h5', 'X')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 3)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Vaex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "import vaex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Python 3.8.3\n"
     ]
    }
   ],
   "source": [
    "!python -V"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 710.,  710.,  710., ..., 1700., 1700., 1700.])"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y[:, 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "vaex_df = vaex.from_arrays(x=X, y=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 493)"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 3)"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead>\n",
       "<tr><th>#                                  </th><th>x                                                  </th><th>y                                          </th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td><i style='opacity: 0.6'>0</i>      </td><td>&#x27;[0.93921708 1.         0.96758555 0.88402555 0....</td><td>[710.    0.5  10. ]                        </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>1</i>      </td><td>&#x27;[0.9392044  1.         0.96759943 0.88406149 0....</td><td>[710.           0.5         12.42424242]   </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>2</i>      </td><td>&#x27;[0.93918573 1.         0.9676289  0.88417059 0....</td><td>[710.           0.5         14.84848485]   </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>3</i>      </td><td>&#x27;[0.93915796 1.         0.96770218 0.88444087 0....</td><td>[710.           0.5         17.27272727]   </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>4</i>      </td><td>&#x27;[0.93910895 1.         0.96785629 0.88496242 0....</td><td>[710.          0.5        19.6969697]      </td></tr>\n",
       "<tr><td>...                                </td><td>...                                                </td><td>...                                        </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>999,995</i></td><td>&#x27;[0.90888059 1.         0.99898726 0.9374508  0....</td><td>[1700.          10.         240.3030303]   </td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>999,996</i></td><td>&#x27;[0.90857156 1.         0.99928981 0.93796907 0....</td><td>[1700.           10.          242.72727273]</td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>999,997</i></td><td>&#x27;[0.90826533 1.         0.99958941 0.93848211 0....</td><td>[1700.           10.          245.15151515]</td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>999,998</i></td><td>&#x27;[0.90796191 1.         0.99988598 0.93898997 0....</td><td>[1700.           10.          247.57575758]</td></tr>\n",
       "<tr><td><i style='opacity: 0.6'>999,999</i></td><td>&#x27;[0.90749824 0.99982035 1.         0.93932393 0....</td><td>[1700.   10.  250.]                        </td></tr>\n",
       "</tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "#        x                                                    y\n",
       "0        '[0.93921708 1.         0.96758555 0.88402555 0....  [710.    0.5  10. ]\n",
       "1        '[0.9392044  1.         0.96759943 0.88406149 0....  [710.           0.5         12.42424242]\n",
       "2        '[0.93918573 1.         0.9676289  0.88417059 0....  [710.           0.5         14.84848485]\n",
       "3        '[0.93915796 1.         0.96770218 0.88444087 0....  [710.           0.5         17.27272727]\n",
       "4        '[0.93910895 1.         0.96785629 0.88496242 0....  [710.          0.5        19.6969697]\n",
       "...      ...                                                  ...\n",
       "999,995  '[0.90888059 1.         0.99898726 0.9374508  0....  [1700.          10.         240.3030303]\n",
       "999,996  '[0.90857156 1.         0.99928981 0.93796907 0....  [1700.           10.          242.72727273]\n",
       "999,997  '[0.90826533 1.         0.99958941 0.93848211 0....  [1700.           10.          245.15151515]\n",
       "999,998  '[0.90796191 1.         0.99988598 0.93898997 0....  [1700.           10.          247.57575758]\n",
       "999,999  '[0.90749824 0.99982035 1.         0.93932393 0....  [1700.   10.  250.]"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vaex_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "could not broadcast input array from shape (100000,493) into shape (100000)",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-61-5408838d5df1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mvaex_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexport_hdf5\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'res.hdf5'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\vaex\\dataframe.py\u001b[0m in \u001b[0;36mexport_hdf5\u001b[1;34m(self, path, column_names, byteorder, shuffle, selection, progress, virtual, sort, ascending)\u001b[0m\n\u001b[0;32m   5802\u001b[0m         \"\"\"\n\u001b[0;32m   5803\u001b[0m         \u001b[1;32mimport\u001b[0m \u001b[0mvaex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexport\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 5804\u001b[1;33m         \u001b[0mvaex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexport\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexport_hdf5\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumn_names\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbyteorder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvirtual\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mvirtual\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msort\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msort\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mascending\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   5805\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   5806\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mexport_fits\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumn_names\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvirtual\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msort\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\vaex\\export.py\u001b[0m in \u001b[0;36mexport_hdf5\u001b[1;34m(dataset, path, column_names, byteorder, shuffle, selection, progress, virtual, sort, ascending)\u001b[0m\n\u001b[0;32m    310\u001b[0m     \u001b[0mkwargs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlocals\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    311\u001b[0m     \u001b[1;32mimport\u001b[0m \u001b[0mvaex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhdf5\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexport\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 312\u001b[1;33m     \u001b[0mvaex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhdf5\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexport\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexport_hdf5\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    313\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    314\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\vaex\\hdf5\\export.py\u001b[0m in \u001b[0;36mexport_hdf5\u001b[1;34m(dataset, path, column_names, byteorder, shuffle, selection, progress, virtual, sort, ascending)\u001b[0m\n\u001b[0;32m    243\u001b[0m     \u001b[0mdataset_output\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvaex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhdf5\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mHdf5MemoryMapped\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mwrite\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    244\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 245\u001b[1;33m     column_names = vaex.export._export(dataset_input=dataset, dataset_output=dataset_output, path=path, random_index_column=random_index_name,\n\u001b[0m\u001b[0;32m    246\u001b[0m                                        \u001b[0mcolumn_names\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcolumn_names\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbyteorder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbyteorder\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    247\u001b[0m                                        progress=progress, sort=sort, ascending=ascending)\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\vaex\\export.py\u001b[0m in \u001b[0;36m_export\u001b[1;34m(dataset_input, dataset_output, random_index_column, path, column_names, byteorder, shuffle, selection, progress, virtual, sort, ascending)\u001b[0m\n\u001b[0;32m    139\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mfuture\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mfutures\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    140\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 141\u001b[1;33m                 \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0.1\u001b[0m\u001b[1;33m/\u001b[0m\u001b[1;36m4\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    142\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mconcurrent\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfutures\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTimeoutError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    143\u001b[0m                 \u001b[0mdone\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\concurrent\\futures\\_base.py\u001b[0m in \u001b[0;36mresult\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m    430\u001b[0m                 \u001b[1;32mraise\u001b[0m \u001b[0mCancelledError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    431\u001b[0m             \u001b[1;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_state\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mFINISHED\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 432\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__get_result\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    433\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    434\u001b[0m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_condition\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\concurrent\\futures\\_base.py\u001b[0m in \u001b[0;36m__get_result\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    386\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__get_result\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    387\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_exception\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 388\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_exception\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    389\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    390\u001b[0m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_result\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\concurrent\\futures\\thread.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m     55\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     56\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 57\u001b[1;33m             \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     58\u001b[0m         \u001b[1;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     59\u001b[0m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_exception\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda3\\lib\\site-packages\\vaex\\export.py\u001b[0m in \u001b[0;36m_export_column\u001b[1;34m(dataset_input, dataset_output, column_name, shuffle, sort, selection, N, order_array, order_array_inverse, progress_status)\u001b[0m\n\u001b[0;32m    211\u001b[0m                             \u001b[0mto_array\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mtarget_set_item\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfilled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    212\u001b[0m                         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 213\u001b[1;33m                             \u001b[0mto_array\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mtarget_set_item\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    214\u001b[0m                         \u001b[0mto_offset\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[0mno_values\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    215\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mValueError\u001b[0m: could not broadcast input array from shape (100000,493) into shape (100000)"
     ]
    }
   ],
   "source": [
    "vaex_df.export_hdf5(path='res.hdf5', progress=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch.utils.data import Dataset, TensorDataset, DataLoader\n",
    "from torch.utils.data.dataset import random_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "property = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train_tensor = torch.from_numpy(X_train).float()\n",
    "y_train_tensor = torch.from_numpy(y_train[:, property]).float()\n",
    "\n",
    "x_test_tensor = torch.from_numpy(X_test).float()\n",
    "y_test_tensor = torch.from_numpy(y_test[:, property]).float()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Builds dataset with ALL data\n",
    "origin_train_dataset = TensorDataset(x_train_tensor, y_train_tensor)\n",
    "\n",
    "# Splits randomly into train and validation datasets\n",
    "train_dataset, val_dataset = random_split(origin_train_dataset, [int(x_train_tensor.shape[0] * 0.9), int(x_train_tensor.shape[0] * 0.1)])\n",
    "\n",
    "# Builds a loader for each dataset to perform mini-batch gradient descent\n",
    "train_loader = DataLoader(dataset=train_dataset, batch_size=2000)\n",
    "val_loader = DataLoader(dataset=val_dataset, batch_size=2000)\n",
    "\n",
    "test_dataset = TensorDataset(x_test_tensor, y_test_tensor)\n",
    "test_loader  = DataLoader(dataset=test_dataset, batch_size=2000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch.nn as nn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Net(nn.Module):\n",
    "\n",
    "    def __init__(self):\n",
    "        super(Net, self).__init__()\n",
    "        self.bn1 = nn.BatchNorm1d(X.shape[1])\n",
    "        self.fc1 = nn.Linear(X.shape[1], 100)\n",
    "        self.bn2 = nn.BatchNorm1d(100)\n",
    "        self.fc2 = nn.Linear(100, 50)\n",
    "        self.fc3 = nn.Linear(50, 10)\n",
    "        self.fc4 = nn.Linear(10, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.bn1(x)\n",
    "        x = self.fc1(x)\n",
    "        x = torch.tanh(x)\n",
    "        x = self.bn2(x)\n",
    "        x = self.fc2(x)\n",
    "        x = torch.tanh(x)\n",
    "        x = self.fc3(x)\n",
    "        x = torch.relu(x)\n",
    "        x = self.fc4(x)\n",
    "        x = torch.relu(x)\n",
    "        return x"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch.optim as optim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_epochs = 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_train_step(model, loss_fn, optimizer):\n",
    "    def train_step(x, y):\n",
    "        model.train()\n",
    "        yh = model(x)\n",
    "        yh = torch.reshape(yh, (-1,))\n",
    "        loss = loss_fn(y, yh)\n",
    "        loss.backward()\n",
    "        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)\n",
    "        optimizer.step()\n",
    "        optimizer.zero_grad()\n",
    "        return loss.item()\n",
    "    return train_step"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Net().to(device)\n",
    "\n",
    "loss_fn = nn.MSELoss(reduction='mean')\n",
    "\n",
    "# optimizer = optim.SGD(model.parameters(), lr=0.01)\n",
    "optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)\n",
    "\n",
    "train_step = make_train_step(model, loss_fn, optimizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Net(\n",
       "  (bn1): BatchNorm1d(493, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
       "  (fc1): Linear(in_features=493, out_features=100, bias=True)\n",
       "  (bn2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
       "  (fc2): Linear(in_features=100, out_features=50, bias=True)\n",
       "  (fc3): Linear(in_features=50, out_features=10, bias=True)\n",
       "  (fc4): Linear(in_features=10, out_features=1, bias=True)\n",
       ")"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] Training loss: 17419.833\t Validation loss: 11374.671\n",
      "[2] Training loss: 5437.492\t Validation loss: 981.141\n",
      "[3] Training loss: 170.984\t Validation loss: 17.300\n",
      "[4] Training loss: 16.513\t Validation loss: 14.237\n",
      "[5] Training loss: 14.340\t Validation loss: 9.781\n",
      "[6] Training loss: 12.084\t Validation loss: 9.245\n",
      "[7] Training loss: 10.989\t Validation loss: 9.115\n",
      "[8] Training loss: 10.344\t Validation loss: 9.222\n",
      "[9] Training loss: 9.963\t Validation loss: 7.374\n",
      "[10] Training loss: 9.699\t Validation loss: 9.329\n",
      "[11] Training loss: 9.408\t Validation loss: 9.093\n",
      "[12] Training loss: 9.157\t Validation loss: 10.942\n",
      "[13] Training loss: 9.157\t Validation loss: 9.646\n",
      "[14] Training loss: 9.015\t Validation loss: 8.723\n",
      "[15] Training loss: 8.873\t Validation loss: 8.430\n",
      "[16] Training loss: 8.702\t Validation loss: 8.981\n",
      "[17] Training loss: 8.599\t Validation loss: 9.332\n"
     ]
    }
   ],
   "source": [
    "training_losses = []\n",
    "validation_losses = []\n",
    "\n",
    "for epoch in range(n_epochs):\n",
    "    batch_losses = []\n",
    "    for x_batch, y_batch in train_loader:\n",
    "        x_batch = x_batch.to(device)\n",
    "        y_batch = y_batch.to(device)\n",
    "        loss = train_step(x_batch, y_batch)\n",
    "        batch_losses.append(loss)\n",
    "    training_loss = np.mean(batch_losses)\n",
    "    training_losses.append(training_loss)\n",
    "\n",
    "    with torch.no_grad():\n",
    "        val_losses = []\n",
    "        for x_val, y_val in val_loader:\n",
    "            x_val = x_val.to(device)\n",
    "            y_val = y_val.to(device)\n",
    "            model.eval()\n",
    "            yh = model(x_val)\n",
    "            yh = torch.reshape(yh, (-1,))\n",
    "            val_loss = loss_fn(y_val, yh).item()\n",
    "            val_losses.append(val_loss)\n",
    "        validation_loss = np.mean(val_losses)\n",
    "        validation_losses.append(validation_loss)\n",
    "\n",
    "    print(f\"[{epoch+1}] Training loss: {training_loss:.3f}\\t Validation loss: {validation_loss:.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# model.state_dict()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mean_absolute_percentage_error(y_true, y_pred):\n",
    "    return torch.mean(torch.abs((y_true - y_pred) / y_true)) * 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_test_tensor = x_test_tensor.to(device)\n",
    "y_test_tensor = y_test_tensor.to(device)\n",
    "y_pred = model(x_test_tensor).squeeze()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(3279.8179, grad_fn=<MseLossBackward>)\n"
     ]
    }
   ],
   "source": [
    "test_loss = loss_fn(y_test_tensor, y_pred)\n",
    "print(test_loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The mean of absolute percentage error for L1: 2.51%\n"
     ]
    }
   ],
   "source": [
    "print(f\"The mean of absolute percentage error for L1: {mean_absolute_percentage_error(y_test_tensor.cpu(), y_pred.cpu()):.2f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The mean of absolute percentage error for L2: 3.12%\n"
     ]
    }
   ],
   "source": [
    "print(f\"The mean of absolute percentage error for L2: {mean_absolute_percentage_error(y_test_tensor.cpu(), y_pred.cpu()):.2f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The mean of absolute percentage error for L3: 3.09%\n"
     ]
    }
   ],
   "source": [
    "print(f\"The mean of absolute percentage error for L3: {mean_absolute_percentage_error(y_test_tensor.cpu(), y_pred.cpu()):.2f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Real-world case"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "case1 = pd.read_csv('case1.txt', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "case1 = np.reshape(np.array(case1), [1, -1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "case1_tensor = torch.from_numpy(case1).float()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "model.eval()\n",
    "pred_1 = model(case1_tensor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[150.0873]], grad_fn=<ReluBackward0>)"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pred_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[1.1310]], grad_fn=<ReluBackward0>)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pred_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[1472.1879]], grad_fn=<ReluBackward0>)"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pred_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "case1 = pd.read_csv('case2.txt', header=None)\n",
    "case1 = np.reshape(np.array(case1), [1, -1])\n",
    "case1_tensor = torch.from_numpy(case1).float()\n",
    "model.eval()\n",
    "pred_1 = model(case1_tensor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[2.4505]], grad_fn=<ReluBackward0>)"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pred_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[1467.8439]], grad_fn=<ReluBackward0>)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pred_1"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}