{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prerequisites"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# `scikit-learn` is the real PyPI name (the `sklearn` alias is deprecated).\n",
    "# `tables` (PyTables) backs pandas.HDFStore and `vaex` is imported below.\n",
    "# %pip installs into the environment of the running kernel.\n",
    "%pip install numpy matplotlib pandas scikit-learn tables vaex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -V"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports used by the sections below live in this one cell.\n",
    "import pickle\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import vaex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print floats in fixed-point notation instead of scientific notation.\n",
    "np.set_printoptions(suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# NOTE: unpickling can execute arbitrary code; only load res.pickle from a trusted source.\n",
    "# The context manager closes the file handle even if loading fails.\n",
    "with open(\"res.pickle\", \"rb\") as fh:\n",
    "    X_raw = pickle.load(fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transpose from a separate name so that re-running this cell is idempotent\n",
    "# (the previous `X = X.T` flipped the matrix back on every second run).\n",
    "X = X_raw.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 493)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 100 evenly spaced values per parameter; 100 * 100 * 100 combinations in total.\n",
    "layer_1 = np.linspace(10, 250, 100, endpoint=True)\n",
    "layer_2 = np.linspace(0.5, 10, 100, endpoint=True)\n",
    "layer_3 = np.linspace(710, 1700, 100, endpoint=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One label row (layer_3, layer_2, layer_1) per row of X.\n",
    "y = np.zeros((X.shape[0], 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Enumerate every (layer_3, layer_2, layer_1) combination in the same order as\n",
    "# three nested loops with layer_3 outermost and layer_1 innermost.\n",
    "grid = np.meshgrid(layer_3, layer_2, layer_1, indexing=\"ij\")\n",
    "# Slice assignment raises if the number of combinations differs from X's row count.\n",
    "y[:] = np.stack(grid, axis=-1).reshape(-1, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 3)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## HDF5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write to HDF5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap the arrays in DataFrames so they can be stored in the HDF5 file.\n",
    "df_X = pd.DataFrame(X)\n",
    "df_y = pd.DataFrame(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:MainThread:numexpr.utils:NumExpr defaulting to 8 threads.\n"
     ]
    }
   ],
   "source": [
    "# The store stays open until the explicit store.close() cell below.\n",
    "store = pd.HDFStore('res.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features are stored under key 'X', labels under key 'y'.\n",
    "store['X'] = df_X\n",
    "store['y'] = df_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "store.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read from HDF5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read back the frame stored under key 'X'.\n",
    "res = pd.read_hdf('res.h5', key='X')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 3)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Vaex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Python 3.8.3\n"
     ]
    }
   ],
   "source": [
    "!python -V"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 710., 710., 710., ..., 1700., 1700., 1700.])"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source":
[
    "# Column 0 of y holds the layer_3 value of each row.\n",
    "y[:, 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Expose the in-memory arrays as vaex columns named `x` and `y`.\n",
    "vaex_df = vaex.from_arrays(x=X, y=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 493)"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of the array passed to vaex as column `x`.\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000000, 3)"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of the array passed to vaex as column `y`.\n",
    "y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
| # | x | y |
|---|---|---|
| 0 | '[0.93921708 1. 0.96758555 0.88402555 0.... | [710. 0.5 10. ] |
| 1 | '[0.9392044 1. 0.96759943 0.88406149 0.... | [710. 0.5 12.42424242] |
| 2 | '[0.93918573 1. 0.9676289 0.88417059 0.... | [710. 0.5 14.84848485] |
| 3 | '[0.93915796 1. 0.96770218 0.88444087 0.... | [710. 0.5 17.27272727] |
| 4 | '[0.93910895 1. 0.96785629 0.88496242 0.... | [710. 0.5 19.6969697] |
| ... | ... | ... |
| 999,995 | '[0.90888059 1. 0.99898726 0.9374508 0.... | [1700. 10. 240.3030303] |
| 999,996 | '[0.90857156 1. 0.99928981 0.93796907 0.... | [1700. 10. 242.72727273] |
| 999,997 | '[0.90826533 1. 0.99958941 0.93848211 0.... | [1700. 10. 245.15151515] |
| 999,998 | '[0.90796191 1. 0.99988598 0.93898997 0.... | [1700. 10. 247.57575758] |
| 999,999 | '[0.90749824 0.99982035 1. 0.93932393 0.... | [1700. 10. 250.] |