You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

733 lines
124 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "f1396231-0139-456f-96d8-6f18199e8e25",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import math\n",
"from sklearn.model_selection import train_test_split\n",
"import scipy\n",
"from scipy.stats import ks_2samp"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3521f41e-c9d2-42f3-b657-45634f3fd8ac",
"metadata": {},
"outputs": [],
"source": [
"# Load the full dataset from the local CSV export\n",
"df = pd.read_csv(r'/home/unipi/v.vichi3/Desktop/dataframe.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ab94adc5-854e-49cd-9a9f-bce8ffd4ee87",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>a</th>\n",
" <th>e</th>\n",
" <th>i</th>\n",
" <th>om</th>\n",
" <th>w</th>\n",
" <th>ma</th>\n",
" <th>H-value</th>\n",
" <th>MOID</th>\n",
" <th>EPOCH</th>\n",
" <th>PHA</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'0000001'</td>\n",
" <td>2.662454</td>\n",
" <td>0.533875</td>\n",
" <td>26.69427</td>\n",
" <td>19.950131</td>\n",
" <td>284.514731</td>\n",
" <td>14.514345</td>\n",
" <td>9.45</td>\n",
" <td>0.4615</td>\n",
" <td>59945.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>'0000002'</td>\n",
" <td>1.457965</td>\n",
" <td>0.222633</td>\n",
" <td>10.82898</td>\n",
" <td>209.642798</td>\n",
" <td>343.753667</td>\n",
" <td>338.406046</td>\n",
" <td>11.16</td>\n",
" <td>0.1356</td>\n",
" <td>59945.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>'0000003'</td>\n",
" <td>1.893819</td>\n",
" <td>0.538294</td>\n",
" <td>41.18978</td>\n",
" <td>286.606414</td>\n",
" <td>287.585635</td>\n",
" <td>83.688834</td>\n",
" <td>12.40</td>\n",
" <td>0.1162</td>\n",
" <td>59945.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>'0000004'</td>\n",
" <td>2.001212</td>\n",
" <td>0.448742</td>\n",
" <td>17.44771</td>\n",
" <td>214.338458</td>\n",
" <td>245.558687</td>\n",
" <td>15.816629</td>\n",
" <td>12.60</td>\n",
" <td>0.2447</td>\n",
" <td>59945.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>'0000005'</td>\n",
" <td>4.221445</td>\n",
" <td>0.713243</td>\n",
" <td>30.98228</td>\n",
" <td>206.677675</td>\n",
" <td>330.127884</td>\n",
" <td>117.375359</td>\n",
" <td>12.90</td>\n",
" <td>0.2592</td>\n",
" <td>59945.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID a e i om w \\\n",
"0 '0000001' 2.662454 0.533875 26.69427 19.950131 284.514731 \n",
"1 '0000002' 1.457965 0.222633 10.82898 209.642798 343.753667 \n",
"2 '0000003' 1.893819 0.538294 41.18978 286.606414 287.585635 \n",
"3 '0000004' 2.001212 0.448742 17.44771 214.338458 245.558687 \n",
"4 '0000005' 4.221445 0.713243 30.98228 206.677675 330.127884 \n",
"\n",
" ma H-value MOID EPOCH PHA \n",
"0 14.514345 9.45 0.4615 59945.0 0.0 \n",
"1 338.406046 11.16 0.1356 59945.0 0.0 \n",
"2 83.688834 12.40 0.1162 59945.0 0.0 \n",
"3 15.816629 12.60 0.2447 59945.0 0.0 \n",
"4 117.375359 12.90 0.2592 59945.0 0.0 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows to check the column layout\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "21f6c17e-a521-455d-82c3-0f6aeea7df77",
"metadata": {},
"outputs": [],
"source": [
"# Select the features and the target by column name rather than by position, so\n",
"# that a change in the CSV column order cannot silently pick the wrong data.\n",
"FEATURE_COLUMNS = ['a', 'e', 'i', 'om', 'w']\n",
"TARGET_COLUMN = 'MOID'\n",
"kep = df[FEATURE_COLUMNS]\n",
"y = df[TARGET_COLUMN]\n",
"# Convert to plain NumPy arrays for train_test_split\n",
"X = kep.to_numpy()\n",
"y = y.to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "505356c7-da2b-4929-b65b-f775f835cc5a",
"metadata": {},
"outputs": [],
"source": [
"# Split the data into training and test sets.\n",
"# random_state pins the shuffle so that the split, and every statistic computed\n",
"# from it below, is reproducible across kernel restarts.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, train_size=750000, random_state=42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9458e17f-873a-4355-964b-6c162eebd71e",
"metadata": {},
"outputs": [],
"source": [
"# Wrap the split arrays back into labelled DataFrames for the comparisons below\n",
"feature_labels = ['a', 'e', 'i', 'om', 'w']\n",
"target_labels = ['MOID']\n",
"df_train, df_test = (\n",
"    pd.DataFrame(split, columns=feature_labels) for split in (X_train, X_test)\n",
")\n",
"ydf_train, ydf_test = (\n",
"    pd.DataFrame(split, columns=target_labels) for split in (y_train, y_test)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "32d4bbfb-26fa-4add-8246-e1c267667b28",
"metadata": {},
"outputs": [],
"source": [
"# Summary statistics (count, mean, std, min, quartiles, max) of each split, used\n",
"# below to compare the training set against the test set\n",
"A_train, A_test = df_train.describe(), df_test.describe()\n",
"B_train, B_test = ydf_train.describe(), ydf_test.describe()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "af994684-c4b9-461f-8c0e-e13e0cd9b909",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>e</th>\n",
" <th>i</th>\n",
" <th>om</th>\n",
" <th>w</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.002937</td>\n",
" <td>-0.000607</td>\n",
" <td>0.027988</td>\n",
" <td>0.420302</td>\n",
" <td>0.343676</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>-0.001472</td>\n",
" <td>0.000403</td>\n",
" <td>0.064839</td>\n",
" <td>-0.227066</td>\n",
" <td>0.136723</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>-0.000002</td>\n",
" <td>-0.001717</td>\n",
" <td>-0.049321</td>\n",
" <td>-0.000011</td>\n",
" <td>-0.023646</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.005710</td>\n",
" <td>-0.000832</td>\n",
" <td>0.016489</td>\n",
" <td>1.228961</td>\n",
" <td>0.372439</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.009029</td>\n",
" <td>-0.000021</td>\n",
" <td>0.066847</td>\n",
" <td>0.206380</td>\n",
" <td>0.186781</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>0.000243</td>\n",
" <td>-0.000705</td>\n",
" <td>-0.010827</td>\n",
" <td>-0.018515</td>\n",
" <td>0.263608</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>0.027082</td>\n",
" <td>0.000222</td>\n",
" <td>3.584705</td>\n",
" <td>0.001342</td>\n",
" <td>0.022134</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a e i om w\n",
"mean 0.002937 -0.000607 0.027988 0.420302 0.343676\n",
"std -0.001472 0.000403 0.064839 -0.227066 0.136723\n",
"min -0.000002 -0.001717 -0.049321 -0.000011 -0.023646\n",
"25% 0.005710 -0.000832 0.016489 1.228961 0.372439\n",
"50% 0.009029 -0.000021 0.066847 0.206380 0.186781\n",
"75% 0.000243 -0.000705 -0.010827 -0.018515 0.263608\n",
"max 0.027082 0.000222 3.584705 0.001342 0.022134"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The first row of describe() is the count of objects, so it is skipped;\n",
"# display train-minus-test for the remaining feature statistics\n",
"A_train.iloc[1:].sub(A_test.iloc[1:])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "17336f68-0547-40f9-a003-a86422fa899e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: >"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgMAAAGdCAYAAACPX3D5AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy81sbWrAAAACXBIWXMAAA9hAAAPYQGoP6dpAABJJklEQVR4nO3deVgU9R8H8PeCsILJIcrlCVqEHIrgAYqKeGLeiVkpmnnl8VMyFa8stU00Q/M2LySVlDLLuxTU1BLFI+/KxBBEDg88lmt/f5hbuwjsMcuyzvvVM88T35n5zmfmQfaz32skCoVCASIiIhItM2MHQERERMbFZICIiEjkmAwQERGJHJMBIiIikWMyQEREJHJMBoiIiESOyQAREZHIMRkgIiISOSYDREREIlfF2AE886TQ2BGQGHH9Tf3cvv/E2CGYLAtzfhfTR207S4PWb+U3VrC6HqcsFawuQ6k0yQAREVGlIRFXsiauuyUiIqIS2DJARESkTiIxdgQVii0DRERE6iRmwm1aWLFiBXx9fWFjYwMbGxsEBgZiz549pR6fmJgIiURSYrt8+bJW12XLABERkTojtQzUqVMHn376KRo1agQA2LhxI3r16oWUlBR4eXmVet6VK1dgY2Oj/LlWrVpaXZfJABERUSXRo0cPlZ/nzZuHFStW4MSJE2UmA46OjrCzs9P5uuwmICIiUidgN4FcLsf9+/dVNrlcXm4IRUVF2Lp1Kx4+fIjAwMAyj/Xz84OLiwtCQ0Nx6NAhrW+XyQAREZE6iUSwTSaTwdbWVmWTyWSlXvr8+fN46aWXIJVKMWrUKHz77bdo3Ljxc491cXHB6tWrkZCQgG+++QYeHh4IDQ3F4cOHtbtdhaJyLLvCRYfIGCrHb7/p4qJDuuOiQ/ox+KJDLT8QrK67h+eWaAmQSqWQSqXPPT4/Px+pqam4e/cuEhIS8OWXXyIpKanUhEBdjx49IJFIsHPnTo1j1HnMQHFxMX7//XdkZmaiuLhYZV/btm11rZaIiMj4BFx0qKwP/uextLRUDiAMCAjAyZMnsXjxYqxatUqj81u1aoW4uDitYtQpGThx4gTefPNN3LhxA+oNCxKJBEVFRbpUS0REVDlUonUGFAqFRmMMnklJSYGLi4tW19ApGRg1ahQCAgKwa9cuuLi4QFKJHhoREZGpmjZtGrp164a6deviwYMH2Lp1KxITE7F3714AQFRUFNLS0hAbGwsAiImJQYMGDeDl5YX8/HzExcUhISEBCQkJWl1Xp2Tg2rVr2L59u7IZg4iI6IVipHcT3L59G4MGDUJ6ejpsbW3h6+uLvXv3olOnTgCA9PR0pKamKo/Pz8/HpEmTkJaWBisrK3h5eWHXrl0ICwvT6ro6DSDs0KEDJk+ejK5du2p7aqk4gJCMgQMI9cMBhLrjAEL9GHwAYevpgtX1+Od5gtVlKDq1DIwbNw7vv/8+MjIy4OPjAwsLC5X9vr6+ggRHREREhqdTy4CZWcmMViKRQKFQ6DyAkC0DZAxsGdAPWwZ0x5YB/Ri8ZaDNTMHqenx0jmB1GYpOLQPXr18XOg4iIqLKQ2QD43VKBurXry90HERERJWHkQYQGoteLyq6ePEiUlNTkZ+fr1Les2dPvYIiIiKiiqNT6vPnn3+iSZMm8Pb2Rvfu3dG7d2/07t0bffr0QZ8+fYSOUSvxW75Ct84d0NzPB2/074vTp5LLPD755K94o39fNPfzQViXUHwdv6XEMT/u34c+PcIQ0NQbfXqE4acfD6jsX7tmFd4M74fA5n5oHxyICePew1/X/1Q5JjsrCzOnTUXH9m3Q0r8JRo8Yhhs3/tL7fvVhjGel6XX//OMPjB8zCq1b+iOwuR/eHhiO9Fu3lPu3fx2PYUMGIahFMzTx8sD9+/d1eAKGFb/1K4R16YAWzXwwMFyz5zswvC9aNPNB966h2Kb2fBO2f42hg99EcFBzBAc1x8h3h+D8+XMqxxQWFmLpks8R1q
UDWvr7onvXUKxasbTEKqGm6PuEeAzu1w2vtW+OMUPfwPkzpzU678K5FHQLbobREeEq5bu/S0Dk6CHo16UN+nVpgynjR+DyxfOGCN3ovtu+FW/27oouwf4YOTgc51JOlXrs+TOnMW74IPTu1AZd2wYgIrwHtm2JLfX4g/v3oENLH8z8YLwhQjceAV9UZAp0ivJ///sf3NzccPv2bVhbW+PChQs4fPgwAgICkJiYKHCImtu7ZzeiP5Vh+IjRiN++A82a+eO9kcNVPkT+6++/b2LM6BFo1swf8dt34N3hozD/k3n4cf8+5TFnz6Rg8qSJeK1nL2z75ju81rMXJr8/AefOnVUek3zyVwwY+BY2bfkaq9asR2FREUYNH4ZHjx4BeLp61ITxY/D33zcR88VyxG//Fi6utTFy2FDlMRXNWM9Kk+veTE3FkEFvws3NHV9u2IRt3+zEiFHvwfI/y3k+efIYQa2DMWz4KAM8Hf3t27MbCz6V4d3ho7F12w74NfPHmFHDkZ7+/Oeb9vdNjH1vBPya+WPrth0Y9u4ozJfNw48H/n2+ySd/Qdew7lizLhaxcVvh7OyC0SPewe3bt5XHrF+7Btu/3oqp02bhm527MSHyA2xcvxZbvtpk8Hs2pMQf92Ll4mgMjBiO5Rvi4d2kGWa8/x4yM9LLPO9h3gMs+HgG/PxblNh3LiUZIR27IfqLL/H5qk1wdHLGtAmjkXXn9nNqMl2HDuzFss/n462hw7E6dht8mvpj6sTRuF3Ks6tqZYXerw/E56s2YMPW7/D20BFYv3Ipfvh2W4ljM9JvYeWShfBp2szQt1HxzCTCbSZAp9kENWvWxMGDB+Hr6wtbW1v8+uuv8PDwwMGDB/H+++8jJSVF60CEmE3w1hv94dm4MWbM+khZ1rtHN4R06Ij/TXy/xPGff7YASYkHseP7PcqyOR/NwtUrV7BpczwA4IP3J+BhXh6Wr/pSeczoEcNgY2OL+QsXPTeOnJwchAQHYt3GOPgHNMdff11Hr+5dkfDdD2jU6GUAT19NGRIchAmRk9D39f7637yWjPWsNLnu5EkTUaVKFXzy6YJy7+Pkr7/g3aGDceT4SdjY2Gj5FAw3m+Dtgf3h6dkY0/9zn33+uc/xz3m+MYsWIOnQQXz7n+c796NZuHr1CmK/in/uNYqKitA2qDmmTpuFHr16AwDGvTcSDg4OmD3nE+Vx708Yh6pVq2KeBs9TWxU1m2D8u2+hkYcnxn8wQ1n27sDeCGobgndG/6/U8z6ZORm169aDmZk5jh05hBUbvy712KKiIrzeJRjvvR+FTt16lHqcUCpqNsF777yJlz08MXHKv6PjhwzoidZtO2D4mAka1TFrygRUrWqFaR/9+6a9oqIiTBw1FF179Mb5M6eQ9+AB5ixYInT4pTL4bIIQ4WYAPD4k3MwEQ9Hpt7GoqAgvvfQSgKeJwa1/vtXVr18fV65cES46LRTk5+PSxQsIDGqjUh4Y1Bpnzzw/OTl39gwCg1qrlAW1DsbFC7+hoKDg6TFnzpSoM6h1cKl1AkDegwcAABtbW2VsACC1/Pebrbm5OSwsLJByuvTmOkMx1rPS5LrFxcU4kpSI+vUbYNTwYWgfHIi33uiPgz/9qPsNV7CCguffZ6ug1jh7tvTn26qc56vuyZPHKCwshO0/v2cA4NfMH7/8cgI3/no64+fK5ctIOX0Kbdq20+eWjKqgoADXrlyCfwvV97n7twjExfNnSzkL2PfDDqSn/Y2339Gs9Uj+5AkKCwtRXYeksrIqKCjA1csXEdAySKU8oEUQLpw/o1Ed165cwoVzZ9CkWYBK+aa1K2Frb4+wnn2FCrdyEVk3gU4DCL29vXHu3Dm4u7ujZcuWiI6OhqWlJVavXg13d3ehY9RI7t1cFBUVwcHBQaXcwaEmsrLuPPecrKwsODjUVDveAYWFhbh7Nxe1ajn+c4x6nQ6l1qlQKLAwWga/Zv54+eVXAAAN3Nzh6lobS2I+w8wPP4aVlR
ViN25AVtYd3Lnz/HoMyVjPSpPr5mRn49GjR1i3dg3GjpuACZGT8PPRI4j831h8uT4WAc1LNvdWNrm5T++zhpbPN0j
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Heatmap of the absolute train/test differences in the feature statistics\n",
"feature_stat_gap = (A_train.iloc[1:] - A_test.iloc[1:]).abs()\n",
"sns.heatmap(feature_stat_gap, annot=True, cmap='Blues')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2f5fcc38-f866-4350-b5d1-ba27914a505b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MOID</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.000081</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>-0.000106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.000600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>0.000700</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>0.099600</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MOID\n",
"mean 0.000081\n",
"std -0.000106\n",
"min 0.000000\n",
"25% 0.000000\n",
"50% 0.000600\n",
"75% 0.000700\n",
"max 0.099600"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same train-minus-test comparison for the MOID target (count row skipped)\n",
"B_train.iloc[1:].sub(B_test.iloc[1:])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "62315497-b84c-4466-a35c-042144607977",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: >"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgwAAAGdCAYAAAB+VCt0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy81sbWrAAAACXBIWXMAAA9hAAAPYQGoP6dpAABHFElEQVR4nO3deVRU5f8H8PewS4kQyKIRAZaiorKkgWFZionhrliGuISikgqaiLiUpuSSmbmimNqilKj5TVwwxRU1CXEBc0MxBVndlWWY3x/+nJq56CxeHBjer849J5957jOfmXOUD59nuRKZTCYDERER0VMY6DoAIiIiqvmYMBAREZFKTBiIiIhIJSYMREREpBITBiIiIlKJCQMRERGpxISBiIiIVGLCQERERCoxYSAiIiKVjHQdwGMPK3QdARER1RZm1fzTq55HuGhjPUhfLNpYulRjEgYiIqIaQ8ICvDJ+I0RERKQSKwxERETKJBJdR1DjMGEgIiJSxikJASYMREREylhhEGAKRURERCqxwkBERKSMUxICTBiIiIiUcUpCgCkUERERqaR1haGyshIXLlxAfn4+KisrFV7r0KHDMwdGRESkM5ySENAqYThy5Ag++ugjXLlyBTKZTOE1iUQCqVQqSnBEREQ6wSkJAa0ShrCwMHh7e2Pbtm1wcHCAhF8sERGRXtMqYTh//jw2btyIJk2aiB0PERGR7nFKQkCrb6Rdu3a4cOGC2LEQERHVDBKJeJee0KrC8Omnn2L8+PHIy8uDu7s7jI2NFV5v1aqVKMERERFRzSCRKa9aVIOBgbAwIZFIIJPJtF70+LBC41uIiKiOMqvmU4TqvTVVtLEeHJwp2li6pNVXnp2dLXYcRERENYceTSWIRauEwcnJSew4iIiIag4uehR4pqJOZmYmcnJyUFZWptDevXv3ZwqKiIiIahatEoZLly6hV69eOHXqlHztAgD5eQw8uIn0XUVFBZYv+Q7btv0PRYWFsGnYEN179MLwsFFVrvEBgIKCfHw9dw4yM08j58oVfDQwGBOjY0SJJ/f6dcz+cgaOHTsCM1NTdO0WiPETJsLYxAQAcO3aPwjwf09w39LlK9HejyezEgmwwiCgVcIwduxYODs7Y/fu3XBxccGxY8dQVFSE8ePHY/78+WLHSFTjfB+/Er/+sgEzZ8+Ba5MmyDx9GtOmRKN+/foYGBxS5T1lZWWweskKocNH4od1a0SLRSqVInzUCFhZWWHNDz/j1s2bmDI5CjKZDNExigu34uLXwNX13/NTGjRoIFocRHrFgGsYlGmVMKSmpmLPnj1o2LAhDAwMYGBggLfeeguxsbEYM2YM0tPTxY6TqEbJyDiBd959Dx3efgcA0Ljxy9ietA1nzpx+4j2NG7+MqOgpAIAtmxOf2G/L5kSsWb0K1/75B40aN8ZHA4MR9OHAJ/ZPPXwQly5ewM4/UmBrawcAGP/ZJEyLmYRPx0bgxRdflPdt0MASNg0bavJRiYgAaHlwk1Qqlf8jZGNjg+vXrwN4tBjy77//Fi86ohrKw8MLx44cweXLj3YM/X32LNLT0+Dn9/YzjZv46y9Y/O03CB8Tgc3/S8KnYyOx5LtF2Lpl8xPvyThxAk2avCZPFgCgffu3UFZWhkylBGZs+Ei84+eDkIEDkLxzxzPFSqTXJAbiXXpCqwpDy5YtcfLkSbi4uKBdu3aYO3cuTExMEBcXBxcXF7FjJKpxhn4Sirt376DnB11haGgIqVSKT8dGoGu3D55p3LjlSzH+s0no1NkfAPDyy464dPECNv6agO49e1V5T1FhIV6ysVFos2jQAMbGxigqLAQAmJubY8LEaLTx8ISBgQQpe/dg4oQIzCwrxQeBPZ4pZiK9xG2VAlolDFOmTMG9e/cAAF9++SU++OAD+Pn5wdraGgkJCSrvLy0tRWlpqUKbzNAUpqam2oRD9Nzt2J6Ebb9vRezcr9GkSROcPZuFeV/FomFD2yf+YFeluL
gYeXm5+HxaDL6Y/u/aA6m0Ai/Wrw8AGDXiE/yVlgYAcGjUCJu3bgMASCD8x00mg/wfPSurlxAcMlj+WouW7rh9+zbWrF7FhIGI1KJVwtClSxf5/7u4uCAzMxPFxcWwsrJS68mVsbGx+OKLLxTaYqZOx5Rpn2sTDtFz983XczF02HB0DegGAHjt9abIvX4d8atWaJ0wyCorAQDTvpgJd/fWCq8ZGD4qa06fMQulDx8CAIyMH/31tbaxwamTGQr9b9+6hYqKclhbWz/x/Vq1bo3Nib9qFSuR3tOjqQSxPNM5DBcuXMDFixfRoUMHvPTSS1D3lOno6GhERkYqtMkMWV2g2uPhg4cwUFpFbWhoiMpKjU9al7O2sYGtnR3+uXoV3T6o+iwTOzs7QVvrNm2wKm45Cgry0bChLQDg8OFDMDExQfMWLZ/4fmezsrgAkuhJOCUhoFXCUFRUhP79+2Pv3r2QSCQ4f/48XFxc8Mknn8DS0hJff/31U+83NRVOP/BZElSbvP1OR6yMWw57h0ZwbdIEZ7Oy8MPa79GjVx95n2+/+Rr5+TcwK3auvO1sVhYA4P79eygpKcbZrCwYGxvD9f8fFT9y1KeYE/slXnzxRbT364DysjKcOXMat2/dxqDBQ6qMxcf3Lbi4NkHMpImImDARt2/dwoL5c9C7b3/54uStWzbDyMgIzdyaw8BAgn0pe/Hzjz9gXOSE6vqKiEjPaPXwqUGDBiE/Px+rVq2Cm5sbMjIy4OLigl27diEiIgJnzpzROBAmDFSb3Lt3F0sWfYs9f+xGcXERGtraomvXbhgxcrT8sKSpkyfh+vVriF/zg/y+1i2aCsZq1Kgxtifvkf856ff/Yc338bh08QLq1TPHa6+/joHBIXivU+cnxpN7/TpmffkF/jx6BKamZuja7QOM/ywKJv8fy9Ytm/F9/Epcz70OQwMDOL36KgYGh3D9AtVa1f7wKf95oo31YNdnoo2lS1olDPb29ti5cydat26N+vXryxOG7OxsuLu74+7duxoHwoSBiIjUVe0JQxfxDiF8sFM/KnlafeX37t2Dubm5oL2wsJA7HYiIqPbjokcBrb6RDh06YN26dfI/SyQSVFZWYt68eejYsaNowREREVHNoFWFYd68eXjnnXdw/PhxlJWVYeLEiThz5gyKi4tx6NAhsWMkIiJ6vrhLQkCrCkPz5s2RkZGBtm3bonPnzrh37x569+6N9PR0uLq6ih0jERHR88WjoQW0XjZiZWWFbt264Y033kDl/x848+effwIAuneveg85ERER1U5aJQw7duzAoEGDUFRUJDisSSKRQCqVihIcERGRTnBKQkCrWkl4eDj69euH69evo7KyUuFiskBERLUepyQEtPok+fn5iIyMrPKYWiIiItI/WiUMffv2RUpKisihEBER1RCsMAhoddLj/fv30a9fPzRs2BDu7u4wNjZWeH3MmDEaB8KTHomISF3VftJj92WijfVg60jRxtIlrb7yn3/+GTt37kS9evWQkpKi8EhriUSiVcJARERENZdWCcOUKVMwY8YMTJo0CQYG+lNuISIiAqBXUwli0SphKCsrQ1BQEJMFIiLST9xWKaDVT/yQkBAkJCSIHQsREVHNwEWPAlpVGKRSKebOnYudO3eiVatWgkWPCxYsECU4IiIiqhm0ShhOnToFDw8PAMDp06cVXpOwjENERLUdf5YJaJUw7N27V+w4iIiIagz+8iukP5MrREREVG2q+egLIiKi2ocVBiEmDERERMqYLwhwSoKIiIhUYoWBiIhICackhJgwEBERKWHCIMQpCSIiIlKJFQYiIiIlrDAIMWEgIiJSwoRBiAkDERGRMuYLAlzDQERERCqxwkBERKSEUxJCTBiIiIiUMGEQ4pQEERERqcQKAxERkRJWGISYMBARESlhwiDEKQkiIiJSiRUGIiIiZSwwCDBhICIiUsIpCSFOSRAREZFKTBiIiIiUSCQS0S5NLV26FM7OzjAzM4OXlx
cOHDjw1P779u2Dl5cXzMzM4OLiguXLlwv6LFy4EE2bNkW9evXg6OiIiIgIPHz4UKO4mDAQEREp0VXCkJCQgHHjxiE
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Heatmap of the absolute train/test differences in the MOID statistics\n",
"target_stat_gap = (B_train.iloc[1:] - B_test.iloc[1:]).abs()\n",
"sns.heatmap(target_stat_gap, annot=True, cmap='Blues')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "dfeb6da7-8916-4bf2-9e1e-8e912ced27e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a : distance: 0.004706798228196085 p-value: 0.2274892259071406\n",
"e : distance: 0.0034702911257064317 p-value: 0.5959712249062891\n",
"i : distance: 0.001685299119189354 p-value: 0.9990253593864518\n",
"om : distance: 0.004452073316022598 p-value: 0.2855013975223749\n",
"w : distance: 0.0037016791405732885 p-value: 0.5122989577027215\n"
]
}
],
"source": [
"# Two-sample Kolmogorov-Smirnov test: check that the training and test sets are\n",
"# drawn from the same distribution (null hypothesis: the distributions are equal).\n",
"# Significance level alpha = 0.01: the null hypothesis is rejected in favour of\n",
"# the alternative only if the p-value is below 0.01.\n",
"# This cell covers the features a, e, i, om, w; MOID is tested in the next cells.\n",
"# Each test is run once per feature, and both the statistic and the p-value are\n",
"# read from that single result instead of repeating the computation.\n",
"ks_results = [\n",
"    ks_2samp(df_train[column], df_test[column]) for column in df_train.columns\n",
"]\n",
"distances = [result.statistic for result in ks_results]\n",
"pvalues = [result.pvalue for result in ks_results]\n",
"\n",
"for column, distance, pvalue in zip(df_train.columns, distances, pvalues):\n",
"    print(column, \": distance:\", distance, \"p-value: \", pvalue)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "5675a84e-9960-440b-af30-188b39c29c3d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MOID statistic 0.0029887162568097425\n"
]
}
],
"source": [
"# KS distance between the training and test MOID distributions\n",
"moid_ks_statistic = ks_2samp(ydf_train['MOID'], ydf_test['MOID']).statistic\n",
"print(\"MOID statistic\", moid_ks_statistic)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "120d7bfb-620f-42f2-8515-60dc802b8e4b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MOID p-value 0.7731305746585694\n"
]
}
],
"source": [
"# p-value of the KS test between the training and test MOID distributions\n",
"moid_ks_pvalue = ks_2samp(ydf_train['MOID'], ydf_test['MOID']).pvalue\n",
"print(\"MOID p-value\", moid_ks_pvalue)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b666430a-ce44-426e-84d1-ef54d56ba2fb",
"metadata": {},
"outputs": [],
"source": [
"#Since the p-values are all greater than the threshold alpha=0.01, we cannot reject the null hypothesis.\n",
"#Thus, the data are consistent with the training and test sets being drawn from the same distribution (there is no significant evidence in favour of the alternative hypothesis)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "147defd4-1659-45f8-8a08-bf95049889e3",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n",
" with pd.option_context('mode.use_inf_as_na', True):\n",
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n",
" data_subset = grouped_data.get_group(pd_key)\n",
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n",
" data_subset = grouped_data.get_group(pd_key)\n"
]
},
{
"data": {
"text/plain": [
"<Axes: xlabel='a', ylabel='Proportion'>"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAG2CAYAAACDLKdOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy81sbWrAAAACXBIWXMAAA9hAAAPYQGoP6dpAABRv0lEQVR4nO3dd3wUdeLG8c9m0zskISQQktBDhwSQdkgLRVGsgAVQRDkLAjY4ThT0xIYFEFRAOD1P+ClFVASC0gSkBkWa9FCCIZQkJJCyO78/ojkxoYUkk90879dr79jJzO4zjNl9+E6zGIZhICIiIuIkXMwOICIiIlKSVG5ERETEqajciIiIiFNRuRERERGnonIjIiIiTkXlRkRERJyKyo2IiIg4FZUbERERcSoqNyIiIuJUVG5ERETEqZhablavXk3v3r0JDw/HYrGwcOHCKy6zatUqYmNj8fT0pGbNmrz//vulH1REREQchqnlJjMzk6ZNmzJlypSrmv/gwYP06tWLDh06kJiYyD/+8Q+GDRvGvHnzSjmpiIiIOApLeblxpsViYcGCBfTp0+eS8zz33HMsWrSIXbt2FUwbOnQoP/30E+vXry+DlCIiIlLeuZod4FqsX7+e+Pj4i6Z1796dmTNnkpubi5ubW6FlsrOzyc7OLnhut9s5ffo0QUFBWCyWUs8sIiIi188wDDIyMggPD8fF5fI7nhyq3Jw4cYLQ0NCLpoWGhpKXl0dqaiphYWGFlpkwYQLjxo0rq4giIiJSio4cOUL16tUvO49DlRug0GjLH3vVLjUKM3r0aEaOHFnwPC0tjRo1anDkyBH8/f1LL6iIiMg1MgyDtHPnOZp8nOTkExw9kczxM5kcyTDYmBlKjmG9rtd3JQ8fzuPjkoO3cR4/d/C1peHn44Mv5/Hxr4yPuxVvMvH2DSQPV/w8XPBwt+Kadx6rZwCurq5YrS5YXV2xulixYsfFOwCXwBpYrRasFguRQT64Wkv2sN709HQiIiLw8/O7ivV0IFWrVuXEiRMXTUtJScHV1ZWgoKAil/Hw8MDDw6PQdH9/f5UbEREpc3k2O8fOZHHo6HGSjh7mSMoZDp/J5lC6wdEcHzINzz/NXfV/f3T/31lAwZylunsmIR55+Hu44GdPwz+gMn4eLvh5eeLrZsc3MBh/L3f8Ayrj7+OFn38Ant7+WNy9wYEPy7iaQ0ocqty0adOGr7766qJpy5YtIy4ursjjbURERMyQa7Nz7Mx5Dp1I5dCRo+w/cYZDp7JIOgfHLniSx59HYHx+f/xPCGep7ppGhGdW/v/7QvVK3oRH1aVatRp4VusMbl5luk6OxNRyc+7cOfbt21fw/ODBg2zbto3KlStTo0YNRo8ezbFjx/j444+B/DOjpkyZwsiRIxkyZAjr169n5syZfPbZZ2atgoiIVFB2u0Fy+gUOp2Zy8MRpDh9J4kBKGgfSDJKyPMi76GorFv5cYDzIIdLyGzXcz1Hd1yAqwJXI8FBqRNWiWngNPAOrgMv17YKqyEwtN5s3b6ZTp04Fz/84NmbgwIHMnj2b5ORkkpKSCn4eHR3N4sWLGTFiBO+99x7h4eFMmjSJO+64o8yzi4iI87PZDZLTzpN0KouDJ8/lj8ScSOXwmQscOmclx/7X40o8//SnbGpYUohyPU0tnwtEB7pSIySAiGrVCYuqh0twL3AtfNiEXL9yc52bspKenk5AQABpaWmXPebGZrORm5tbhsnkStzd3a94+p+IyLXKuJDL4VNZJJ3O4uhvpzj02ymOnDpHUloux7Os5BqX/txxI48ISwqRlt+I8sggupI70UFe1AoLomp0Q1xCG4B35TJcG+d1td/f4GDH3JQFwzA4ceIEZ8+eNTuK/IWLiwvR0dG4u7ubHUVEHIzdbnDs7Hn2JZ9m36HD7Dtxlv2ncziUDqm5l/pMcfv9f/OobjlJlOUEUZYTRPvkUqOSJzWr+BNeoxauoY0h+A6VmHJE5eYv/i
g2VapUwdvbWxf6KyfsdjvHjx8nOTmZGjVqaLuISJGycvL49fhp9h4+wsED+ziUZnAg3eDQeS8uGH898eR/pSaINGpYUoiwpFDDPZ1IX4OISl5EhPhTNagS1qBaENwRAiPBVf/AKu9Ubv7EZrMVFJtLnVou5gkJCeH48ePk5eXp7DgRIS0rh+17fmXH/sPsPHaWX05bOJjth73gQF7fi+Z3I4+aluPUdkmmtutJaoX6UzM0kMjwMPz8/KFyRwiuC+4+hd9MHIrKzZ/8cYyNt7e3yUmkKH/sjrLZbCo3IhVMWlYOO/YdInHPAX5KSmXnWVeO5v75Ym6BBX8K5iz1rMep6ZNNpOtpakZGE10tlIiwqrhWuQF8qzj0dV7kylRuiqBdHuWTtotIxWCzG/x64CCJP//ElsNn2HrajYO5fz6epVLBnyIsKTT2PEVMZYPGEZWJiY4ktGZXFZgKTuVGRERMlXEhl59+PUji9u1sOZLOljQ/Mgwv8o+J+d/9BKtbUmjq8RvNqrjSKKoKMbXrEBjVCdw12i4XU7kREZEyYxgG+48cJ3HnHrYcOMG2k7DnvB8GLuRfIyb/OjG+nKeJ52+0CDGIrRVOs5h6VKrWTQfzylVRuamADh06RHR0NImJiTRr1szsOCLixGw2O7t+2cqm3QfZcCSLjae9OW3/Y6QloGC+6pYUmnunEhvuRWxMHRo07ojVL8Sc0OLwVG5ERKTEXDh3lp9+3sbmvUfZePQ8W89V+n0Xk/fvj/wr9zZxO07zShdoUd2X5nWjqVL3Vl0nRkqMyo0D++KLLxg3bhz79u3D29ub5s2b8+WXX+Lj48OsWbN4/fXXOXjwIFFRUQwbNoxHH30UyL+NBUDz5s0B6NixIytXrjRrNUTEgWWkp/Hj+jVs3HWADae92ZkTQh6u5B/0m3/grw/nifM4QqvgXG6IiaZxbFvcK1UzNbc4N5UbB5WcnEz//v15/fXXue2228jIyGDNmjUYhsH06dN54YUXmDJlCs2bNycxMZEhQ4bg4+PDwIED2bhxI61atWL58uU0bNhQV/wVkatm2O38ujORlVt2sDIpm42ZodiwArUK5glxSSfO7yytw92Ia1Sf+g1vxNVT146RsqNy46CSk5PJy8vj9ttvJzIyEoDGjRsD8NJLLzFx4kRuv/12IH+kZufOnXzwwQcMHDiQkJD8/dhBQUFUrVrVnBUQEYeRnnGOHzes5fvth1l10odkeyDwvwudRltPckNwNq2rexLb+m9Uj4jSpRvEVCo3Dqpp06Z06dKFxo0b0717d+Lj47nzzjvJy8vjyJEjDB48mCFDhhTMn5eXR0BAwGVeUUQkn91usHPvPlZs3MqqQ1kkZgb9PjqTvyvJk2xa+56kU5QHnVo2J7JuL11TRsoVlRsHZbVaSUhIYN26dSxbtozJkyczZswYvvrqKwCmT59O69atCy0jInIpe/bt59OEH1ly1EqKzY/82xfk38IgynqSG4PPcWPTWtxwQzye3r6XfS0RM6ncODCLxUK7du1o164dY8eOJTIykrVr11KtWjUOHDjAvffeW+Ryf76NgYhUbOeyzjNn6SpW7jjKD+fC+OM2Bt5coJ3PUTpFedEhtikR9XuCi8tlX0ukvFC5cVAbNmzgu+++Iz4+nipVqrBhwwZOnjxJTEwML774IsOGDcPf35+ePXuSnZ3N5s2bOXPmDCNHjqRKlSp4eXmxZMkSqlevjqenp3ZZiVQght1OYuIG5qz+ha9/CyILDyAMgE7eB+hZvxK39uiBh3+wuUFFiknlxkH5+/uzevVq3nnnHdLT04mMjGTixIn07NkTyL/55xtvvMGzzz6Lj48PjRs3Zvjw4QC4uroyadIkxo8fz9ixY+nQoYNOBRepADJOHmHhN98wZ6+FHbbqQDgA0S4p9AzP5I7ObajV4CZzQ4qUAIthGIbZIcpSeno6AQEBpKWl4e/vf9HPLly4wMGDB4mOjsbT09OkhHIp2j4ixWDL5Z
e13/Dpun18dTaKc3gB4E4uN1c+Rr8batGybRcsrm4mBxW5vMt9f/+VRm5ERJyQ7UIm3y1byIxNp9mYWxOIAaCm2xn
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Example: overlay the empirical CDFs of the feature `a` for the two splits\n",
"feature_name = 'a'\n",
"train_values = df_train[feature_name]\n",
"test_values = df_test[feature_name]\n",
"# Long-format table: one row per object, labelled with the split it belongs to\n",
"dataframe = pd.DataFrame({\n",
"    feature_name: np.concatenate([train_values, test_values]),\n",
"    'set': ['training'] * len(train_values) + ['test'] * len(test_values),\n",
"})\n",
"sns.ecdfplot(data=dataframe, x=feature_name, hue='set')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "c504c1d7-b968-4cec-b69a-f296511fb5fa",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n",
" with pd.option_context('mode.use_inf_as_na', True):\n",
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n",
" data_subset = grouped_data.get_group(pd_key)\n",
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n",
" data_subset = grouped_data.get_group(pd_key)\n"
]
},
{
"data": {
"text/plain": [
"<Axes: xlabel='MOID', ylabel='Proportion'>"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAG2CAYAAACDLKdOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy81sbWrAAAACXBIWXMAAA9hAAAPYQGoP6dpAABOpElEQVR4nO3dd3wUdeLG8c/sbiqQAAFCD6FpkCZBEJATpAkcHuhJUSmCeIhKEwREQdCTs54Cgo2i90PlVFQUBIInHUGaBYKUgEFICKEkgZC2md8fkdWYUBKSzGb3eb9eudfs7Mzuszcm+/CdZpimaSIiIiLiIWxWBxAREREpSio3IiIi4lFUbkRERMSjqNyIiIiIR1G5EREREY+iciMiIiIeReVGREREPIrKjYiIiHgUlRsRERHxKCo3IiIi4lEsLTfr16+nV69eVK9eHcMw+Oyzz664zrp164iMjMTf35+6devyxhtvFH9QERERKTUsLTfnz5+nWbNmzJkz56qWP3z4MD169KB9+/bs2rWLJ554glGjRvHJJ58Uc1IREREpLQx3uXGmYRh8+umn9O7d+5LLTJw4kWXLlhEdHe2aN2LECL7//nu2bNlSAilFRETE3TmsDlAQW7ZsoWvXrrnmdevWjfnz55OZmYmPj0+eddLT00lPT3c9zs7O5vTp04SEhGAYRrFnFhERkWtnmiYpKSlUr14dm+3yO55KVbmJj48nNDQ017zQ0FCysrJITEykWrVqedaZOXMm06dPL6mIIiIiUoyOHj1KzZo1L7tMqSo3QJ7Rlot71S41CjN58mTGjRvnepyUlETt2rU5evQoQUFBxRdURKQEpGU6OZGcRuLZc5w8fYrUcykcSUwhJTWdmLNOguzp7DztS1WfNH5OK48/aaThX+y5ynABXyOLc6Y/WTgI9z2Lj+HExzDxM7LxsZk4jGwcNhMfm8HR9DI0KJOKw2bk/GQkYw8M5kSGL1X9swlwmNhtNuw2A1tGMvbA8jgAm83ImWcYGAY50zYDu2HDZjOwGbjWIzMVw7cchmFg2Axs8Ns0gIHNsGHYwMDA+G19DMP1+I/PGWYmht0PwwADyMgGf8fF76E/fB8ZOevk8tv31R//F8MGwTXhMiMSl9vZkOc9rrDOJecX9HUuMTOsYhkc9qI9rDc5OZlatWpRrly5Ky5bqspN1apViY+PzzUvISEBh8NBSEhIvuv4+fnh5+eXZ35QUJDKjYi4rUxnNnFnUvkl7gRnT5/k+8PxJJxNIe6cyb7UsqRk++FHBun45rO2328/v3FAigk2P8gg0HUmiYMssnBQx4gnzqzIjY4jlA3wp5wvnDXLULtMNuWNc6T5V6JusI0smz/Vgv3wdxgY/kFUKOOHv68f/r4O/APL4e/vj69fIDYf38t/E4tcg6s5pKRUlZs2bdrwxRdf5Jq3evVqWrZsme/xNiIi7io72+TY2QscjD/Lvl+O8fPxMyScSeLkBYMD5wPyWcMHqJhrzsVi40sGGfgSbsThTzrXByTj72vH38dBOT8Hdcvb8M1Op2q1GgSWDaZi+WCCKobiX6Yc+JcHR34FSaT0srTcnDt3joMHD7oeHz58mN27d1OxYkVq167N5MmTOXbsGO+99x6Qc2bUnDlzGDduHMOHD2fLli3Mnz+fDz74wKqPICJySaZpkngug++PnuWHX06y95fjxJxMIeb8pXYLBeY7t7KRxE0BcVTwyaJSgEFIUCB1KvhRvXZ9KleuQlBweQy/suBb7rK7NUS8haXlZvv27XTs2NH1+OKxMYMHD2bRokXExcURGxvrej48PJwVK1YwduxYXn/9dapXr86sWbO46667Sjy7iMgfnUvPYnfsWTbsO8a3B46TnXaeH5PzG4HJW2yqcoravsk0CEyloX8STeqHUaNSBaqEXYdRJQLsGpkWKQi3uc5NSUlOTiY4OJikpCQdcyMihRKflMaPx5L4eMdR9h87xeGzWVdcpxJnif
CJp2HZdCID4mhYNYg6ES1xVG8KFevqGBULOJ1OMjMzrY4hf+Dr63vJ07wL8v1dqo65EREpaRlZ2Ww4cJLlPxxn/f4TJJ53XnJZB1kEkk5nnx8JL5NJpwblCK8UREDNG6BKSyhbRSXGDZimSXx8PGfPnrU6ivyJzWYjPDwcX99rOw5M5UZE5A+SLmSy4odjfLXjIClJZ9iVlP9xMAC32nZTw3aadv6HualuKFUi2kHD7lCmbwkmloK6WGyqVKlCYGCgLujqJrKzszl+/DhxcXHUrl37mraLyo2IeLWUtEyW7T7O17t+Zn1sBlnmH/+g5hSbCqRQwUihtW0fN1dKo0NtH4JrRkCVv0D1G8GvrDXhpcCcTqer2FzqEiJincqVK3P8+HGysrKu6SxolRsR8SrObJP1+0+yaONB1h0886dnfy82EcYR/lr2AD2bViOsenWMWrdApaFgs5dsYClSF4+xCQy89IicWOfi7iin06lyIyJyOSeS03hnQwwfbTvC2fT8z6FoYsTQo+Jx/nZdINWbdYIaXXT9Fw+mXVHuqai2i8qNiHikXbFnePN/0eyIiedkRt6rlFcghYH2KHpWS6Jhp/sx6g8Hn+K/LYGIFD+VGxHxGNsOn+bFL3fx3bG0P8z9vdh0sP9A/0oxdKoXhE/kvVC9n85eEvFAKjciUqr9HJ/C88u287+Y1DzPVeUUt9u/Y2DtM9TrOBDqjgNH3lEcEU905MgRwsPD2bVrF82bN7c6TolSuRGRUufUuXSeW76HT3bF5XmuOoncW+kQ/RuYhLTsDdUHanRGxMvoJiQiUiqYpsmHW4/Q5KkviXx2Ta5iE0AaDwauY2fbb9k8pRsPj59ByN+egRqRKjZS6n388cc0adKEgIAAQkJC6Ny5M+fPnwdg4cKFRERE4O/vz/XXX8/cuXNd64WHhwNw4403YhgGHTp0sCK+JTRyIyJu7URyGlP+u5U1B8/9Nuf3stLbtpExTTKoc9tQCNU95sTzxMXFMWDAAF544QX69OlDSkoKGzZswDRN3n77baZNm8acOXO48cYb2bVrF8OHD6dMmTIMHjyYbdu20apVK9asWcMNN9xwzVf9LU1UbkTELX31wzGe+Gg7ZzJz/5m63viFCRU2cluvezEaPgN2/RkTzxUXF0dWVhZ33nknYWFhADRp0gSAZ555hpdffpk777wTyBmp2bt3L2+++SaDBw+mcuXKAISEhFC1alVrPoBF9FdBRNxGljObV5dvZ87mk7/N+f1P1FjHRwy99XrK3foo+I60JqBICWvWrBmdOnWiSZMmdOvWja5du/L3v/+drKwsjh49yrBhwxg+fLhr+aysLIKDgy1M7B5UbkTEcslpmYxbsIY1sdm55ofZEphe/yC39h6OUbGnRelErGO324mKimLz5s2sXr2a2bNnM2XKFL744gsA3n77bVq3bp1nHW+nciMilok9mcxD8//HnrO5L7Pewbabf7bKpEb3x8CvnEXpRNyDYRi0a9eOdu3aMXXqVMLCwti0aRM1atQgJiaGe++9N9/1/ngrA2+jciMiJe5gfBJD5q3m1/RA4Pdic3+FHxjftzNlwqdYF07EjWzdupWvv/6arl27UqVKFbZu3crJkyeJiIjg6aefZtSoUQQFBdG9e3fS09PZvn07Z86cYdy4cVSpUoWAgABWrlxJzZo18ff395pdVio3IlJifj6exOC31hKf5uDiHbdtZDOp0maGDXkAeyXtehL5o6CgINavX8+rr75KcnIyYWFhvPzyy3Tv3h3IuQHoiy++yOOPP06ZMmVo0qQJY8aMAcDhcDBr1ixmzJjB1KlTad++PWvXrrXuw5QgwzTN/O8i56GSk5MJDg4mKSmJoKAgq+OIeIWjp84zaN4aDp/7/d9TFUlmWsNY7rh7CEa5UAvTiTdJS0vj8OHDhIeH4++ve4m5m8ttn4J8f2vkRkSKTVpGFsNeX8GmE3Yu/rkJIYmpYT9xx4CRGOVrWhtQRDySyo
2IFDnTNPnnu5/zzj4f4PczN/5dexO9B47BKHePdeFExOOp3IhIkfr822hGfxbDHw8UfrLWj9x/3xDswTqmRkSKn8q
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Same ECDF overlay for the MOID target\n",
"feature_name = 'MOID'\n",
"train_values = ydf_train[feature_name]\n",
"test_values = ydf_test[feature_name]\n",
"# Long-format table: one row per object, labelled with the split it belongs to\n",
"dataframe = pd.DataFrame({\n",
"    feature_name: np.concatenate([train_values, test_values]),\n",
"    'set': ['training'] * len(train_values) + ['test'] * len(test_values),\n",
"})\n",
"sns.ecdfplot(data=dataframe, x=feature_name, hue='set')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "7d572c0d-7c08-4968-956a-b0bed21bbf63",
"metadata": {},
"outputs": [],
"source": [
"# Finally, extract the validation set (20%) from the training set.\n",
"# random_state pins the shuffle so that the saved splits are reproducible.\n",
"# NOTE: this overwrites X_train / y_train, so running the cell a second time\n",
"# without re-running the original split shrinks the training set again.\n",
"X_train, X_val, y_train, y_val = train_test_split(\n",
"    X_train, y_train, test_size=0.2, random_state=42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "115562d0-3da6-4872-a251-82ecb7a0ea76",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Size of training set: 600000\n",
"Size of validation set: 150000\n",
"Size of test set: 52376\n"
]
}
],
"source": [
"# Report how many objects ended up in each split\n",
"for split_label, split_array in (\n",
"    (\"Size of training set:\", X_train),\n",
"    (\"Size of validation set:\", X_val),\n",
"    (\"Size of test set:\", X_test),\n",
"):\n",
"    print(split_label, len(split_array))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "5bdc5d56-6917-4d79-b80f-6e746ad9bd3d",
"metadata": {},
"outputs": [],
"source": [
"# Save the split dataset for future use: one file per array\n",
"for target_path, split_array in (\n",
"    ('/home/unipi/v.vichi3/Desktop/X_train', X_train),\n",
"    ('/home/unipi/v.vichi3/Desktop/X_val', X_val),\n",
"    ('/home/unipi/v.vichi3/Desktop/X_test', X_test),\n",
"    ('/home/unipi/v.vichi3/Desktop/y_train', y_train),\n",
"    ('/home/unipi/v.vichi3/Desktop/y_val', y_val),\n",
"    ('/home/unipi/v.vichi3/Desktop/y_test', y_test),\n",
"):\n",
"    np.save(target_path, split_array)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}