You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
733 lines
124 KiB
Plaintext
733 lines
124 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "f1396231-0139-456f-96d8-6f18199e8e25",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import seaborn as sns\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import math\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"import scipy\n",
|
|
"from scipy.stats import ks_2samp"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "3521f41e-c9d2-42f3-b657-45634f3fd8ac",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df=pd.read_csv(r'/home/unipi/v.vichi3/Desktop/dataframe.csv')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "ab94adc5-854e-49cd-9a9f-bce8ffd4ee87",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>ID</th>\n",
|
|
" <th>a</th>\n",
|
|
" <th>e</th>\n",
|
|
" <th>i</th>\n",
|
|
" <th>om</th>\n",
|
|
" <th>w</th>\n",
|
|
" <th>ma</th>\n",
|
|
" <th>H-value</th>\n",
|
|
" <th>MOID</th>\n",
|
|
" <th>EPOCH</th>\n",
|
|
" <th>PHA</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>'0000001'</td>\n",
|
|
" <td>2.662454</td>\n",
|
|
" <td>0.533875</td>\n",
|
|
" <td>26.69427</td>\n",
|
|
" <td>19.950131</td>\n",
|
|
" <td>284.514731</td>\n",
|
|
" <td>14.514345</td>\n",
|
|
" <td>9.45</td>\n",
|
|
" <td>0.4615</td>\n",
|
|
" <td>59945.0</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>'0000002'</td>\n",
|
|
" <td>1.457965</td>\n",
|
|
" <td>0.222633</td>\n",
|
|
" <td>10.82898</td>\n",
|
|
" <td>209.642798</td>\n",
|
|
" <td>343.753667</td>\n",
|
|
" <td>338.406046</td>\n",
|
|
" <td>11.16</td>\n",
|
|
" <td>0.1356</td>\n",
|
|
" <td>59945.0</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>'0000003'</td>\n",
|
|
" <td>1.893819</td>\n",
|
|
" <td>0.538294</td>\n",
|
|
" <td>41.18978</td>\n",
|
|
" <td>286.606414</td>\n",
|
|
" <td>287.585635</td>\n",
|
|
" <td>83.688834</td>\n",
|
|
" <td>12.40</td>\n",
|
|
" <td>0.1162</td>\n",
|
|
" <td>59945.0</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>'0000004'</td>\n",
|
|
" <td>2.001212</td>\n",
|
|
" <td>0.448742</td>\n",
|
|
" <td>17.44771</td>\n",
|
|
" <td>214.338458</td>\n",
|
|
" <td>245.558687</td>\n",
|
|
" <td>15.816629</td>\n",
|
|
" <td>12.60</td>\n",
|
|
" <td>0.2447</td>\n",
|
|
" <td>59945.0</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>'0000005'</td>\n",
|
|
" <td>4.221445</td>\n",
|
|
" <td>0.713243</td>\n",
|
|
" <td>30.98228</td>\n",
|
|
" <td>206.677675</td>\n",
|
|
" <td>330.127884</td>\n",
|
|
" <td>117.375359</td>\n",
|
|
" <td>12.90</td>\n",
|
|
" <td>0.2592</td>\n",
|
|
" <td>59945.0</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" ID a e i om w \\\n",
|
|
"0 '0000001' 2.662454 0.533875 26.69427 19.950131 284.514731 \n",
|
|
"1 '0000002' 1.457965 0.222633 10.82898 209.642798 343.753667 \n",
|
|
"2 '0000003' 1.893819 0.538294 41.18978 286.606414 287.585635 \n",
|
|
"3 '0000004' 2.001212 0.448742 17.44771 214.338458 245.558687 \n",
|
|
"4 '0000005' 4.221445 0.713243 30.98228 206.677675 330.127884 \n",
|
|
"\n",
|
|
" ma H-value MOID EPOCH PHA \n",
|
|
"0 14.514345 9.45 0.4615 59945.0 0.0 \n",
|
|
"1 338.406046 11.16 0.1356 59945.0 0.0 \n",
|
|
"2 83.688834 12.40 0.1162 59945.0 0.0 \n",
|
|
"3 15.816629 12.60 0.2447 59945.0 0.0 \n",
|
|
"4 117.375359 12.90 0.2592 59945.0 0.0 "
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "21f6c17e-a521-455d-82c3-0f6aeea7df77",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"kep=df.iloc[:,1:6]\n",
|
|
"y=df.iloc[:,-3]\n",
|
|
"X=kep.to_numpy()\n",
|
|
"y=y.to_numpy()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "505356c7-da2b-4929-b65b-f775f835cc5a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#Split the data into training and test\n",
|
|
"X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=750000)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "9458e17f-873a-4355-964b-6c162eebd71e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df_train=pd.DataFrame(X_train, columns=['a','e','i','om','w'])\n",
|
|
"df_test=pd.DataFrame(X_test, columns=['a','e','i','om','w'])\n",
|
|
"ydf_train=pd.DataFrame(y_train, columns=['MOID'])\n",
|
|
"ydf_test=pd.DataFrame(y_test,columns=['MOID'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "32d4bbfb-26fa-4add-8246-e1c267667b28",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#Comparison of some statistical measures between training and test set \n",
|
|
"A_train=df_train.describe()\n",
|
|
"A_test=df_test.describe()\n",
|
|
"B_train=ydf_train.describe()\n",
|
|
"B_test=ydf_test.describe()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "af994684-c4b9-461f-8c0e-e13e0cd9b909",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>a</th>\n",
|
|
" <th>e</th>\n",
|
|
" <th>i</th>\n",
|
|
" <th>om</th>\n",
|
|
" <th>w</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.002937</td>\n",
|
|
" <td>-0.000607</td>\n",
|
|
" <td>0.027988</td>\n",
|
|
" <td>0.420302</td>\n",
|
|
" <td>0.343676</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>-0.001472</td>\n",
|
|
" <td>0.000403</td>\n",
|
|
" <td>0.064839</td>\n",
|
|
" <td>-0.227066</td>\n",
|
|
" <td>0.136723</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>-0.000002</td>\n",
|
|
" <td>-0.001717</td>\n",
|
|
" <td>-0.049321</td>\n",
|
|
" <td>-0.000011</td>\n",
|
|
" <td>-0.023646</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>0.005710</td>\n",
|
|
" <td>-0.000832</td>\n",
|
|
" <td>0.016489</td>\n",
|
|
" <td>1.228961</td>\n",
|
|
" <td>0.372439</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>0.009029</td>\n",
|
|
" <td>-0.000021</td>\n",
|
|
" <td>0.066847</td>\n",
|
|
" <td>0.206380</td>\n",
|
|
" <td>0.186781</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>0.000243</td>\n",
|
|
" <td>-0.000705</td>\n",
|
|
" <td>-0.010827</td>\n",
|
|
" <td>-0.018515</td>\n",
|
|
" <td>0.263608</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>0.027082</td>\n",
|
|
" <td>0.000222</td>\n",
|
|
" <td>3.584705</td>\n",
|
|
" <td>0.001342</td>\n",
|
|
" <td>0.022134</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" a e i om w\n",
|
|
"mean 0.002937 -0.000607 0.027988 0.420302 0.343676\n",
|
|
"std -0.001472 0.000403 0.064839 -0.227066 0.136723\n",
|
|
"min -0.000002 -0.001717 -0.049321 -0.000011 -0.023646\n",
|
|
"25% 0.005710 -0.000832 0.016489 1.228961 0.372439\n",
|
|
"50% 0.009029 -0.000021 0.066847 0.206380 0.186781\n",
|
|
"75% 0.000243 -0.000705 -0.010827 -0.018515 0.263608\n",
|
|
"max 0.027082 0.000222 3.584705 0.001342 0.022134"
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"A_train.iloc[1:,:]-A_test.iloc[1:,:] \n",
|
|
"#the first row of the describe method is the count of objects, so we ignore it"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "17336f68-0547-40f9-a003-a86422fa899e",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<Axes: >"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 2 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"sns.heatmap(np.absolute(A_train.iloc[1:,:]-A_test.iloc[1:,:]),annot=True,cmap='Blues')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "2f5fcc38-f866-4350-b5d1-ba27914a505b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>MOID</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.000081</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>-0.000106</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>0.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>0.000600</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>0.000700</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>0.099600</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" MOID\n",
|
|
"mean 0.000081\n",
|
|
"std -0.000106\n",
|
|
"min 0.000000\n",
|
|
"25% 0.000000\n",
|
|
"50% 0.000600\n",
|
|
"75% 0.000700\n",
|
|
"max 0.099600"
|
|
]
|
|
},
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"B_train.iloc[1:,:]-B_test.iloc[1:,:]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "62315497-b84c-4466-a35c-042144607977",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<Axes: >"
|
|
]
|
|
},
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 2 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"sns.heatmap(np.absolute(B_train.iloc[1:,:]-B_test.iloc[1:,:]),annot=True, cmap='Blues')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "dfeb6da7-8916-4bf2-9e1e-8e912ced27e4",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"a : distance: 0.004706798228196085 p-value: 0.2274892259071406\n",
|
|
"e : distance: 0.0034702911257064317 p-value: 0.5959712249062891\n",
|
|
"i : distance: 0.001685299119189354 p-value: 0.9990253593864518\n",
|
|
"om : distance: 0.004452073316022598 p-value: 0.2855013975223749\n",
|
|
"w : distance: 0.0037016791405732885 p-value: 0.5122989577027215\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"#Perform the Kolmogorov-Smirnov two-sample test to check that the training and test set are drawn from the same distributions\n",
|
|
"#The null hypothesis is that the distributions are the same\n",
|
|
"#We choose a confidence level of 99%, that is we will reject the null hypothesis in favour of the alternative if the p-value is less than 0.01\n",
|
|
"#First compute the KS test statistic and the p-value for all the variables a,e,i,om,w,MOID \n",
|
|
"distances=list(map(lambda i:\n",
|
|
" ks_2samp(df_train.iloc[:,i],df_test.iloc[:,i]).statistic, \n",
|
|
" range(df_train.shape[1])))\n",
|
|
"pvalues=list(map(lambda i:\n",
|
|
" ks_2samp(df_train.iloc[:,i],df_test.iloc[:,i]).pvalue,\n",
|
|
" range(df_train.shape[1])))\n",
|
|
"\n",
|
|
"for i in range(df_train.shape[1]):\n",
|
|
" print(df_train.columns[i], \": distance:\", distances[i], \"p-value: \", pvalues[i])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "5675a84e-9960-440b-af30-188b39c29c3d",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"MOID statistic 0.0029887162568097425\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(\"MOID statistic\",ks_2samp(ydf_train.loc[:,'MOID'],ydf_test.loc[:,'MOID']).statistic)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "120d7bfb-620f-42f2-8515-60dc802b8e4b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"MOID p-value 0.7731305746585694\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(\"MOID p-value\",ks_2samp(ydf_train.loc[:,'MOID'],ydf_test.loc[:,'MOID']).pvalue)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b666430a-ce44-426e-84d1-ef54d56ba2fb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#Since the p-values are all greater than the threshold alpha=0.01, we cannot reject the null hypothesis. \n",
|
|
"#Thus, the data are consistent with the two datasets being drawn from the same distribution (the test gives no evidence in favour of the alternative hypothesis)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "147defd4-1659-45f8-8a08-bf95049889e3",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n",
|
|
" with pd.option_context('mode.use_inf_as_na', True):\n",
|
|
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n",
|
|
" data_subset = grouped_data.get_group(pd_key)\n",
|
|
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n",
|
|
" data_subset = grouped_data.get_group(pd_key)\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<Axes: xlabel='a', ylabel='Proportion'>"
|
|
]
|
|
},
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"#Plot, as an example, the ECDFs for the variable a for both datasets\n",
|
|
"feature_name='a'\n",
|
|
"dataframe=pd.DataFrame({\n",
|
|
"feature_name: np.concatenate((df_train.loc[:,feature_name],df_test.loc[:,feature_name])),\n",
|
|
" 'set': ['training']*df_train.shape[0]+['test']*df_test.shape[0]\n",
|
|
"})\n",
|
|
"sns.ecdfplot(data=dataframe, x=feature_name, hue='set')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"id": "c504c1d7-b968-4cec-b69a-f296511fb5fa",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n",
|
|
" with pd.option_context('mode.use_inf_as_na', True):\n",
|
|
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n",
|
|
" data_subset = grouped_data.get_group(pd_key)\n",
|
|
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n",
|
|
" data_subset = grouped_data.get_group(pd_key)\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<Axes: xlabel='MOID', ylabel='Proportion'>"
|
|
]
|
|
},
|
|
"execution_count": 25,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"#And for the MOID\n",
|
|
"feature_name='MOID'\n",
|
|
"dataframe=pd.DataFrame({\n",
|
|
"feature_name: np.concatenate((ydf_train.loc[:,feature_name],ydf_test.loc[:,feature_name])),\n",
|
|
" 'set': ['training']*ydf_train.shape[0]+['test']*ydf_test.shape[0]\n",
|
|
"})\n",
|
|
"sns.ecdfplot(data=dataframe, x=feature_name, hue='set')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "7d572c0d-7c08-4968-956a-b0bed21bbf63",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#Finally, extract the validation set from the training set\n",
|
|
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "115562d0-3da6-4872-a251-82ecb7a0ea76",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Size of training set: 600000\n",
|
|
"Size of validation set: 150000\n",
|
|
"Size of test set: 52376\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(\"Size of training set:\",X_train.shape[0])\n",
|
|
"print(\"Size of validation set:\", X_val.shape[0])\n",
|
|
"print(\"Size of test set:\", X_test.shape[0])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "5bdc5d56-6917-4d79-b80f-6e746ad9bd3d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#Save the split dataset for future use\n",
|
|
"np.save('/home/unipi/v.vichi3/Desktop/X_train', X_train)\n",
|
|
"np.save('/home/unipi/v.vichi3/Desktop/X_val', X_val)\n",
|
|
"np.save('/home/unipi/v.vichi3/Desktop/X_test', X_test)\n",
|
|
"np.save('/home/unipi/v.vichi3/Desktop/y_train', y_train)\n",
|
|
"np.save('/home/unipi/v.vichi3/Desktop/y_val', y_val)\n",
|
|
"np.save('/home/unipi/v.vichi3/Desktop/y_test', y_test)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.19"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|