You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

733 lines
124 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "f1396231-0139-456f-96d8-6f18199e8e25",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import math\n",
"from sklearn.model_selection import train_test_split\n",
"import scipy\n",
"from scipy.stats import ks_2samp"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3521f41e-c9d2-42f3-b657-45634f3fd8ac",
"metadata": {},
"outputs": [],
"source": [
"#Load the dataset from the CSV file on disk\n",
"csv_path=r'/home/unipi/v.vichi3/Desktop/dataframe.csv'\n",
"df=pd.read_csv(csv_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ab94adc5-854e-49cd-9a9f-bce8ffd4ee87",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>a</th>\n",
" <th>e</th>\n",
" <th>i</th>\n",
" <th>om</th>\n",
" <th>w</th>\n",
" <th>ma</th>\n",
" <th>H-value</th>\n",
" <th>MOID</th>\n",
" <th>EPOCH</th>\n",
" <th>PHA</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'0000001'</td>\n",
" <td>2.662454</td>\n",
" <td>0.533875</td>\n",
" <td>26.69427</td>\n",
" <td>19.950131</td>\n",
" <td>284.514731</td>\n",
" <td>14.514345</td>\n",
" <td>9.45</td>\n",
" <td>0.4615</td>\n",
" <td>59945.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>'0000002'</td>\n",
" <td>1.457965</td>\n",
" <td>0.222633</td>\n",
" <td>10.82898</td>\n",
" <td>209.642798</td>\n",
" <td>343.753667</td>\n",
" <td>338.406046</td>\n",
" <td>11.16</td>\n",
" <td>0.1356</td>\n",
" <td>59945.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>'0000003'</td>\n",
" <td>1.893819</td>\n",
" <td>0.538294</td>\n",
" <td>41.18978</td>\n",
" <td>286.606414</td>\n",
" <td>287.585635</td>\n",
" <td>83.688834</td>\n",
" <td>12.40</td>\n",
" <td>0.1162</td>\n",
" <td>59945.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>'0000004'</td>\n",
" <td>2.001212</td>\n",
" <td>0.448742</td>\n",
" <td>17.44771</td>\n",
" <td>214.338458</td>\n",
" <td>245.558687</td>\n",
" <td>15.816629</td>\n",
" <td>12.60</td>\n",
" <td>0.2447</td>\n",
" <td>59945.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>'0000005'</td>\n",
" <td>4.221445</td>\n",
" <td>0.713243</td>\n",
" <td>30.98228</td>\n",
" <td>206.677675</td>\n",
" <td>330.127884</td>\n",
" <td>117.375359</td>\n",
" <td>12.90</td>\n",
" <td>0.2592</td>\n",
" <td>59945.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID a e i om w \\\n",
"0 '0000001' 2.662454 0.533875 26.69427 19.950131 284.514731 \n",
"1 '0000002' 1.457965 0.222633 10.82898 209.642798 343.753667 \n",
"2 '0000003' 1.893819 0.538294 41.18978 286.606414 287.585635 \n",
"3 '0000004' 2.001212 0.448742 17.44771 214.338458 245.558687 \n",
"4 '0000005' 4.221445 0.713243 30.98228 206.677675 330.127884 \n",
"\n",
" ma H-value MOID EPOCH PHA \n",
"0 14.514345 9.45 0.4615 59945.0 0.0 \n",
"1 338.406046 11.16 0.1356 59945.0 0.0 \n",
"2 83.688834 12.40 0.1162 59945.0 0.0 \n",
"3 15.816629 12.60 0.2447 59945.0 0.0 \n",
"4 117.375359 12.90 0.2592 59945.0 0.0 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Preview the first five rows of the loaded table\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "21f6c17e-a521-455d-82c3-0f6aeea7df77",
"metadata": {},
"outputs": [],
"source": [
"#Select the features (a, e, i, om, w) and the target (MOID) by column name rather than by position,\n",
"#so that a change in the CSV column order raises a KeyError instead of silently picking the wrong columns\n",
"kep=df.loc[:,['a','e','i','om','w']]\n",
"#Convert both to NumPy arrays, the format used for the splits below\n",
"X=kep.to_numpy()\n",
"y=df.loc[:,'MOID'].to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "505356c7-da2b-4929-b65b-f775f835cc5a",
"metadata": {},
"outputs": [],
"source": [
"#Split the data into training and test sets\n",
"#train_size is given as an absolute number of samples; the remaining rows form the test set\n",
"#random_state is fixed so that Restart & Run All reproduces exactly the same split (and the same statistics below)\n",
"X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=750000, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9458e17f-873a-4355-964b-6c162eebd71e",
"metadata": {},
"outputs": [],
"source": [
"#Wrap the split arrays in labelled DataFrames so that they can be summarised and tested column by column\n",
"feature_cols=['a','e','i','om','w']\n",
"target_cols=['MOID']\n",
"df_train=pd.DataFrame(data=X_train, columns=feature_cols)\n",
"df_test=pd.DataFrame(data=X_test, columns=feature_cols)\n",
"ydf_train=pd.DataFrame(data=y_train, columns=target_cols)\n",
"ydf_test=pd.DataFrame(data=y_test, columns=target_cols)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "32d4bbfb-26fa-4add-8246-e1c267667b28",
"metadata": {},
"outputs": [],
"source": [
"#Summary statistics (count, mean, std, min, quartiles, max) of each split,\n",
"#computed once here and compared between training and test set in the following cells\n",
"A_train, A_test = df_train.describe(), df_test.describe()\n",
"B_train, B_test = ydf_train.describe(), ydf_test.describe()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "af994684-c4b9-461f-8c0e-e13e0cd9b909",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>e</th>\n",
" <th>i</th>\n",
" <th>om</th>\n",
" <th>w</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.002937</td>\n",
" <td>-0.000607</td>\n",
" <td>0.027988</td>\n",
" <td>0.420302</td>\n",
" <td>0.343676</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>-0.001472</td>\n",
" <td>0.000403</td>\n",
" <td>0.064839</td>\n",
" <td>-0.227066</td>\n",
" <td>0.136723</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>-0.000002</td>\n",
" <td>-0.001717</td>\n",
" <td>-0.049321</td>\n",
" <td>-0.000011</td>\n",
" <td>-0.023646</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.005710</td>\n",
" <td>-0.000832</td>\n",
" <td>0.016489</td>\n",
" <td>1.228961</td>\n",
" <td>0.372439</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.009029</td>\n",
" <td>-0.000021</td>\n",
" <td>0.066847</td>\n",
" <td>0.206380</td>\n",
" <td>0.186781</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>0.000243</td>\n",
" <td>-0.000705</td>\n",
" <td>-0.010827</td>\n",
" <td>-0.018515</td>\n",
" <td>0.263608</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>0.027082</td>\n",
" <td>0.000222</td>\n",
" <td>3.584705</td>\n",
" <td>0.001342</td>\n",
" <td>0.022134</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a e i om w\n",
"mean 0.002937 -0.000607 0.027988 0.420302 0.343676\n",
"std -0.001472 0.000403 0.064839 -0.227066 0.136723\n",
"min -0.000002 -0.001717 -0.049321 -0.000011 -0.023646\n",
"25% 0.005710 -0.000832 0.016489 1.228961 0.372439\n",
"50% 0.009029 -0.000021 0.066847 0.206380 0.186781\n",
"75% 0.000243 -0.000705 -0.010827 -0.018515 0.263608\n",
"max 0.027082 0.000222 3.584705 0.001342 0.022134"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Difference (training minus test) of the summary statistics of the features\n",
"#The first row of the describe output is the count of objects, so it is skipped\n",
"A_train.iloc[1:].sub(A_test.iloc[1:])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "17336f68-0547-40f9-a003-a86422fa899e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: >"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"#Heatmap of the absolute training/test differences of the feature summary statistics (count row skipped)\n",
"abs_diff_features=(A_train.iloc[1:]-A_test.iloc[1:]).abs()\n",
"sns.heatmap(abs_diff_features,annot=True,cmap='Blues')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2f5fcc38-f866-4350-b5d1-ba27914a505b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MOID</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.000081</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>-0.000106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.000600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>0.000700</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>0.099600</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MOID\n",
"mean 0.000081\n",
"std -0.000106\n",
"min 0.000000\n",
"25% 0.000000\n",
"50% 0.000600\n",
"75% 0.000700\n",
"max 0.099600"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Difference (training minus test) of the MOID summary statistics, skipping the count row\n",
"B_train.iloc[1:].sub(B_test.iloc[1:])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "62315497-b84c-4466-a35c-042144607977",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: >"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"#Heatmap of the absolute training/test differences of the MOID summary statistics (count row skipped)\n",
"abs_diff_moid=(B_train.iloc[1:]-B_test.iloc[1:]).abs()\n",
"sns.heatmap(abs_diff_moid,annot=True, cmap='Blues')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "dfeb6da7-8916-4bf2-9e1e-8e912ced27e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a : distance: 0.004706798228196085 p-value: 0.2274892259071406\n",
"e : distance: 0.0034702911257064317 p-value: 0.5959712249062891\n",
"i : distance: 0.001685299119189354 p-value: 0.9990253593864518\n",
"om : distance: 0.004452073316022598 p-value: 0.2855013975223749\n",
"w : distance: 0.0037016791405732885 p-value: 0.5122989577027215\n"
]
}
],
"source": [
"#Perform the two-sample Kolmogorov-Smirnov test to check whether the training and test sets are drawn from the same distributions\n",
"#The null hypothesis is that the distributions are the same\n",
"#We choose a significance level alpha=0.01, that is we reject the null hypothesis in favour of the alternative if the p-value is less than 0.01\n",
"#Here the test is applied to the feature variables a,e,i,om,w (the MOID is tested in the next cells)\n",
"#ks_2samp is run only once per column: the result object carries both the statistic and the p-value,\n",
"#so there is no need to repeat the test on the full samples to read the second value\n",
"ks_results=[ks_2samp(df_train[col],df_test[col]) for col in df_train.columns]\n",
"distances=[res.statistic for res in ks_results]\n",
"pvalues=[res.pvalue for res in ks_results]\n",
"\n",
"for col,distance,pvalue in zip(df_train.columns,distances,pvalues):\n",
"    print(col, \": distance:\", distance, \"p-value: \", pvalue)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "5675a84e-9960-440b-af30-188b39c29c3d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MOID statistic 0.0029887162568097425\n"
]
}
],
"source": [
"#KS distance between the training and test distributions of the MOID\n",
"moid_ks=ks_2samp(ydf_train['MOID'],ydf_test['MOID'])\n",
"print(\"MOID statistic\",moid_ks.statistic)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "120d7bfb-620f-42f2-8515-60dc802b8e4b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MOID p-value 0.7731305746585694\n"
]
}
],
"source": [
"#p-value of the KS test between the training and test distributions of the MOID\n",
"moid_ks=ks_2samp(ydf_train['MOID'],ydf_test['MOID'])\n",
"print(\"MOID p-value\",moid_ks.pvalue)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b666430a-ce44-426e-84d1-ef54d56ba2fb",
"metadata": {},
"outputs": [],
"source": [
"#Since all the p-values are greater than the significance level alpha=0.01, we cannot reject the null hypothesis.\n",
"#Thus, the data are consistent with the training and test sets being drawn from the same distribution (the test provides no evidence in favour of the alternative hypothesis)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "147defd4-1659-45f8-8a08-bf95049889e3",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n",
" with pd.option_context('mode.use_inf_as_na', True):\n",
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n",
" data_subset = grouped_data.get_group(pd_key)\n",
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n",
" data_subset = grouped_data.get_group(pd_key)\n"
]
},
{
"data": {
"text/plain": [
"<Axes: xlabel='a', ylabel='Proportion'>"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"#Plot, as an example, the ECDFs of the variable a for the training and the test set\n",
"feature_name='a'\n",
"#Stack the training values on top of the test values and label each row with the set it comes from\n",
"stacked_values=np.concatenate((df_train[feature_name],df_test[feature_name]))\n",
"set_labels=['training']*len(df_train)+['test']*len(df_test)\n",
"dataframe=pd.DataFrame({feature_name: stacked_values, 'set': set_labels})\n",
"sns.ecdfplot(data=dataframe, x=feature_name, hue='set')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "c504c1d7-b968-4cec-b69a-f296511fb5fa",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n",
" with pd.option_context('mode.use_inf_as_na', True):\n",
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n",
" data_subset = grouped_data.get_group(pd_key)\n",
"/home/unipi/v.vichi3/miniconda3/envs/py39/lib/python3.9/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n",
" data_subset = grouped_data.get_group(pd_key)\n"
]
},
{
"data": {
"text/plain": [
"<Axes: xlabel='MOID', ylabel='Proportion'>"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"#Same ECDF comparison between training and test set, this time for the MOID\n",
"feature_name='MOID'\n",
"#Stack the training values on top of the test values and label each row with the set it comes from\n",
"stacked_values=np.concatenate((ydf_train[feature_name],ydf_test[feature_name]))\n",
"set_labels=['training']*len(ydf_train)+['test']*len(ydf_test)\n",
"dataframe=pd.DataFrame({feature_name: stacked_values, 'set': set_labels})\n",
"sns.ecdfplot(data=dataframe, x=feature_name, hue='set')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "7d572c0d-7c08-4968-956a-b0bed21bbf63",
"metadata": {},
"outputs": [],
"source": [
"#Finally, extract the validation set (20% of the rows) from the training set\n",
"#random_state is fixed so that the same validation set is obtained on every run\n",
"#NOTE: this cell overwrites X_train and y_train, so re-running it without re-running the first split shrinks the training set again\n",
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "115562d0-3da6-4872-a251-82ecb7a0ea76",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Size of training set: 600000\n",
"Size of validation set: 150000\n",
"Size of test set: 52376\n"
]
}
],
"source": [
"#Report how many samples ended up in each of the three splits\n",
"print(\"Size of training set:\",len(X_train))\n",
"print(\"Size of validation set:\", len(X_val))\n",
"print(\"Size of test set:\", len(X_test))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "5bdc5d56-6917-4d79-b80f-6e746ad9bd3d",
"metadata": {},
"outputs": [],
"source": [
"#Save the split dataset for future use: one file per array, written in the order listed\n",
"save_targets=(\n",
"    ('/home/unipi/v.vichi3/Desktop/X_train', X_train),\n",
"    ('/home/unipi/v.vichi3/Desktop/X_val', X_val),\n",
"    ('/home/unipi/v.vichi3/Desktop/X_test', X_test),\n",
"    ('/home/unipi/v.vichi3/Desktop/y_train', y_train),\n",
"    ('/home/unipi/v.vichi3/Desktop/y_val', y_val),\n",
"    ('/home/unipi/v.vichi3/Desktop/y_test', y_test),\n",
")\n",
"for out_path, split_array in save_targets:\n",
"    np.save(out_path, split_array)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}