{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Support Vector Machines on the ionosphere dataset\n", "#### Mario Martin\n", "\n", "We used this dataset in the Meta-methods notebook, so open it in another window to compare results with other methods." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np # Llibreria matemàtica\n", "import matplotlib.pyplot as plt # Per mostrar plots\n", "import sklearn # Llibreria de DM\n", "import sklearn.datasets as ds # Per carregar més facilment el dataset digits\n", "import sklearn.model_selection as cv # Pel Cross-validation\n", "import sklearn.neighbors as nb # Per fer servir el knn\n", "from sklearn.model_selection import cross_val_score \n", "%matplotlib inline " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np # Numeric and matrix computation\n", "import pandas as pd # Optional: good package for manipulating data \n", "import sklearn as sk # Package with learning algorithms implemented\n", "\n", "# Loading the dataset.\n", "url = \"http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data\"\n", "df = pd.read_csv(url,header =None)\n", "\n", "y=df[34].values\n", "X=df.values[:,0:34].astype('float32')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's separate the data into training (for adjusting parameters), and validation. " ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "(X_train, X_test, y_train, y_test) = cv.train_test_split(X, y, test_size=.3, stratify = y,random_state=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notice the stratification according to labels *y* that we demand in the splitting of data. 
The ionosphere dataset is small and with stratification we ensure we obtain the same proportion of examples of each class in training and test sets.\n", "\n", "**Remember**. Data should be numerical and normalized or standardized before using an SVM. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Normalization is not strictly necessary in our dataset because almost all columns are in range -1..1 (except columns 0 and 1, which only take values in 0..1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...24252627282930313233
count351.000000351.0351.000000351.000000351.000000351.000000351.000000351.000000351.000000351.000000...351.000000351.000000351.000000351.000000351.000000351.000000351.000000351.000000351.000000351.000000
mean0.8917380.00.6413420.0443720.6010680.1158890.5500950.1193600.5118480.181345...0.396135-0.0711870.541641-0.0695380.378445-0.0279070.352514-0.0037940.3493640.014480
std0.3111550.00.4977080.4414350.5198620.4608100.4926540.5207500.5070660.483851...0.5784510.5084950.5162050.5500250.5758860.5079740.5714830.5135740.5226630.468337
min0.0000000.0-1.000000-1.000000-1.000000-1.000000-1.000000-1.000000-1.000000-1.000000...-1.000000-1.000000-1.000000-1.000000-1.000000-1.000000-1.000000-1.000000-1.000000-1.000000
25%1.0000000.00.472135-0.0647350.412660-0.0247950.211310-0.0548400.087110-0.048075...0.000000-0.3323900.286435-0.4431650.000000-0.2368850.000000-0.2425950.000000-0.165350
50%1.0000000.00.8711100.0163100.8092000.0228000.7287300.0147100.6842100.018290...0.553890-0.0150500.708240-0.0176900.4966400.0000000.4427700.0000000.4095600.000000
75%1.0000000.01.0000000.1941851.0000000.3346550.9692400.4456750.9532400.534195...0.9052400.1567650.9999450.1535350.8834650.1540750.8576200.2001200.8137650.171660
max1.0000000.01.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000...1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
\n", "

8 rows × 34 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 \\\n", "count 351.000000 351.0 351.000000 351.000000 351.000000 351.000000 \n", "mean 0.891738 0.0 0.641342 0.044372 0.601068 0.115889 \n", "std 0.311155 0.0 0.497708 0.441435 0.519862 0.460810 \n", "min 0.000000 0.0 -1.000000 -1.000000 -1.000000 -1.000000 \n", "25% 1.000000 0.0 0.472135 -0.064735 0.412660 -0.024795 \n", "50% 1.000000 0.0 0.871110 0.016310 0.809200 0.022800 \n", "75% 1.000000 0.0 1.000000 0.194185 1.000000 0.334655 \n", "max 1.000000 0.0 1.000000 1.000000 1.000000 1.000000 \n", "\n", " 6 7 8 9 ... 24 \\\n", "count 351.000000 351.000000 351.000000 351.000000 ... 351.000000 \n", "mean 0.550095 0.119360 0.511848 0.181345 ... 0.396135 \n", "std 0.492654 0.520750 0.507066 0.483851 ... 0.578451 \n", "min -1.000000 -1.000000 -1.000000 -1.000000 ... -1.000000 \n", "25% 0.211310 -0.054840 0.087110 -0.048075 ... 0.000000 \n", "50% 0.728730 0.014710 0.684210 0.018290 ... 0.553890 \n", "75% 0.969240 0.445675 0.953240 0.534195 ... 0.905240 \n", "max 1.000000 1.000000 1.000000 1.000000 ... 
1.000000 \n", "\n", " 25 26 27 28 29 30 \\\n", "count 351.000000 351.000000 351.000000 351.000000 351.000000 351.000000 \n", "mean -0.071187 0.541641 -0.069538 0.378445 -0.027907 0.352514 \n", "std 0.508495 0.516205 0.550025 0.575886 0.507974 0.571483 \n", "min -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 \n", "25% -0.332390 0.286435 -0.443165 0.000000 -0.236885 0.000000 \n", "50% -0.015050 0.708240 -0.017690 0.496640 0.000000 0.442770 \n", "75% 0.156765 0.999945 0.153535 0.883465 0.154075 0.857620 \n", "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", "\n", " 31 32 33 \n", "count 351.000000 351.000000 351.000000 \n", "mean -0.003794 0.349364 0.014480 \n", "std 0.513574 0.522663 0.468337 \n", "min -1.000000 -1.000000 -1.000000 \n", "25% -0.242595 0.000000 -0.165350 \n", "50% 0.000000 0.409560 0.000000 \n", "75% 0.200120 0.813765 0.171660 \n", "max 1.000000 1.000000 1.000000 \n", "\n", "[8 rows x 34 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "However, let's see how to do that properly:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler\n", "from sklearn.preprocessing import MinMaxScaler\n", "\n", "#scaler = StandardScaler().fit(X_train)\n", "scaler = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)\n", "\n", "# Apply the normalization trained in training data in both training and test sets\n", "X_train = scaler.transform(X_train)\n", "X_test = scaler.transform(X_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Linear SVM\n", "\n", "Let's try an SVM with default parameters. Linear means that we are not using any kernel to move the data to a higher dimensional space." 
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Confusion matrix on test set:\n", " [[27 11]\n", " [ 2 66]]\n", "\n", "Accuracy on test set: 0.8773584905660378\n" ] } ], "source": [ "from sklearn.svm import SVC\n", "from sklearn.svm import LinearSVC\n", "from sklearn.model_selection import GridSearchCV\n", "\n", "#knc = LinearSVC() \n", "knc = SVC(kernel='linear')\n", "knc.fit(X_train, y_train)\n", "pred=knc.predict(X_test)\n", "print(\"Confusion matrix on test set:\\n\",sklearn.metrics.confusion_matrix(y_test, pred))\n", "print(\"\\nAccuracy on test set: \",sklearn.metrics.accuracy_score(y_test, pred))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Not bad results. However, the linear SVM has parameter C that has to be adjusted. We will use *GridSearch* method to find the optimal value of C like we did in a previous notebook with the k value of the KNN algorithm. " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Acc. 10-fold cross on train data= 0.8744420289855073\n", "\n", "Confusion matrix on test set:\n", " [[27 11]\n", " [ 2 66]]\n", "\n", "Accuracy on test set: 0.8773584905660378\n", "\n", "Best value of parameter C found: {'C': 1.0}\n", "\n", "Number of supports: 72 ( 49 of them have slacks)\n", "Prop. of supports: 0.2938775510204082\n" ] } ], "source": [ "# List of C values to test. We usualy test diverse orders of magnitude\n", "#Cs = np.logspace(-3, 11, num=15, base=10.0)\n", "Cs = np.logspace(-3, 5, num=9, base=10.0)\n", "\n", "param_grid = {'C': Cs}\n", "#grid_search = GridSearchCV(LinearSVC(), param_grid, cv=10)\n", "grid_search = GridSearchCV(SVC(kernel='linear'), param_grid, cv=10)\n", "grid_search.fit(X_train,y_train)\n", "\n", "# Let's plot the 10-fold cross.validation accuracy deppending on C\n", "scores = grid_search.cv_results_['mean_test_score']\n", "plt.semilogx(Cs,scores)\n", "plt.show()\n", "\n", "parval=grid_search.best_params_\n", "cvacc = cross_val_score(SVC(C=parval['C'],kernel='linear'), X=X_train, y=y_train, cv=10, scoring='accuracy')\n", "print('Acc. 10-fold cross on train data= ', cvacc.mean())\n", "\n", "\n", "# Let's apply the best C parameter found to the test set\n", "\n", "#knc = LinearSVC(C=parval['C']) \n", "knc = SVC(C=parval['C'],kernel='linear')\n", "knc.fit(X_train, y_train)\n", "pred=knc.predict(X_test)\n", "print(\"\\nConfusion matrix on test set:\\n\",sklearn.metrics.confusion_matrix(y_test, pred))\n", "print(\"\\nAccuracy on test set: \",sklearn.metrics.accuracy_score(y_test, pred))\n", "print(\"\\nBest value of parameter C found: \",parval)\n", "print(\"\\nNumber of supports: \",np.sum(knc.n_support_), \"(\",np.sum(np.abs(knc.dual_coef_)==parval['C']) ,\"of them have slacks)\")\n", "print(\"Prop. 
of supports: \",np.sum(knc.n_support_)/X_train.shape[0])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On this dataset, the best C for a linear SVM is 1 (which coincidentally is also the default value for parameter C), so we don't obtain any improvement by tuning the C parameter. However, on other datasets we could obtain a dramatic increase in accuracy. \n", "\n", "Let's see (just for fun) how the C parameter affects performance on the training folds and on the held-out cross-validation folds. " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from sklearn.model_selection import validation_curve\n", "\n", "def plot_validation_curve(parameter_values, train_scores, validation_scores):\n", " train_scores_mean = np.mean(train_scores, axis=1)\n", " train_scores_std = np.std(train_scores, axis=1)\n", " validation_scores_mean = np.mean(validation_scores, axis=1)\n", " validation_scores_std = np.std(validation_scores, axis=1)\n", "\n", " plt.fill_between(parameter_values, train_scores_mean - train_scores_std,\n", " train_scores_mean + train_scores_std, alpha=0.1,\n", " color=\"r\")\n", " plt.fill_between(parameter_values, validation_scores_mean - validation_scores_std,\n", " validation_scores_mean + validation_scores_std, alpha=0.1, color=\"g\")\n", " plt.plot(parameter_values, train_scores_mean, 'o-', color=\"r\",\n", " label=\"Training score\")\n", " plt.plot(parameter_values, validation_scores_mean, 'o-', color=\"g\",\n", " label=\"Cross-validation score\")\n", " plt.ylim(validation_scores_mean.min() - .1, train_scores_mean.max() + .1)\n", " plt.legend(loc=4)\n", "\n", "\n", "training_scores, test_scores = validation_curve(SVC(kernel='linear'), X_train, y_train, param_name=\"C\", param_range=Cs,cv=10)\n", "plot_validation_curve(range(len(Cs)), training_scores, test_scores)\n", "plt.xticks(range(len(Cs)), Cs,rotation='vertical');\n", "plt.ylim([0.6, 1])\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notice that from value of C=1, increasing C results in better accuracy on the training set but worse performance on the test set. This is because being too demanding on the separation of data in the training dataset, we are overfitting to it and we decrease performance in the test set. A nice picture of typical overfitting." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Polynomial kernels\n", "\n", "We have seen that performance with a linear SVM is OK but not competitive with meta-methods. However, it could happen that using kernels we could improve accuracy. We'll first try a polynomial kernel of degree 2 with default parameters." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Confusion matrix on test set:\n", " [[29 9]\n", " [ 0 68]]\n", "\n", "Accuracy on test set: 0.9150943396226415\n" ] } ], "source": [ "knc = SVC(kernel='poly',degree =2) \n", "knc.fit(X_train, y_train)\n", "pred=knc.predict(X_test)\n", "print(\"Confusion matrix on test set:\\n\",sklearn.metrics.confusion_matrix(y_test, pred))\n", "print(\"\\nAccuracy on test set: \",sklearn.metrics.accuracy_score(y_test, pred))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Better performance than that obtained with a linear SVM... It could even be increased because we didn't tune the C parameter for the polynomial kernel. Let's do that now." 
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEACAYAAABfxaZOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHFVJREFUeJzt3Xt81PWd7/HXJwkhgNwTUAkBFLzg3ebQ1ru2KrVWqnVd3O1jbU+71LbqVq1V2z3q0tVebOvRHleru7Rn6yqL2gtbOaWud22txAtYUCRgZ5IGJcwAQgYSknzOH5noGBMySWby+/1m3s/HYx7O/Ob3C++M5J0f3/nO92fujoiIFIeSoAOIiMjwUemLiBQRlb6ISBFR6YuIFBGVvohIEVHpi4gUEZW+iEgRUemLiBQRlb6ISBFR6YuIFJGyoAP0VFlZ6TNnzgw6hohIpLz44otb3b2qv/1CV/ozZ86krq4u6BgiIpFiZrFs9tPwjohIEVHpi4gUEZW+iEgRUemLiBQRlb6ISBFR6YuIFBGVvgTmrR17eKVhO7pkp8jwCd08fSlcO/fs5Y+bkjxbv5Vn67dSv2UXAGccNoXvXHAUU8dVBJxQpPCp9CVv9nZ0srphO89s2Mpz9Vt5uWE7HZ1OxYgS5s2azEW11XR0wu2PvcGZP3qKGz91BBccPw0zCzq6SMFS6UvOuDv1W3Z1nclv2MrzmxK0tHVgBkdPG8+XTjmIk+ZUcnzNRCpGlL573Pwj9+eaB1dz9YOrWfHqZm7RWb9I3ljYxlNra2tdyzBEx5Z39vDcxq3vns2//U4rADMmj+ak2ZWcNLuSjx48mQmjy/f5dTo6nZ8+9ya3rlzPyLISbjrvCM4/Tmf9Itkysxfdvba//XSmLwPS0trOH99M8OyGBM/WN/PG213j8hNHj+CEdMmfNLuS6ZNGD+jrlpYYXzz5IM44bArXPLSGq5atZsWrb3HL+UcyRWf9IjmjM33pV/dZ+O/Wvs1L8W20dzrlZSXMmzmJE2dXcvKcSuYeMI6SktyclWee9VeMKOWfzjuCBcceqLN+kX3I9kxfpS/9+tdnNvHPj7zGEQeO4+Q5VZw0u5Lame8fl8+Hjc27+PqDq3k5vp0z507l5vOPZMpYnfWL9EalLzlRv2Un59zxLKfMqeLev/vQsJ9td3Q6//bsJn7wuzcYXd511n/eMTrrF+kp29LXh7OkT+0dnVy1bDVjyku55YIjAyna0hJj0SkHs+KKk5k5eQz/sPQVLr3vRZp3tg57FpFCoNKXPt315EbWNO7gnz99VODDKrOn7MfDXz6B6z5xGE+sb+as255i+eomfZpXZICyKn0zm29m682s3syu6+X5GWb2mJmtMbMnzaw647lLzGxD+nZJLsNL/qxt2sHtj23gU8ccyCePPiDoOEDXWf+lpx7MiitOombyGK544GW+fN9LbN2ls36RbPVb+mZWCtwJfAKYC1xsZnN77PYD4N/d/WhgMfCd9LGTgBuBDwPzgBvNbGLu4ks+tLZ3cPWy1UwcU87i844IOs4HzJ4ylocv/SjXzj+Mx1/fwpk/eorfrGkKOpZIJGRzpj8PqHf3Te7eBiwFFvTYZy7wWPr+ExnPnw086u5Jd98GPArMH3psyafb/3sDr7+1k+9ecBQTx+z7Q1VBKSst4cunHcwjV5xEzaTRXHb/y3zlP17UWb9IP7Ip/WlAQ8bjxvS2TKuBz6Tvnw+MNbPJWR4rIfJSfBt3P7WRi2qr+djhU4OO0685U8fy8JdP4BvzD+W/123hrNue5pE1m4OOJRJa2ZR+b1M2er579nXgVDN7GTgV+AvQnuWxmNkiM6szs7rm5uYsIkk+7G7r4OvLVnPA+FH8r3N7ju
CFV1lpCV85bTa/ueIkqieO4qv3v8SXfl7HpuZdQUcTCZ1sSr8RmJ7xuBp43wCquze5+wXufhzwrfS2Hdkcm973HnevdffaqqqqAX4LkivfX/k6m7a2cOuFRzO2YkTQcQbskKlj+UX6rP+ZDVs587an+eYvX+Xtd/YEHU0kNLIp/VXAHDObZWblwEJgeeYOZlZpZt1f63pgSfr+SuAsM5uYfgP3rPQ2CZnfb9zKT5/7M587YSYnzK4MOs6gdZ/1P3XN6Xz2wzU8WNfAqbc+wfd++zo7du8NOp5I4PotfXdvBy6jq6xfA5a5+1ozW2xm56V3Ow1Yb2ZvAFOBm9PHJoFv0/WLYxWwOL1NQmRXazvXPLiGWZVjuHb+YUHHyYmqsSP5pwVH8thVp3H2Eftz15MbOeX7T/CTpzayZ29H0PFEAqNlGITrf7GG/1zVwIOXfpQPzZgUdJy8WNu0g1tXrufJ9c0cML6Cr318Dp85vpqyUn0+UQqDlmGQrDzx+hYeeKGBRaccXLCFD3DEgeP52efnsXTRR5g6roJrH36V+bc/w2//9JY+1StFRaVfxLan2rj24TUcMnU/rjxzTtBxhsVHDprML79yAnd/9kO4O5fe9yIX3PV7nt+UCDqayLBQ6RexG5evJdnSxo8uOpaRZfldJjlMzIz5R+7Pyq+dwvc+cxSbt+9h4T3Pc8mSF1jbtCPoeCJ5pdIvUite3cyvX2ni8jPmcOS08UHHCURZaQl//T9qePKa0/jmOYfxSsN2PnnHs/zD0peJJ1JBxxPJC72RW4Sad7Zy9v9+mmkTRvGLr5zACL2ZCcCO3Xv5yVMbWfLcm3R0On8zr4bLzphD1diRQUcT6ZfeyJVeuTvf+uWr7Gpt54cXHaPCzzB+1Ai+Mf8wnrrmdP6qdjr3/THOqbc+wY8efYOdezTHXwqDLoxeZH758l/43bq3+eY5h3HI1LFBxwmlqeMquOX8o/jiSbP44e/e4I7HNnDf8zG+evpsTj+0SlftkrwZWVbCgRNG5fXP0PBOEdm8Yzdn3fY0h04dy39+6aOU5uhC5oVuTeN2vvfb13muXjN8JL+OnT6BX331xEEdm+3wjs70i4S7842H1tDe4fzgr45R4Q/A0dUT+I8vfoS6Pydp3LY76DhSwCaMzv+aVyr9InH/C3Ge2bCVby84gpmVY4KOE0m1MydROzPoFCJDo3fxikA8keLmR17jpNmV/O2HZwQdR0QCpNIvcJ2dztcfXE2pGd+78GhKNKwjUtRU+gVuyXNv8sKfk9zwqblMy/OsABEJP5V+AavfspPvr1zPxw+fwoUfqg46joiEgEq/QLV3dHL1stWMLi/llguO0txyEQE0e6dg3f3URlY37uD//M1xTBlbEXQcEQkJnekXoLVNO7j9sQ2ce/QBnHv0gUHHEZEQUekXmNb2Dq5etprxo8r59oIjg44jIiGj4Z0C8+PH6nn9rZ3869/VMnFMedBxRCRkdKZfYJauauCsuVP5+NypQUcRkRBS6ReQltZ2tu5q5ZjpE4KOIiIhpdIvIPFk19WeaiaNDjiJiISVSr+AxNKX+JsxWaUvIr1T6ReQeLIFgBmTtIqmiPROpV9AYokU40eNYPwwrMktItGk0i8g8WRKQzsisk9Zlb6ZzTez9WZWb2bX9fJ8jZk9YWYvm9kaMzsnvX2mme02s1fSt7tz/Q3Ie+LJFNP1Jq6I7EO/H84ys1LgTuBMoBFYZWbL3X1dxm7/CCxz97vMbC6wApiZfm6jux+b29jSU3tHJ3/ZtptPHnVA0FFEJMSyOdOfB9S7+yZ3bwOWAgt67OPAuPT98UBT7iJKNpq276G90zW8IyL7lE3pTwMaMh43prdlugn4rJk10nWWf3nGc7PSwz5PmdnJQwkrfYulZ+7UaOaOiOxDNqXf20Ls3uPxxcDP3L0aOAf4uZmVAJuBGnc/DrgKuN/MxvU4FjNbZGZ1ZlbX3Nw8sO9AAM3RF5HsZFP6jcD0jMfVfH
D45gvAMgB3/wNQAVS6e6u7J9LbXwQ2Aof0/APc/R53r3X32qqqqoF/F0JDMkV5aQlTx2ntfBHpWzalvwqYY2azzKwcWAgs77FPHPgYgJkdTlfpN5tZVfqNYMzsIGAOsClX4eU9sUSK6kmjKNWFz0VkH/qdvePu7WZ2GbASKAWWuPtaM1sM1Ln7cuBq4F4zu5KuoZ/Pubub2SnAYjNrBzqAS909mbfvpojFkilmaLqmiPQjq/X03X0FXW/QZm67IeP+OuDEXo57GHh4iBmlH+5OPNHCh2dNCjqKiIScPpFbABItbbS0dWh1TRHpl0q/AHQvqayZOyLSH5V+AYgntI6+iGRHpV8Auufoa90dEemPSr8AxJIt7D+ugooRpUFHEZGQU+kXgHgiRY3G80UkCyr9AqA5+iKSLZV+xO1u66B5Z6vexBWRrKj0I657uqaGd0QkGyr9iIsl0hdDn6wllUWkfyr9iHv3g1ka3hGRLKj0Iy6WSDG2oowJo0cEHUVEIkClH3HxZIqaSaMx05LKItI/lX7ExZMprbkjIllT6UdYR6fTuC2l6+KKSNZU+hHWtH03eztcZ/oikjWVfoRp5o6IDJRKP8L0wSwRGSiVfoTFEilGlBoHjB8VdBQRiQiVfoTFky1UTxxNaYmma4pIdlT6ERZLpLTQmogMiEo/otydeEJz9EVkYFT6EbU9tZedre060xeRAVHpR1QsqYuhi8jAqfQjSksqi8hgqPQjKp7Qmb6IDJxKP6JiyRRTxo5kVHlp0FFEJEKyKn0zm29m682s3syu6+X5GjN7wsxeNrM1ZnZOxnPXp49bb2Zn5zJ8MdPMHREZjH5L38xKgTuBTwBzgYvNbG6P3f4RWObuxwELgX9JHzs3/fgIYD7wL+mvJ0MUT6aYrqEdERmgbM705wH17r7J3duApcCCHvs4MC59fzzQlL6/AFjq7q3u/iZQn/56MgR79nbw1jt7mKEllUVkgLIp/WlAQ8bjxvS2TDcBnzWzRmAFcPkAjpUBauheXVPDOyIyQNmUfm8Lu3iPxxcDP3P3auAc4OdmVpLlsZjZIjOrM7O65ubmLCIVt1hCq2uKyOBkU/qNwPSMx9W8N3zT7QvAMgB3/wNQAVRmeSzufo+717p7bVVVVfbpi1RM6+iLyCBlU/qrgDlmNsvMyul6Y3Z5j33iwMcAzOxwukq/Ob3fQjMbaWazgDnAC7kKX6wakin2G1nGpDHlQUcRkYgp628Hd283s8uAlUApsMTd15rZYqDO3ZcDVwP3mtmVdA3ffM7dHVhrZsuAdUA78FV378jXN1MsYokWpk8ajZmWVBaRgem39AHcfQVdb9Bmbrsh4/464MQ+jr0ZuHkIGaWHWDLFIVPGBh1DRCJIn8iNmI5OpzG5WzN3RGRQVPoR89Y7e2jr6NTMHREZFJV+xHQvtKYPZonIYKj0Iyae7FpSWatrishgqPQjJpZIUVZiHDihIugoIhJBKv2IiSVTTJs4irJS/a8TkYFTc0RMPJHS0I6IDJpKP2LiSa2jLyKDp9KPkB2pvezYvVczd0Rk0FT6ERJLz9zRxVNEZLBU+hHSvaSyhndEZLBU+hESTy+prDdyRWSwVPoREku0ULnfSMaMzGqdPBGRD1DpR4hm7ojIUKn0I0Rz9EVkqFT6EdHa3sHmd/ao9EVkSFT6EdGQ3I27Zu6IyNCo9COie3VNlb6IDIVKPyK619Gv0adxRWQIVPoREUumGF1eSuV+5UFHEZEIU+lHRPfMHTMLOoqIRJhKPyJiSU3XFJGhU+lHQGen64NZIpITKv0I2LKzlbb2Tmom601cERkalX4ExBLp6Zoa3hGRIVLpR0BMq2uKSI5kVfpmNt/M1ptZvZld18vzt5nZK+nbG2a2PeO5joznlucyfLGIJ1KUlhjTJo4KOoqIRFy/a/SaWSlwJ3Am0AisMrPl7r6uex93vzJj/8uB4zK+xG53PzZ3kYtPLJniwAkVjCjVP8xEZGiyaZF5QL
27b3L3NmApsGAf+18MPJCLcNIlnmjRdXFFJCeyKf1pQEPG48b0tg8wsxnALODxjM0VZlZnZs+b2acHnbSIxZMpajRdU0RyIJtLMPX2EVDvY9+FwEPu3pGxrcbdm8zsIOBxM3vV3Te+7w8wWwQsAqipqckiUvF4Z89etqX26k1cEcmJbM70G4HpGY+rgaY+9l1Ij6Edd29K/3cT8CTvH+/v3uced69199qqqqosIhWP7oXWNF1TRHIhm9JfBcwxs1lmVk5XsX9gFo6ZHQpMBP6QsW2imY1M368ETgTW9TxW+hbrXl1TwzsikgP9Du+4e7uZXQasBEqBJe6+1swWA3Xu3v0L4GJgqbtnDv0cDvzEzDrp+gXz3cxZP9K/2Lvr6OuNXBEZumzG9HH3FcCKHttu6PH4pl6O+z1w1BDyFb2GZIrJY8rZb2RW/6tERPZJE79DLpZIMV3j+SKSIyr9kIsltLqmiOSOSj/E2to72bxjt2buiEjOqPRDrHFbik5HSyqLSM6o9EMsnl5dU8M7IpIrKv0Qe7f0NbwjIjmi0g+xWCJFxYgSqsaODDqKiBQIlX6IxRJdF0M36235IxGRgVPph1g82UKNllQWkRxS6YeUuxNPao6+iOSWSj+kmne2smdvp0pfRHJKpR9S3RdD1xIMIpJLKv2QimkdfRHJA5V+SMUTLZQYVE9U6YtI7qj0QyqWTHHA+FGUl+l/kYjkjholpDRzR0TyQaUfUnEtqSwieaDSD6Fdre0kWto0c0dEck6lH0KxRPq6uPo0rojkmEo/hOIJLaksIvmh0g+h7iWVa1T6IpJjKv0QiiVTTBw9gnEVI4KOIiIFRqUfQvH0ksoiIrmm0g+hWLJF18UVkbxQ6YfM3o5Omrbv0Zo7IpIXKv2Qadq+m45O15u4IpIXKv2Q0eqaIpJPWZW+mc03s/VmVm9m1/Xy/G1m9kr69oaZbc947hIz25C+XZLL8IUopumaIpJHZf3tYGalwJ3AmUAjsMrMlrv7uu593P3KjP0vB45L358E3AjUAg68mD52W06/iwIST7RQXlbC1LEVQUcRkQKUzZn+PKDe3Te5exuwFFiwj/0vBh5I3z8beNTdk+mifxSYP5TAhS6Wnq5ZUmJBRxGRApRN6U8DGjIeN6a3fYCZzQBmAY8P9FjpEk+mNJ4vInmTTen3dsrpfey7EHjI3TsGcqyZLTKzOjOra25uziJSYXJ34smUxvNFJG+yKf1GYHrG42qgqY99F/Le0E7Wx7r7Pe5e6+61VVVVWUQqTFt3tZFq69CZvojkTTalvwqYY2azzKycrmJf3nMnMzsUmAj8IWPzSuAsM5toZhOBs9LbpBfxZNeSyjrTF5F86Xf2jru3m9lldJV1KbDE3dea2WKgzt27fwFcDCx1d884Nmlm36brFwfAYndP5vZbKBzdc/RrtI6+iORJv6UP4O4rgBU9tt3Q4/FNfRy7BFgyyHxFJZZIYQbTJ40KOoqIFCh9IjdEGpIpDhhXwciy0qCjiEiBUumHSEwzd0Qkz1T6IRLTOvoikmcq/ZBoaW1n665WZmgdfRHJI5V+SLx7XVyd6YtIHqn0Q6K79GdoTF9E8kilHxLxd9fR1/COiOSPSj8kYskWxlWUMX70iKCjiEgBU+mHRCyR0pu4IpJ3Kv2Q0OqaIjIcVPoh0N7RyV+27dbqmiKSdyr9ENi8Yw/tna6ZOyKSdyr9ENDqmiIyXFT6IRDTOvoiMkxU+iEQT6QoLy1h/3EVQUcRkQKn0g+BWCJF9aRRlJb0dklhEZHcUemHQDyZ0swdERkWKv2AuXtX6euDWSIyDFT6AUu2tLGrtZ3pOtMXkWGg0g9YrHt1TZW+iAwDlX7A3l1dU9M1RWQYqPQD1r2OvoZ3RGQ4qPQDFkuk2H9cBRUjSoOOIiJFQKUfsHiyRZdIFJFho9IPWCyhJZVFZPio9AO0u62DLTtbNXNHRIaNSj9ADdvSq2vqTF9EhklWpW9m881svZnVm9
l1fexzkZmtM7O1ZnZ/xvYOM3slfVueq+CFIPbudE19GldEhkdZfzuYWSlwJ3Am0AisMrPl7r4uY585wPXAie6+zcymZHyJ3e5+bI5zF4RYomtJZQ3viMhwyeZMfx5Q7+6b3L0NWAos6LHP3wN3uvs2AHffktuYhSmeTDF2ZBkTRo8IOoqIFIlsSn8a0JDxuDG9LdMhwCFm9pyZPW9m8zOeqzCzuvT2Tw8xb0HpnrljpiWVRWR49Du8A/TWSN7L15kDnAZUA8+Y2ZHuvh2ocfcmMzsIeNzMXnX3je/7A8wWAYsAampqBvgtRFdDMsVhB4wNOoaIFJFszvQbgekZj6uBpl72+bW773X3N4H1dP0SwN2b0v/dBDwJHNfzD3D3e9y91t1rq6qqBvxNRFFHp9OwLaXr4orIsMqm9FcBc8xslpmVAwuBnrNwfgWcDmBmlXQN92wys4lmNjJj+4nAOoTNO3azt8O10JqIDKt+h3fcvd3MLgNWAqXAEndfa2aLgTp3X55+7iwzWwd0ANe4e8LMTgB+YmaddP2C+W7mrJ9i1r26ppZgEJHhlM2YPu6+AljRY9sNGfcduCp9y9zn98BRQ49ZeLrX0Vfpi8hwyqr0o2Dnnr3c+Ou1QcfI2mtv7WREqXHghFFBRxGRIlIwpd/e4ayKJYOOMSDnHTON0hJN1xSR4VMwpT9xTDnPfOOMoGOIiISaFlwTESkiKn0RkSKi0hcRKSIqfRGRIqLSFxEpIip9EZEiotIXESkiKn0RkSJiXcvmhIeZNQOxoHP0UAlsDTrEAEQpb5SyQrTyRikrRCtvGLPOcPd+16YPXemHkZnVuXtt0DmyFaW8UcoK0cobpawQrbxRytqThndERIqISl9EpIio9LNzT9ABBihKeaOUFaKVN0pZIVp5o5T1fTSmLyJSRHSmLyJSRFT6IiJFRKUvIlJEVPpDZGafNrN7zezXZnZW0Hl6MrMxZvZ/0xn/Nug8/Qn769lT+vV90czODTpLf8ysxMxuNrMfm9klQefZFzOrMbPlZrbEzK4LOk9fzOwgM/s3M3soY1uof+aKuvTTf6G2mNmfemyfb2brzay+v79w7v4rd/974HPAX+cxbma+geS+AHgonfG84cjX00DyBvF6DjZr2rXAsuFN+b5cA8m7AJgG7AUaQ571EOARd/+fwNyw5nT3Te7+hR5fIvCfuX1y96K9AacAxwN/ythWCmwEDgLKgdV0/aU7CvhNj9uUjON+CBwfwtzXA8em97k/7K9zEK/nEF7bjwML6foFdW7YX1vgOuBL6X0eCnnWycATwOPA58OaM+P5hzLuB/4zt69bwVwYfTDc/Wkzm9lj8zyg3t03AZjZUmCBu38H+MA/4c3MgO8C/8/dX8pv4i4DyU3XGV018AoB/ctuIHnN7DWG+fXMNMDXdj9gDF0ltdvMVrh75zDGHWjeBqAtvU/HcGXsNsCse4Eb08c8BPw0pDnX9fIlAv+Z25fQBQqBaXT9cHRrTG/ry+V0nfFdaGaX5jNYP/rK/QvgM2Z2F/BfQQTrQ195w/J6Zuo1q7t/y92/BtwP3Dvchb8P+/q7cLaZ/Rh4Oohgvegr62+BK8zsbuDPAeTqqdecZjY5nfE4M7s+/VxYf+YAivtMvw/Wy7Y+P8Hm7ncAd+QvTtZ6ze3uLcDnhztMFvrKG5bXM9M+/064+8+GL0pW+nptU0DP8eeg9ZX1T8CFwx1mH/rKmQAu7bExrD9zgM70e9MITM94XA00BZRlIKKWO0p5o5QVopU3KlmjkrNfKv0PWgXMMbNZZlZO1xt1ywPOlI2o5Y5S3ihlhWjljUrWqOTsX9DvJAd5Ax4ANvPeFLYvpLefA7xB17v13wo6Z9RzRylvlLJGLW9UskYl52BvWnBNRKSIaHhHRKSIqPRFRIqISl9EpIio9EVEiohKX0SkiKj0RUSKiEpfRKSIqPRFRIqISl9EpIj8f49vzzZqvLM3AAAAAElFTkSuQmCC\n", 
"text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Acc. 10-fold cross on train data= 0.9189710144927539\n", "\n", "Confusion matrix on test set:\n", " [[31 7]\n", " [ 0 68]]\n", "\n", "Accuracy on test set: 0.9339622641509434\n", "\n", "Best combination of parameters found: {'C': 100.0}\n", "\n", "Number of supports: 71 ( 19 of them have slacks)\n", "Prop. of supports: 0.2897959183673469\n" ] } ], "source": [ "Cs = np.logspace(-3, 11, num=15, base=10.0)\n", "\n", "param_grid = {'C': Cs}\n", "grid_search = GridSearchCV(SVC(kernel='poly',degree =2) , param_grid, cv=10)\n", "grid_search.fit(X_train,y_train)\n", "\n", "scores = grid_search.cv_results_['mean_test_score']\n", "\n", "plt.semilogx(Cs,scores)\n", "plt.show()\n", "\n", "parval=grid_search.best_params_\n", "\n", "cvacc = cross_val_score(SVC(kernel='poly',degree =2,C=parval['C']) , X=X_train, y=y_train, cv=10, scoring='accuracy')\n", "print('Acc. 10-fold cross on train data= ', cvacc.mean())\n", "\n", "\n", "knc = SVC(kernel='poly',degree =2,C=parval['C']) \n", "knc.fit(X_train, y_train)\n", "pred=knc.predict(X_test)\n", "print(\"\\nConfusion matrix on test set:\\n\",sklearn.metrics.confusion_matrix(y_test, pred))\n", "print(\"\\nAccuracy on test set: \",sklearn.metrics.accuracy_score(y_test, pred))\n", "print(\"\\nBest combination of parameters found: \",parval)\n", "print(\"\\nNumber of supports: \",np.sum(knc.n_support_), \"(\",np.sum(np.abs(knc.dual_coef_)==parval['C']) ,\"of them have slacks)\")\n", "print(\"Prop. of supports: \",np.sum(knc.n_support_)/X_train.shape[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Best result so far. Competitive with meta-methods. Now the best C value found is 100 and accuracy is a lot higher than with default parameters. It's always important when working with SVMs to find best parameters. \n", "\n", "For this C value we have a nice accuracy on the test set. 
Let's try what happens now with a polynomial kernel of degree 3." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Confusion matrix on test set:\n", " [[ 1 37]\n", " [ 0 68]]\n", "\n", "Accuracy on test set: 0.6509433962264151\n" ] } ], "source": [ "knc = SVC(kernel='poly',degree =3) \n", "knc.fit(X_train, y_train)\n", "pred=knc.predict(X_test)\n", "print(\"Confusion matrix on test set:\\n\",sklearn.metrics.confusion_matrix(y_test, pred))\n", "print(\"\\nAccuracy on test set: \",sklearn.metrics.accuracy_score(y_test, pred))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Very bad results! That's because, again, we didn't use the optimal value for the C parameter but the default one. Let's find the best parameter." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Acc. 10-fold cross on train data= 0.9106231884057971\n", "\n", "Confusion matrix on test set:\n", " [[31 7]\n", " [ 1 67]]\n", "\n", "Best combination of parameters found: {'C': 1000.0}\n", "\n", "Accuracy on test set: 0.9245283018867925\n", "\n", "Number of supports: 65 ( 13 of them have slacks)\n", "Prop. of supports: 0.2653061224489796\n" ] } ], "source": [ "Cs = np.logspace(-3, 11, num=15, base=10.0)\n", "\n", "param_grid = {'C': Cs}\n", "grid_search = GridSearchCV(SVC(kernel='poly',degree =3) , param_grid, cv=10)\n", "grid_search.fit(X_train,y_train)\n", "\n", "scores = grid_search.cv_results_['mean_test_score']\n", "\n", "plt.semilogx(Cs,scores)\n", "plt.show()\n", "\n", "parval=grid_search.best_params_\n", "\n", "cvacc = cross_val_score(SVC(kernel='poly',degree =3,C=parval['C']) , X=X_train, y=y_train, cv=10, scoring='accuracy')\n", "print('Acc. 10-fold cross on train data= ', cvacc.mean())\n", "\n", "knc = SVC(kernel='poly',degree =3,C=parval['C']) \n", "knc.fit(X_train, y_train)\n", "pred=knc.predict(X_test)\n", "print(\"\\nConfusion matrix on test set:\\n\",sklearn.metrics.confusion_matrix(y_test, pred))\n", "print(\"\\nBest combination of parameters found: \",parval)\n", "print(\"\\nAccuracy on test set: \",sklearn.metrics.accuracy_score(y_test, pred))\n", "print(\"\\nNumber of supports: \",np.sum(knc.n_support_), \"(\",np.sum(np.abs(knc.dual_coef_)==parval['C']) ,\"of them have slacks)\")\n", "print(\"Prop. of supports: \",np.sum(knc.n_support_)/X_train.shape[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Confusion matrix shows only 1 more error than the confusion matrix obtained with the quadratic kernel. Given the low number of cases we have in the test set, we cannot conclude that polynomial kernel with degree 2 is better than with degree 3. 
But remember that when there are several classifiers with a similar performance, we should always choose the simpler one! So we will choose the quadratic polynomial kernel as the best polynomial kernel, not because of performance only but because it is the better combination of performance and simplicity.\n", "\n", "## RBF Kernel\n", "\n", "There's another possibility for the kernel: The RBF kernel. This is the default kernel in the implementation of SVMs in sklearn, so we don't need to explicitly specify the kernel used. Let's try it with default parameters." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Confusion matrix on test set:\n", " [[30 8]\n", " [ 0 68]]\n", "\n", "Accuracy on test set: 0.9245283018867925\n" ] } ], "source": [ "knc = SVC() \n", "knc.fit(X_train, y_train)\n", "pred=knc.predict(X_test)\n", "print(\"Confusion matrix on test set:\\n\",sklearn.metrics.confusion_matrix(y_test, pred))\n", "print(\"\\nAccuracy on test set: \",sklearn.metrics.accuracy_score(y_test, pred))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Promising performance for default parameters. But we have to search for the best parameters. In this case we have two parameters to adjust: the C parameter and the gamma parameter. We will find the best combination using the *GridSearch* method." 
] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAT4AAAEECAYAAACx2Vj7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHcBJREFUeJzt3Xu0JWV95vHvQ2PTXAQaWxFBLiqoiIrSIoqJMcpFJxEvYwI6AV0alom4RuMkalYiDJoJcSVLZYIokh7URFuXOrHNdOgwijoZBbtRBGnCrVVsRRFBRiO3PueZP6o27N5n73Pq7LPrnKpTz2etWr13XX5VB7p/561633p/sk1ERJfsstQXEBGx2JL4IqJzkvgionOS+CKic5L4IqJzkvgionOS+CKic5L4IqJzkvgionOS+KIySRct9TVETMKuS30B0SyS9hu1CXjJYl5LRF2S+GLQT4HvUyS6HpffH7UkVxQxYUl8MWgb8ELbtw5ukPSDJbieiInLM74Y9H5g9Yht713MC4moizItVUR0TW51l4CkR9j+2VJfxyiSngScAhxI8XzvR8AG29fXdL7X2f4fdcSOGCa3ujWTdJ6kNeXntZK2AVdK+r6k5y8g7sl9n/eR9HeSrpH0CUn7LyDu24H1FJ0Z3wA2l58/Kekd48adw3+tKW7EULnVrZmka20/tfx8OfAntjdLOgL4hO21Y8b9pu1nlp8vBn4MfAR4BfB82y8bM+6NwFNsPzCwfiVwne3Dx4x7zahNwBG2dxsnbsQ4cqtbv4dJ2tX2DmB325sBbN8oaVL/2NfaPrr8/D5JZywg1jTwGIohLf0OKLeNa3/gJOCugfUCvraAuBHzlsRXvwuAjZLOAy6V9H7gc8ALgasXEPdRkv6IInHsLUl+qPm+kEcYbwG+KOkmoDd85WDgCcBZC4j7T8Betmf8zJK+vIC4EfOWW91FIOkFwBuBIyh+2WwH/hFYN3hLOY+YZw+s+qDtn0p6NPBe26cv4Hp3AY6l6NxQeb2bbU+NGzOiSZL4YgZJ4qHE1+vV/YYX+JelrrhznPNJtv+trvjRTkl8i0DSScDL2Pkf/OdtX9q0uJJOBD4I3AT8sFx9EMWt7h/a/pcmxa1w3lttH1xH7GivJL6alc/0jgA+RnHLCMU/+NOBm2z/54bFvR54se3vDaw/DNho+8lNilvGOH/UJuAM23uPGzuWpyS+mkm60fYRQ9YLuHEBw0PqinsT8OSyF7p//Upgq+0nNCluGeMXwNuA+4Zs/hvba8aNHctTenXrd6+kY21/Y2D9s4B7Gxh3HbBZ0noe6tV9LHAq8HcNjAvFIOvv2J4xLEbSOQuMHSOc9II9/bM7q/V3XXXNfZtsnzz3nosjLb6aSXomcCHwcB66JX0s8P8onm1d1aS4ZewjgZeyc6/uBttbx41Zc9z9gHtt/2ohcWJ+jnn6Kl+56aBK+z7sgFuuGnewfh3S4quZ7W8Czy6HmTz4D972j5sYt4y9FdhaJhTbHhx03LS4d/Y+Tzp2zMZMeSFj2pdO3tVdJLZ/bPsq21soxvQ1Mq6kgyWtl3Q7cCXwDUm3l+sObVrcgdg/LWNvnlTsWc55bR1x28TANK60NE1afEvjpcA5DY37KYo5+V7TG7AsaQXwKorJC45rWNzaYkt6xahNwKPHibncTC/oLcalk8S3NDT3LksWd43tT/WvKJPJeknvbmDcOmN/CvgHGNpkWbWAuMuCMVMt7SNI4lsaxzQ47lWSPgh8lJ17X88AvtXAuHXGvgb4a9vfGdwg6UULiLtsNPE2tookvkVSTkN1IbC/7aMkPQ14qe33NCzu6cDrKebI63Wa/AD4AgsbdlJX3Dpjv4Wil3yYly8gbm0k7Q
O8k+KNnkeWq28HPg+cZ/vnkzqXgamWJr4MZ1kkkr4C/DHwYdvPKNd9x/ZRTYwb7SRpE/Al4KO9Hv6y5/8M4EW2T5jUuZ7+9JXetLHa2PADDrqtUcNZ0qu7ePYYMth4x9A9mxF3Bkm/1aa4dcau85oX6FDbf9U/rKns+f8riunFJmq64tI0SXyL5w5Jj6d8UC7pPwK3NTjuMM9qWdw6Y9d5zQvxfUl/0l9+QNL+ZUmBiZYHNWaq4tI0eca3eN4EXAQ8SdIPge8Cr2liXI0uNjQ4B2Aj4tYZu85rrsnvAu8AviKpVwD+J8AGiuE9k2OYal5OqyQtvkVie5vtF1E8cH6S7ecBv9m0uKqp2FBdcdt6zXOc93XjHmv7Lttvt/0k2/uVy5Ntv52iw2NiigHM7bzVTefGEqprrriFxFV9xYZqidvWa57jvI37ezHMU5+20p+r2LlxxGOb1bmRW92aafbqYgspA1lLXOorNlRX3Dpj13bNLfx7MYOB6Za2m5L46ldXdbG64tZVbKiuuHXGrvOa2/b3YgYD97f0aVkSX/3qqi5WS1zbl5aDoidabKiuuG29Zlr292KUadf19mW98owvIsZy5NNW+u//qdpcDccc8oM844uI9jNiqqW3uu286paTdGbi1hu7bXHrjF3nNU9blZamSeJbGnX9RWxb3Dpjty1unbFriVtMUqBKS9PkVjcixiSm3M62UxLfhKzUbl7FnpX2XcUe7K39Jt6r1La4dcaeb9yHH1ltWN4+B+zOgU/Zt3LcX2ytnhia8N/iXv6d+31fpSaagQdYsaBrWypJfBOyij15tl641JcRY3r+p+6pJe5XnrZ7LXHrcqW/WHlfOy2+iOig6QY+v6siiS8ixlJ0bqTFFxGdklvdiOiYYlqqJL6I6BAj7nd6dSOiY6ZbeqvbzqteAEknS7pB0s3DZtGV9OuSvilpR1m/IiKG6HVuVFmapnlXVCNJK4ALgBcDRwKnSTpyYLdbgdcCn1jcq4toFyOmXG2pokKj5BBJX5R0jaQvSzqob9sZkm4qlzPmOlenEh/FvGo3l3Uq7qeop3BK/w62v2f7GppZKiCiUabZpdIyl4qNkr8GPmb7acC5wF+Wx+4HnA08m+Lf+NmSVs92vq4lvgPZucTe9nJdRMyTDVPepdJSwZyNEoqE2Hu15PK+7ScBl9m+0/ZdwGXAybOdrGuJb1ibe+x3IyWdKWmLpC0PcN8CLiuijcR0xQVY0/u3Ui6DM8ZUaZR8G3hl+fnlwMMlPaLisTvpWq/uduCxfd8PoqiTOhbbF1HUtK3tJf6IpjJwvyunkDvmmIG5SqPkvwB/K+m1wFeBHwI7Kh67k64lvs3A4ZIOo/iPdirw6qW9pIh2MhOdZHTORontHwGvAJC0F/BK23dL2g78xsCxX57tZJ261bW9g6I61ibgeuDTtq+TdK6klwJIelb5H/JVwIclXbd0VxzRbBMczvJgo6SsW3wqsKF/B0lrJPWCvRNYV37eBJwoaXXZqXFiuW6krrX4sL0R2Diw7l19nzdT/MaIiFkUdXUn03ayvUNSr1GyAljXa5QAW2xvoGjV/aUkU9zqvqk89k5J76ZIngDn2r5ztvN1LvFFxKRMdlr5Co2SzwCfGXHsOh5qAc4piS8ixjLJFt9iS+KLiLE1sZBQFUl8ETEWWzww3c4U0s6rjoglV8zHlxZfRK32+ddH1Bb7bY+4tJa4p32/vjd63njI82qLXU1mYI6Ijik6N9Lii4iOaeJce1Uk8UXEWCb8ytqiSuKLiLGl2FBEdIoND0wn8UVEhxS3ukl8EdExbX1zo53pegEkrZN0u6TvjNguSeeXBU+ukfTMxb7GiDboDWepsjRN5xIfcAmzz8f/YuDwcjkTuHARrimihYpb3SpL0zTvimpm+6vAbHN1nUJRycm2rwD2lXTA4lxdRLvMo+ZGo+QZ30yjCpfctjSXE9FMRZW15iW1KpL4ZqpcuKSsFHUmwCr2qPOaIhrHiB
3TK5b6MsaSxDdT5UpsqbIWXdfE29gqOveMr4INwOll7+5xwN22c5sbMaDNvbqda/FJ+iRF0ZI1ZTW1s4GHAdj+EMWc/y8BbgZ+Bbxuaa40ovma2GNbRecSn+3T5thuyupNETGLhrbmquhc4ouIycgMzBHRSWnxRUSnGNiR2VkioksyEWlEdFKe8UVEtzjP+CIetPtX9q8l7gcO/mwtcQF20161xN1vxY5a4gLssmrVxGPq3uqJbNJV1iSdDHwAWAFcbPu8ge0HAx8F9i33eYftjZIOBa4Hbih3vcL2G2c7VxJfRIxtUolP0grgAuAEitdGN0vaYHtr325/Bnza9oWSjqR42eDQctstto+uer4kvogYixFTk+vVPRa42fY2AEnrKaaI6098BvYuP+/DiHfoq2hnX3RENMIE5+MbNR1cv3OA/1S+aroReHPftsMkfUvSVyT92lwnS+KLiLHY85qkYI2kLX3LmQPhqkwHdxpwie2DKN6n/7ikXSjmyjzY9jOAPwI+IWlvZpFb3YgYm6s/47vD9tpZtleZDu71lGUjbH9d0ipgje3bgfvK9VdJugU4Atgy6mRp8UXEmKq19ip2gGwGDpd0mKSVwKkUU8T1uxV4IYCkJwOrgJ9KemTZOYKkx1HUy9k228nS4ouIsc2jxTdHHO+QdBawiWKoyjrb10k6F9hiewPwNuAjkt5KcRv8WtuW9OvAuZJ2AFPAG23PVldn+SY+SeuA3wJut31UuW4/4FMUXeDfA37H9l1Djj2Dousc4D22P7oY1xzRJpMex2d7I0WnRf+6d/V93gocP+S4zwLzGuS5nG91L2FmGcl3AF+0fTjwxfL7TsrkeDbwbIou9rMlra73UiNaqCw2VGVpmmWb+EaUkTyFYuQ35Z8vG3LoScBltu8sW4OXMXsd3ohOMsWtbpWlaZbtre4I+/fqZ9i+TdKjhuxTZTwRkCpr0XWZnWU5qVxeMlXWouvc0r/1y/ZWd4SfSDoAoPzz9iH7VC4vGdF1bb3V7Vri2wCcUX4+A/j8kH02ASdKWl12apxYrouIPnYSX+OUZSS/DjxR0nZJrwfOA06QdBPFLBDnlfuulXQxQDn+590UAyo3A+fONSYooqtSV7dhZikj+cIh+24B3tD3fR2wrqZLi1g2pqebl9SqWLaJLyLqZZp5G1tFEl9EjK2lnbpJfBExJk/uXd3FlsQXEeNraZMviS8ixpYWX7ROXdXQHrnql7XErdPd0/cs9SW0Ulvf3Ejii4ix2ODJFRtaVEl8ETG2tPgionuS+CKiWzKAOSK6KC2+iOiUDGCOiE5qaYuvnX3RJUnrJN0u6Tt96/aTdJmkm8o/V5frJel8STdLukbSM0fEPEbSteV+50tq56+0iMVgVVsaptWJj/lVUnsxRaHhwynqZFw4IuaF5fbevik0FDGKKy4N0+rEN89KaqcAH3PhCmDf3jT0PeX3vW1/3baBjzG8EltEmLT4GmSnSmpAr5JaleppB5brZ9snIkrF9PNzL03Tpc6NKtXTKldYg5SXjGjibWwVy7HFN6qSWpXqadvL9bPt8yDbF9lea3vtw9htwRce0Tq51W2MUZXUNgCnl727xwF3926Je8rvv5B0XNmbezrDK7FFhEHT1ZamaXXim08lNWAjsA24GfgI8Id9ca7uC/sHwMXlfrcA/1z3zxHRThVbexVbfJJOlnRDOZTsHUO2HyzpcknfKoekvaRv2zvL426QdNJc52r1M755VlIz8KYRcY7u+7wFOGoiFxix3E3oGZ+kFcAFFI2V7cBmSRtsb+3b7c+AT9u+UNKRFI2ZQ8vPpwJPAR4D/G9JR9ieGnW+Vrf4ImKJTW4c37HAzba32b4fWE8xBG3wbHuXn/fhoefvpwDrbd9n+7sUd2vHznayVrf4ImKJVW/xrZG0pe/7RbYv6vs+bLjZswdinAP8i6Q3A3sCL+o79oqBY2cdhpbEFxHj6Q1gruYO22tn2V5lKNlpwCW2/0bSc4
CPSzqq4rE7SeKLiLFpcuP4qgw3ez3lK6S2vy5pFbCm4rE7mfUZn6QnSDp+yPpfk/T42Y6NiA6Y3DO+zcDhkg6TtJKis2LDwD63UnZcSnoysAr4abnfqZJ2k3QYxTv235jtZHO1+N4P/OmQ9feU2357juNjgeqqhAawcsWOWuIev89NtcS9c3pFLXEB9kg331gm1eKzvUPSWcAmYAWwzvZ1ks4FttjeALwN+Iikt1Kk09eWozWuk/RpYCuwA3jTbD26MHfiO9T2NUMucoukQ+f5s0XEcjPBtzJsb6QYotK/7l19n7cCM+5Ay21/AfxF1XPNlfhWzbJt96oniYhlqKFTTlUxVwN/s6TfH1xZviFxVT2XFBGt0dL5+OZq8b0F+J+SXsNDiW4tsBJ4eZ0XFhHNN8Fe3UU1a+Kz/RPguZJewEOvcf0v21+q/coiovmWY+LrsX05cHnN1xIRLSI3c+aVKjKAOSLG18C59qpoxeilSVVTk3RGuf9Nks4Yca6hcSNiiJZ2brQi8TGBamqS9gPOpnjx+Vjg7BFJbVTciBggV1uaphWJb0LV1E4CLrN9p+27gMsYXjpyVNyIGNTSFl+bn/HtVE1N0lzV1KpUWZstbkT0a2hrroo2J75RRk1RM++pa+Y8UaqsRde1NPG14lZ3hPlWU6s6dc2ouDOkylp0XYoNLb75VlPbBJwoaXXZqXFiua5q3IhYJlqR+CZRTc32ncC7Keb92gycW65D0sWSerPDjoobEYPSuVGfCVZTWwesG7L+DX2ffzYsbkQMSOdGRHRSEl9EdE4SX0R0icitbkR0TWZniYhOSosvIjonia/bdnniruz+kcmXgqyrBCTAEXuNfCllQR6z6121xI3myTO+iOieJL6I6JSGvpVRRRJfRIwtvboR0Tl5xhcR3dPSxNeK2VkiooGqzsxSMTlKOlnSDWWhsBm1biS9T9LV5XKjpJ/3bZvq27ZhrnOlxRcRYxHDpzUfK5a0AriAYiq47cBmSRtsb+3tY/utffu/GXhGX4h7bB9d9XyNavHVXUZS0jGSri2POV/SjP9vs8WNiAGTa/EdC9xse5vt+4H1FIW/RjkN+OS4l92oxEf9ZSQvLPftHTesytrQuBEx0wTLS1YtBoakQ4DDgC/1rV4laYukKyTNWRmxUYmvzjKS5ba9bX+9nKz0YwwvHTkqbkQMmq64wJoyMfWWMwcizacY2KnAZ2xP9a072PZa4NXA+yU9frbLbsMzvkmVkTyw/Dy4ftCo428b3LG/ytru++9V/SeKWA7mNwPzHWViGqVqMTAoEt9Os6zb/lH55zZJX6Z4/nfLqJM1qsU3T/MtI1n1N0rl3zz9VdZW7rv7yAuNWLYm94xvM3C4pMMkraRIbjN6ZyU9EVhNUYOnt261pN3Kz2uA44Gtg8f2a0Pim1QZye3l58H1g+bzmyei0yb1jM/2DuAsisqH1wOftn2dpHMlvbRv19OA9eXjqp4nA1skfRu4HDivvzd4mDbc6vbKPZ7HzDKSZ0laT9GRcXd5K7wJ+G99HRonAu+0faekX5QlJ68ETgf++4jzzYhb1w8X0WoTHMBseyNFlcT+de8a+H7OkOO+Bjx1PudqVOIry0j+BsWD0O0UvbPnAZ8uS0reCryq3H0j8BKKMpK/Al4HRRlJSb0yktBXRhL4A4qe492Bfy4XJL2xPPZDo+JGxEx5ZW0CFqGM5BbgqCHrP1QlbkT0yewsEdE1IrOzREQXpcUXEV0jtzPzJfFFxHjyjC8iuii9uh0nXGtFtDrsscv9S30J0XZJfBHRNWnxRUS3OMNZIqKL0uKLiC4RudWNiC7KOL6I6Jq0+CKiW1o8gHlJJiJdqmpqo84x5PqGxo2InWm62tI0SzUD8yUsTTW1Ued40BxxI6JPEt88LGE1tVHn6Dc07gJ/5IjlxxSdG1WWhmnSM77FqKY26hz9Ktf3jOi6dG7Up65qavM938wdU14yuq6lia9JVdYWo5raqHP0q1
xlrb+85G77rprzB4xYTnoDmCdRZW2xNSnx9aqpwcxqaqeXvbvH8VDVs03AiWVNzdUU1dQ2ldt+Iem4sjf39IFYw87Rb2jcif6kEctB1ed7ecZXWKpqaqPOIWkt8Ebbb5gjbkT0aWKPbRVLkviWsJraz0acYwvwhrniRsTOmngbW0UbOjciookMTLcz8yXxRcT42pn3kvgiYny51Y2I7mlgj20VTRrOEhEtM8lxfJJOlnRDObnIsPfo3yfp6nK5UdLP+7bNa2KRtPgiYiwyaEKdG5JWABcAJ1C8RLBZ0gbbW3v72H5r3/5vBp5Rfu5NLLKW4qnjVeWxd406XxLfhOwis2rFAxOPu/eu9008Zs/jdhv24krEPExuHN+xwM22twFIWk8xqcjWEfufRpHsoG9ikfLY3sQinxx1siS+iBibqj/jWyNpS9/3i2xf1Pd92OQgzx56TukQ4DDgS7McO+vEIkl8ETGe+c3AfIfttbNsn8/kIqcCn7E9NcaxQDo3ImJsE31Xt/LkIBSJr/82dj7HAkl8EbEAE+zV3QwcLukwSSspktuGGeeTngisBr7et3reE4vkVjcixjehcXy2d0g6iyJhrQDW2b5O0rnAFtu9JHgasL58h7937LwnFknii4jxGDQ1uQHMtjdSzMbUv+5dA9/PGXHsvCYWqe1Wt2mV1GY7x8B1D40bEUO44tIwdT7ju4RmVVIbeo4hRsWNiAGyKy1NU1via2AltVHneNAccSNiUEtnYF7sXt2dqpwBtVdSq3COfrPFjYh+pnhzo8rSME3p3FiMSmpVjplX3P4qa3s8es85Th+xvIhm3sZWsdgtvqWspFZlkONscWfor7K2KlXWootyq1vJUlZSG3WOB80RNyL6GZhytaVharvVbVoltVHnKK/1attHzxE3Iga09Va3tsTXwEpqs53j6L7PQ+NGxBBJfBHRLc18fldFEl9EjMck8UVEBzVwjF4VSXwRMbZ0bkREtxiYameTL4kvIsaUzo3Ou/PffnbHPxy37vsVd18D3FHDZcwr7qjpaRYad54a8d+iAXHHiP33dcQ9pPr5SeLrOtuPrLqvpC1zFF4ZS9vi1hm7bXHrjF3nNSfxRUS3GJhQQfHFlsQXEWMyOJ0bUd1Fc+/Sibh1xm5b3Dpj1xO3xb26ckvv0SNiae2zcn8/d/9TK+176fbzr6rtOeMY0uKL2kn6c+A1FDNg3wFcBdxNMYnrSooZc37P9q8kXQLcAzyJoofxdRTTiz0HuNL2a8uYvwQuAF4E3AX8KfBe4GDgLbY3SDoU+DjQmyX2LNtfq/en7ZiWNpxSUDxqJWkt8ErgGcArgN5v/c/ZfpbtpwPXA6/vO2w18JvAW4EvAO8DngI8VVJvJp09gS/bPgb4BfAe4ATg5cC55T63AyfYfibwu8D5tfyQnVVxEtIGJse0+KJuzwM+b/seAElfKNcfJek9wL7AXhSTzvZ8wbYlXQv8xPa15bHXAYcCVwP3A5eW+18L3Gf7gfKYQ8v1DwP+tkyWU8AR9fyIHWVgup3P+JL4om6j6hJfArzM9rclvZZi0tqe+8o/p/s+9773/s4+4IceUD+4n+1pSb193gr8BHg6xd3NvWP/FDFcA1tzVeRWN+r2r8BvS1olaS/gP5TrHw7cJulhFM//6rAPcJvtaeD3gBU1nae7cqsbMZPtzZI2AN8Gvg9soejY+HPgynLdtRSJcNI+CHxW0quAy4F/r+Ec3WXjqamlvoqxZDhL1E7SXrZ/KWkP4KvAmba/udTXFQuzz66P9HP2flmlfTfddfGcw1kknQx8gKJlfrHt84bs8zvAORRPGL9t+9Xl+imKX6AAt9p+6WznSosvFsNFko4EVgEfTdJbRibUcJK0gmJ40gkUZV43S9pge2vfPocD7wSOt32XpEf1hbinv3bOXJL4ona938qxzNiT7NU9FrjZ9jYASeuBU4Ctffv8PnCB7buK0/v2GVEqSudGRIxvcp0bB1IMcO/ZXq
7rdwRwhKT/K+mK8ta4Z5WkLeX6Oe+/0+KLiLG5eotvjaQtfd8vst3/DvGwYU+DGXNX4HCKoU8HAf9H0lG2fw4cbPtHkh4HfEnStbZvGXUxSXwRMaZ5DVW5Y47Oje3AY/u+HwT8aMg+V9h+APiupBsoEuFm2z8CsL1N0pcp3hQamfhyqxsR4zEwNVVtmdtm4HBJh0laCZwKbBjY5x+BFwBIWkNx67tN0mpJu/WtP56dnw3OkBZfRIzFgCc0EantHZLOonh1cQWwzvZ1ks4FttjeUG47UdJWilcQ/9j2zyQ9F/iwpGmKxtx5/b3Bw2QcX0SMZW/t5+N2PbHSvpft+FSmpYqI5WFSLb7FlhZfRIxF0qUUFdyquMP2yXPvtjiS+CKic9KrGxGdk8QXEZ2TxBcRnZPEFxGdk8QXEZ2TxBcRnZPEFxGdk8QXEZ2TxBcRnfP/ARQGDWLLIZuiAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Best combination of parameters found: {'C': 10.0, 'gamma': 0.1}\n", "\n", "Acc. 10-fold cross on train data= 0.9469710144927538\n" ] } ], "source": [ "# Values we will test for each parameter. When observing the results, consider the limits of the\n", "# values tested and widen them if necessary (e.g. when the best combination lies on the border of the grid).\n", "gammas = [0.000001,0.00001, 0.0001,0.001,0.01,0.1,1,10]\n", "Cs = np.logspace(-1, 6, num=8, base=10.0)\n", "\n", "param_grid = {'C': Cs, 'gamma' : gammas}\n", "grid_search = GridSearchCV(SVC(), param_grid, cv=10)\n", "grid_search.fit(X_train,y_train)\n", "parval=grid_search.best_params_\n", "\n", "# Show in a grid the mean cross-validation accuracy for each combination of parameters tested.\n", "# GridSearchCV enumerates the candidates with the parameter names sorted alphabetically and the last\n", "# name varying fastest, so the flat score vector reshapes to one row per C and one column per gamma.\n", "scores = np.array(grid_search.cv_results_['mean_test_score']).reshape(len(Cs), len(gammas))\n", "\n", "plt.matshow(scores)\n", "plt.xlabel('gamma')\n", "plt.ylabel('C')\n", "plt.colorbar()\n", "plt.xticks(np.arange(len(param_grid['gamma'])), param_grid['gamma'],rotation='vertical')\n", "plt.yticks(np.arange(len(param_grid['C'])), param_grid['C'])\n", "plt.show()\n", "print(\"\\nBest combination of parameters found: \",parval)\n", "\n", "# Cross-validated accuracy of the selected combination, measured on the training data only\n", "cvacc = cross_val_score(SVC(C=parval['C'], gamma=parval['gamma']) , X=X_train, y=y_train, cv=10, scoring='accuracy')\n", "print('\\nAcc. 10-fold cross on train data= ', cvacc.mean())\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This picture shows, for each combination of parameters, the accuracy obtained in a 10-fold cross-validation. Notice the relation between C and gamma. \n", "\n", "Let's see the performance of the best parameters found on the test set." 
] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Confusion matrix on test set:\n", " [[34 4]\n", " [ 0 68]]\n", "\n", "Accuracy on test set: 0.9622641509433962\n", "\n", "Number of supports: 61 ( 9 of them have slacks)\n", "Prop. of supports: 0.24897959183673468\n" ] } ], "source": [ "# Let's apply the best combination of parameters found to the test set\n", "\n", "knc = SVC(C=parval['C'], gamma=parval['gamma']) \n", "knc.fit(X_train, y_train)\n", "pred=knc.predict(X_test)\n", "\n", "print(\"Confusion matrix on test set:\\n\",sklearn.metrics.confusion_matrix(y_test, pred))\n", "print(\"\\nAccuracy on test set: \",sklearn.metrics.accuracy_score(y_test, pred))\n", "\n", "# dual_coef_ holds y_i * alpha_i for every support vector. Support vectors whose |alpha_i| reaches the\n", "# upper bound C are the bounded ones, i.e. those that may violate the margin (non-zero slack).\n", "# Compare with a tolerance instead of ==, because exact equality between floats is fragile.\n", "n_supports = np.sum(knc.n_support_)\n", "n_bounded = np.sum(np.isclose(np.abs(knc.dual_coef_), parval['C']))\n", "print(\"\\nNumber of supports: \",n_supports, \"(\",n_bounded ,\"of them have slacks)\")\n", "print(\"Prop. of supports: \",n_supports/X_train.shape[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Wow. Best performance obtained so far using any method, even better than results obtained with meta-methods. \n", "\n", "**Observations about the number and percentage of support vectors:** \n", "It is known that the percentage of supports of an SVM is an upper bound for the leave-one-out error. In general, an SVM with a lot of supports will be an overfitted SVM. A percentage of supports higher than 50% should be considered suspicious. If this happens, try to use other kernels. As a rule of thumb, a good SVM has a percentage of support vectors of about 20-40% of the data (but that depends on a lot of things).\n", "\n", "In our case all SVMs have a low number of supports. And notice that the machine with a higher performance is the one with a lower number of supports (24.9%). That's not a coincidence but something common in SVMs." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Appendix: Performance of meta-methods" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Confusion matrix on test set:\n", " [[34 4]\n", " [ 1 67]]\n", "\n", "Accuracy on test set: 0.9528301886792453\n" ] } ], "source": [ "from sklearn.ensemble import ExtraTreesClassifier\n", "from sklearn.model_selection import cross_val_score\n", "\n", "# Extremely randomized trees, fitted on the same training split and scored on the same test split as the SVM\n", "clf = ExtraTreesClassifier(n_estimators=200, random_state=1)\n", "clf.fit(X_train, y_train)\n", "pred = clf.predict(X_test)\n", "print(\"Confusion matrix on test set:\\n\",sklearn.metrics.confusion_matrix(y_test, pred))\n", "print(\"\\nAccuracy on test set: \",sklearn.metrics.accuracy_score(y_test, pred))\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Confusion matrix on test set:\n", " [[34 4]\n", " [ 2 66]]\n", "\n", "Accuracy on test set: 0.9433962264150944\n" ] } ], "source": [ "from sklearn.ensemble import AdaBoostClassifier\n", "\n", "# AdaBoost with 200 boosting rounds, evaluated on the same train/test split\n", "clf = AdaBoostClassifier(n_estimators=200, random_state=1)\n", "clf.fit(X_train, y_train)\n", "pred = clf.predict(X_test)\n", "print(\"Confusion matrix on test set:\\n\",sklearn.metrics.confusion_matrix(y_test, pred))\n", "print(\"\\nAccuracy on test set: \",sklearn.metrics.accuracy_score(y_test, pred))\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Confusion matrix on test set:\n", " [[34 4]\n", " [ 2 66]]\n", "\n", "Accuracy on test set: 0.9433962264150944\n" ] } ], "source": [ "from sklearn.ensemble import BaggingClassifier\n", "\n", "# Bagging with 200 estimators, each one trained on a random 35% of the features\n", "clf = BaggingClassifier(n_estimators=200, max_features=0.35, random_state=1)\n", "clf.fit(X_train, y_train)\n", "pred = clf.predict(X_test)\n", "print(\"Confusion matrix on test set:\\n\",sklearn.metrics.confusion_matrix(y_test, pred))\n",
"print(\"\\nAccuracy on test set: \",sklearn.metrics.accuracy_score(y_test, pred))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }