{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "acute-darkness", "metadata": {}, "outputs": [], "source": [ "import os #standard library; used below to read the optional IRIS_CSV path override\n", "\n", "import pandas as pd #library for working with tabular data\n", "\n", "#sklearn implements the classes and functions used for data mining\n", "from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree\n", "from sklearn.model_selection import train_test_split\n", "import sklearn.metrics as met #the metrics module holds functions for evaluating classification models\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "third-authorization", "metadata": {}, "outputs": [], "source": [ "#Load the data from a CSV file into a table, i.e. a DataFrame object.\n", "#The location can be overridden through the IRIS_CSV environment variable;\n", "#when it is not set, the original hard-coded path is used, so existing setups keep working.\n", "DATA_PATH = os.environ.get('IRIS_CSV', 'C:/Users/student/Desktop/ipIndustija4/ipVezbe62021/iris.csv')\n", "\n", "df=pd.read_csv(DATA_PATH)" ] }, { "cell_type": "markdown", "id": "weird-evening", "metadata": {}, "source": [ "Deo sa upoznavanjem i pripremom podataka\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "coordinated-group", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sepal_LengthSepal_WidthPetal_LengthPetal_WidthSpecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", "
" ], "text/plain": [ " Sepal_Length Sepal_Width Petal_Length Petal_Width Species\n", "0 5.1 3.5 1.4 0.2 setosa\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 4.6 3.1 1.5 0.2 setosa\n", "4 5.0 3.6 1.4 0.2 setosa" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "intellectual-institution", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sepal_LengthSepal_WidthPetal_LengthPetal_WidthSpecies
count150.000000150.000000150.000000150.000000150
uniqueNaNNaNNaNNaN3
topNaNNaNNaNNaNvirginica
freqNaNNaNNaNNaN50
mean5.8433333.0573333.7580001.199333NaN
std0.8280660.4358661.7652980.762238NaN
min4.3000002.0000001.0000000.100000NaN
25%5.1000002.8000001.6000000.300000NaN
50%5.8000003.0000004.3500001.300000NaN
75%6.4000003.3000005.1000001.800000NaN
max7.9000004.4000006.9000002.500000NaN
\n", "
" ], "text/plain": [ " Sepal_Length Sepal_Width Petal_Length Petal_Width Species\n", "count 150.000000 150.000000 150.000000 150.000000 150\n", "unique NaN NaN NaN NaN 3\n", "top NaN NaN NaN NaN virginica\n", "freq NaN NaN NaN NaN 50\n", "mean 5.843333 3.057333 3.758000 1.199333 NaN\n", "std 0.828066 0.435866 1.765298 0.762238 NaN\n", "min 4.300000 2.000000 1.000000 0.100000 NaN\n", "25% 5.100000 2.800000 1.600000 0.300000 NaN\n", "50% 5.800000 3.000000 4.350000 1.300000 NaN\n", "75% 6.400000 3.300000 5.100000 1.800000 NaN\n", "max 7.900000 4.400000 6.900000 2.500000 NaN" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe(include='all')" ] }, { "cell_type": "code", "execution_count": 5, "id": "actual-death", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#provera da li postoje nedostajuce vrednosti u skupu, jer ako\n", "#postoje moraju biti obradjene\n", "df.isna().any().any()" ] }, { "cell_type": "code", "execution_count": 6, "id": "proper-blame", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sepal_LengthSepal_WidthPetal_LengthPetal_Width
Sepal_Length1.000000-0.1175700.8717540.817941
Sepal_Width-0.1175701.000000-0.428440-0.366126
Petal_Length0.871754-0.4284401.0000000.962865
Petal_Width0.817941-0.3661260.9628651.000000
\n", "
" ], "text/plain": [ " Sepal_Length Sepal_Width Petal_Length Petal_Width\n", "Sepal_Length 1.000000 -0.117570 0.871754 0.817941\n", "Sepal_Width -0.117570 1.000000 -0.428440 -0.366126\n", "Petal_Length 0.871754 -0.428440 1.000000 0.962865\n", "Petal_Width 0.817941 -0.366126 0.962865 1.000000" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Species is a string column, so the correlation matrix is restricted to the numeric\n", "#attributes; pandas 2.0+ raises an error when corr() is given non-numeric columns.\n", "df.select_dtypes(include='number').corr()\n", "#since the correlation between Petal_Width and Petal_Length is 0.96,\n", "#one of these two attributes can be left out when building the model" ] }, { "cell_type": "markdown", "id": "tribal-depression", "metadata": {}, "source": [ "Deo sa klasifikacijom\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "deadly-yorkshire", "metadata": {}, "outputs": [], "source": [ "#train_test_split takes the description of the instances (without the target attribute)\n", "#as its first argument and the classes of those same instances as its second argument,\n", "#so the loaded data set is divided into these two parts\n", "\n", "features = df.columns[:4].tolist()" ] }, { "cell_type": "code", "execution_count": 11, "id": "academic-satisfaction", "metadata": {}, "outputs": [], "source": [ "#ako se eliminise jedan atribut zbog korelacije, npr. 
Petal_Length\n", "#features.remove('Petal_Length')\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "mysterious-score", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "features" ] }, { "cell_type": "code", "execution_count": 13, "id": "latest-supervision", "metadata": {}, "outputs": [], "source": [ "#features.remove('Petal_Length')" ] }, { "cell_type": "code", "execution_count": 14, "id": "banner-glory", "metadata": {}, "outputs": [], "source": [ "x=df[features] #description of the instances (only the attributes used for building the model)\n", "y=df['Species'] #the target attribute\n" ] }, { "cell_type": "code", "execution_count": 15, "id": "finnish-curve", "metadata": {}, "outputs": [], "source": [ "#train_test_split returns\n", "# x_train: instances in the training set (without the target attribute)\n", "# x_test: instances in the test set (without the target attribute)\n", "# y_train: classes of the instances in the training set\n", "# y_test: classes of the instances in the test set\n", "\n", "#fixed seed so that the split (and every metric computed from it) is the same on each Restart & Run All\n", "RANDOM_STATE = 42\n", "\n", "x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=100, random_state=RANDOM_STATE)" ] }, { "cell_type": "code", "execution_count": 16, "id": "republican-bottle", "metadata": {}, "outputs": [], "source": [ "#building the classifier with the DecisionTreeClassifier class (described in more detail in the slides)\n", "#random_state is set because the tree permutes the features at each split, so ties\n", "#between equally good splits could otherwise be broken differently from run to run\n", "\n", "#dt = DecisionTreeClassifier()\n", "#dt = DecisionTreeClassifier(criterion='entropy')\n", "dt = DecisionTreeClassifier(max_depth=2, random_state=RANDOM_STATE)\n", "#dt = DecisionTreeClassifier(min_samples_split=20, max_depth=4)\n", "#dt = DecisionTreeClassifier(min_samples_split=20, max_depth=4, max_leaf_nodes=4)\n", "#dt = DecisionTreeClassifier(min_impurity_split=0.05) #NOTE: min_impurity_split was removed in scikit-learn 1.0; use min_impurity_decrease there\n" ] }, { "cell_type": "code", "execution_count": 17, "id": "after-twelve", "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"DecisionTreeClassifier(max_depth=2)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#pravljenje modela na osnovu trening skupa\n", "\n", "dt.fit(x_train, y_train)" ] }, { "cell_type": "code", "execution_count": 18, "id": "fatty-personal", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['setosa', 'versicolor', 'virginica'], dtype=object)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dt.classes_" ] }, { "cell_type": "code", "execution_count": 20, "id": "sunset-wholesale", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0. , 0. , 0.56711176, 0.43288824])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dt.feature_importances_ " ] }, { "cell_type": "code", "execution_count": 21, "id": "brazilian-protein", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sepal_Length 0.000000\n", "Sepal_Width 0.000000\n", "Petal_Length 0.567112\n", "Petal_Width 0.432888\n", "dtype: float64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.Series(dt.feature_importances_, index=features) " ] }, { "cell_type": "code", "execution_count": 26, "id": "passing-blues", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
prob_setosaprob_versicolorprob_virginica
790.00.8888890.111111
261.00.0000000.000000
271.00.0000000.000000
31.00.0000000.000000
610.00.8888890.111111
1030.00.0312500.968750
411.00.0000000.000000
670.00.8888890.111111
810.00.8888890.111111
1480.00.0312500.968750
451.00.0000000.000000
91.00.0000000.000000
351.00.0000000.000000
550.00.8888890.111111
1210.00.0312500.968750
1000.00.0312500.968750
441.00.0000000.000000
800.00.8888890.111111
221.00.0000000.000000
580.00.8888890.111111
1130.00.0312500.968750
590.00.8888890.111111
1250.00.0312500.968750
401.00.0000000.000000
41.00.0000000.000000
1140.00.0312500.968750
870.00.8888890.111111
1360.00.0312500.968750
321.00.0000000.000000
1300.00.0312500.968750
51.00.0000000.000000
171.00.0000000.000000
1310.00.0312500.968750
990.00.8888890.111111
900.00.8888890.111111
1160.00.0312500.968750
111.00.0000000.000000
431.00.0000000.000000
710.00.8888890.111111
1380.00.0312500.968750
1190.00.8888890.111111
540.00.8888890.111111
720.00.8888890.111111
481.00.0000000.000000
820.00.8888890.111111
131.00.0000000.000000
630.00.8888890.111111
620.00.8888890.111111
1230.00.0312500.968750
1440.00.0312500.968750
\n", "
" ], "text/plain": [ " prob_setosa prob_versicolor prob_virginica\n", "79 0.0 0.888889 0.111111\n", "26 1.0 0.000000 0.000000\n", "27 1.0 0.000000 0.000000\n", "3 1.0 0.000000 0.000000\n", "61 0.0 0.888889 0.111111\n", "103 0.0 0.031250 0.968750\n", "41 1.0 0.000000 0.000000\n", "67 0.0 0.888889 0.111111\n", "81 0.0 0.888889 0.111111\n", "148 0.0 0.031250 0.968750\n", "45 1.0 0.000000 0.000000\n", "9 1.0 0.000000 0.000000\n", "35 1.0 0.000000 0.000000\n", "55 0.0 0.888889 0.111111\n", "121 0.0 0.031250 0.968750\n", "100 0.0 0.031250 0.968750\n", "44 1.0 0.000000 0.000000\n", "80 0.0 0.888889 0.111111\n", "22 1.0 0.000000 0.000000\n", "58 0.0 0.888889 0.111111\n", "113 0.0 0.031250 0.968750\n", "59 0.0 0.888889 0.111111\n", "125 0.0 0.031250 0.968750\n", "40 1.0 0.000000 0.000000\n", "4 1.0 0.000000 0.000000\n", "114 0.0 0.031250 0.968750\n", "87 0.0 0.888889 0.111111\n", "136 0.0 0.031250 0.968750\n", "32 1.0 0.000000 0.000000\n", "130 0.0 0.031250 0.968750\n", "5 1.0 0.000000 0.000000\n", "17 1.0 0.000000 0.000000\n", "131 0.0 0.031250 0.968750\n", "99 0.0 0.888889 0.111111\n", "90 0.0 0.888889 0.111111\n", "116 0.0 0.031250 0.968750\n", "11 1.0 0.000000 0.000000\n", "43 1.0 0.000000 0.000000\n", "71 0.0 0.888889 0.111111\n", "138 0.0 0.031250 0.968750\n", "119 0.0 0.888889 0.111111\n", "54 0.0 0.888889 0.111111\n", "72 0.0 0.888889 0.111111\n", "48 1.0 0.000000 0.000000\n", "82 0.0 0.888889 0.111111\n", "13 1.0 0.000000 0.000000\n", "63 0.0 0.888889 0.111111\n", "62 0.0 0.888889 0.111111\n", "123 0.0 0.031250 0.968750\n", "144 0.0 0.031250 0.968750" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pravljenje skupa koji sadrzi oznaku instance i\n", "# verovatnocu pripadnosti svakoj od klasa\n", "pd.DataFrame(dt.predict_proba(x_test),\n", " index= x_test.index,\n", " columns=['prob_' + x for x in dt.classes_])\n" ] }, { "cell_type": "code", "execution_count": 27, "id": "introductory-pleasure", "metadata": {}, 
"outputs": [], "source": [ "from sklearn.tree import plot_tree" ] }, { "cell_type": "code", "execution_count": 29, "id": "imposed-american", "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "#graficki prikaz drveta\n", "t=plot_tree(dt, rounded=True, filled=True, impurity=False, feature_names=features, class_names=dt.classes_ )" ] }, { "cell_type": "code", "execution_count": 31, "id": "sought-injection", "metadata": {}, "outputs": [], "source": [ "y_pred=dt.predict(x_train) #primena modela na trening podatke\n" ] }, { "cell_type": "code", "execution_count": 32, "id": "living-processing", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.95" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "met.accuracy_score(y_train, y_pred) #preciznost modela na trening skupu" ] }, { "cell_type": "code", "execution_count": 34, "id": "undefined-research", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
setosaversicolorvirginica
setosa3200
versicolor0321
virginica0431
\n", "
" ], "text/plain": [ " setosa versicolor virginica\n", "setosa 32 0 0\n", "versicolor 0 32 1\n", "virginica 0 4 31" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#confusion matrix for the training set; labels= pins the row/column order to dt.classes_\n", "pd.DataFrame(met.confusion_matrix(y_train, y_pred, labels=dt.classes_), index=dt.classes_, columns=dt.classes_)" ] }, { "cell_type": "code", "execution_count": 35, "id": "aerial-memory", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " setosa 1.00 1.00 1.00 32\n", " versicolor 0.89 0.97 0.93 33\n", " virginica 0.97 0.89 0.93 35\n", "\n", " accuracy 0.95 100\n", " macro avg 0.95 0.95 0.95 100\n", "weighted avg 0.95 0.95 0.95 100\n", "\n" ] } ], "source": [ "#classification report for the training set\n", "print(met.classification_report(y_train, y_pred))" ] }, { "cell_type": "code", "execution_count": 36, "id": "potential-midwest", "metadata": {}, "outputs": [], "source": [ "def calculate_metrix(test_data, true_values, clf):\n", "    \"\"\"Apply the fitted model clf to test_data and print the evaluation results.\n", "\n", "    Prints, in this order: the accuracy score, the confusion matrix with rows and\n", "    columns labelled by clf.classes_, and the classification report.\n", "\n", "    Parameters:\n", "        test_data: instances passed to clf.predict (without the target attribute)\n", "        true_values: true classes of those instances\n", "        clf: fitted classifier exposing predict() and classes_\n", "\n", "    Returns:\n", "        None; everything is written with print().\n", "    \"\"\"\n", "    predicted = clf.predict(test_data)\n", "    #the value printed under the 'Preciznost' label is accuracy_score\n", "    print('Preciznost:', met.accuracy_score(true_values, predicted))\n", "    print('Matrica kofuzije')\n", "    #labels=clf.classes_ keeps the matrix square over all known classes, so it always\n", "    #matches the index/columns below even when a class is absent from this data\n", "    matrix = met.confusion_matrix(true_values, predicted, labels=clf.classes_)\n", "    print(pd.DataFrame(matrix, index=clf.classes_, columns=clf.classes_))\n", "    print('Izvestaj')\n", "    print(met.classification_report(true_values, predicted))" ] }, { "cell_type": "code", "execution_count": 37, "id": "legal-paper", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Preciznost: 0.95\n", "Matrica kofuzije\n", " setosa versicolor virginica\n", "setosa 32 0 0\n", "versicolor 0 32 1\n", "virginica 0 4 31\n", "Izvestaj\n", " precision recall f1-score support\n", "\n", " setosa 1.00 1.00 1.00 32\n", " versicolor 0.89 0.97 0.93 33\n", " virginica 0.97 0.89 0.93 35\n", "\n", " accuracy 0.95 100\n", " macro avg 0.95 0.95 0.95 100\n", "weighted avg 0.95 0.95 0.95 
100\n", "\n" ] } ], "source": [ "calculate_metrix(x_train, y_train, dt)" ] }, { "cell_type": "code", "execution_count": 38, "id": "experimental-practitioner", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Preciznost: 0.98\n", "Matrica kofuzije\n", " setosa versicolor virginica\n", "setosa 18 0 0\n", "versicolor 0 17 0\n", "virginica 0 1 14\n", "Izvestaj\n", " precision recall f1-score support\n", "\n", " setosa 1.00 1.00 1.00 18\n", " versicolor 0.94 1.00 0.97 17\n", " virginica 1.00 0.93 0.97 15\n", "\n", " accuracy 0.98 50\n", " macro avg 0.98 0.98 0.98 50\n", "weighted avg 0.98 0.98 0.98 50\n", "\n" ] } ], "source": [ "calculate_metrix(x_test, y_test, dt) #primena modela na test podatke" ] }, { "cell_type": "code", "execution_count": 34, "id": "correct-sport", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.6" } }, "nbformat": 4, "nbformat_minor": 5 }