{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Multinomial naive Bayes on a toy text corpus\n",
    "\n",
    "Builds a term matrix from four short documents, fits `MultinomialNB` on it, inspects the per-class counts stored on the fitted model, and classifies one unseen document.\n",
    "\n",
    "Set `USE_TFIDF = True` in the configuration cell to weight terms with L1-normalised TF-IDF instead of raw occurrence counts. The same vectorizer object is then used for both the training and the test documents."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.model_selection import train_test_split\n",
    "import pandas as pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Term weighting: False -> raw occurrence counts, True -> L1-normalised TF-IDF.\n",
    "USE_TFIDF = False\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training set: one document per entry, with its class label at the same position.\n",
    "corpus = [\n",
    "    'Chinese Beijing Chinese',\n",
    "    'Chinese Chinese Shanghai',\n",
    "    'Chinese Macao',\n",
    "    'Tokyo Japan Chinese',\n",
    "]\n",
    "\n",
    "classes = ['yes', 'yes', 'yes', 'no']\n",
    "assert len(corpus) == len(classes), 'every document needs exactly one label'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the term matrix. One vectorizer object serves both weighting schemes,\n",
    "# so test documents are always transformed exactly like the training documents.\n",
    "vectorizer = TfidfVectorizer(norm='l1') if USE_TFIDF else CountVectorizer()\n",
    "x_train = vectorizer.fit_transform(corpus)\n",
    "\n",
    "# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement\n",
    "# and fall back only on releases that predate get_feature_names_out().\n",
    "if hasattr(vectorizer, 'get_feature_names_out'):\n",
    "    feature_names = list(vectorizer.get_feature_names_out())\n",
    "else:\n",
    "    feature_names = vectorizer.get_feature_names()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vocabulary learned from the training set (one attribute per term).\n",
    "print('Trening skup')\n",
    "print('Atributi - termi')\n",
    "print(feature_names)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training documents (rows) against terms (columns).\n",
    "print('Instance & term-matrica')\n",
    "pd.DataFrame(x_train.toarray(), index=corpus, columns=feature_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit a multinomial naive Bayes model on the training set.\n",
    "clf_mnb = MultinomialNB()\n",
    "clf_mnb.fit(x_train, classes)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-class instance counts stored on the fitted model (class_count_).\n",
    "print('Broj instanci po klasama')\n",
    "pd.Series(clf_mnb.class_count_, index=clf_mnb.classes_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-class, per-term totals stored on the fitted model (feature_count_).\n",
    "print('Broj pojavljivanja reci po klasama')\n",
    "pd.DataFrame(clf_mnb.feature_count_, index=clf_mnb.classes_, columns=feature_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transform the test document into a term-matrix row using the fitted vectorizer.\n",
    "x_test_text = ['Chinese Chinese Chinese Tokyo Japan']\n",
    "x_test = vectorizer.transform(x_test_text)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Classify the test document and report the assigned class.\n",
    "y_pred = clf_mnb.predict(x_test)\n",
    "print('Dodeljena klasa', y_pred[0])\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}