{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.naive_bayes import  MultinomialNB\n",
    "from sklearn.model_selection import train_test_split\n",
    "import pandas as pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#trening skup\n",
    "corpus = [\n",
    "    'Chinese Beijing Chinese',\n",
    "    'Chinese Chinese Shanghai',\n",
    "    'Chinese Macao',\n",
    "    'Tokyo Japan Chinese'\n",
    "]\n",
    "\n",
    "classes= ['yes', 'yes', 'yes', 'no']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pravljenje term-matrice sa brojem pojavljivanja reci\n",
    "vectorizer = CountVectorizer()\n",
    "x_train = vectorizer.fit_transform(corpus)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#ili\n",
    "#pravljenje term-matrice sa tf-idf merom\n",
    "#tdidf_vectorizer=TfidfVectorizer(norm='l1')\n",
    "#x_train=tdidf_vectorizer.fit_transform(corpus)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Trening skup\n",
      "Atributi - termi\n",
      "['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']\n"
     ]
    }
   ],
   "source": [
    "print('Trening skup')\n",
    "print('Atributi - termi')\n",
    "print(vectorizer.get_feature_names())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Instance & term-matrica\n",
      "                           beijing   chinese     japan     macao  shanghai  \\\n",
      "Chinese Beijing Chinese   0.489313  0.510687  0.000000  0.000000  0.000000   \n",
      "Chinese Chinese Shanghai  0.000000  0.510687  0.000000  0.000000  0.489313   \n",
      "Chinese Macao             0.000000  0.342901  0.000000  0.657099  0.000000   \n",
      "Tokyo Japan Chinese       0.000000  0.206929  0.396536  0.000000  0.000000   \n",
      "\n",
      "                             tokyo  \n",
      "Chinese Beijing Chinese   0.000000  \n",
      "Chinese Chinese Shanghai  0.000000  \n",
      "Chinese Macao             0.000000  \n",
      "Tokyo Japan Chinese       0.396536  \n"
     ]
    }
   ],
   "source": [
    "print('Instance & term-matrica')\n",
    "print(pd.DataFrame(x_train.toarray(), index= corpus, columns=vectorizer.get_feature_names()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MultinomialNB()"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#pravljenje modela primenom algoritma Multinomojalni naivni Bajes na trening skup\n",
    "clf_mnb = MultinomialNB()\n",
    "clf_mnb.fit(x_train, classes)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Broj instanci po klasama\n",
      "no     1.0\n",
      "yes    3.0\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "print('Broj instanci po klasama')\n",
    "print(pd.Series(clf_mnb.class_count_, index=clf_mnb.classes_))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Broj pojavljivanja reci po klasama\n",
      "      beijing   chinese     japan     macao  shanghai     tokyo\n",
      "no   0.000000  0.206929  0.396536  0.000000  0.000000  0.396536\n",
      "yes  0.489313  1.364276  0.000000  0.657099  0.489313  0.000000\n"
     ]
    }
   ],
   "source": [
    "print('Broj pojavljivanja reci po klasama')\n",
    "print(pd.DataFrame(clf_mnb.feature_count_, index=clf_mnb.classes_, columns=vectorizer.get_feature_names()))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#transformisanje test instance u term-matricu\n",
    "x_test_text = ['Chinese Chinese Chinese Tokyo Japan']\n",
    "x_test = vectorizer.transform(x_test_text)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dodeljena klasa yes\n"
     ]
    }
   ],
   "source": [
    "#klasifikacija test instance\n",
    "y_pred = clf_mnb.predict(x_test)\n",
    "print('Dodeljena klasa', y_pred[0])\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}