import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import sklearn.metrics as met


def classification(message, x_arg, y_arg):
    print(message, '\n')

    # split into training and test sets, stratified on the class labels
    x_train, x_test, y_train, y_test = train_test_split(x_arg, y_arg, train_size=0.7, stratify=y_arg)

    # parameter grid for the cross-validated search
    parameters = [{'n_neighbors': range(3, 15, 2),
                   'p': [1, 2],
                   'weights': ['uniform', 'distance']}]

    # KNN classifier tuned with 10-fold cross-validation
    clf = GridSearchCV(KNeighborsClassifier(), parameters, cv=10)
    clf.fit(x_train, y_train)

    print("Best parameters:")
    print(clf.best_params_)
    print()

    print("Training set report:")
    y_pred = clf.predict(x_train)
    print(met.classification_report(y_train, y_pred))
    print()
    cnf_matrix = met.confusion_matrix(y_train, y_pred)
    print("Confusion matrix", cnf_matrix, sep="\n")
    print("\n")

    print("Test set report:")
    y_pred = clf.predict(x_test)
    print(met.classification_report(y_test, y_pred))
    print()
    cnf_matrix = met.confusion_matrix(y_test, y_pred)
    print("Confusion matrix", cnf_matrix, sep="\n")
    print("\n")

    if message == 'PCA':
        # plot the test instances in the plane of the first two principal
        # components, colored by the predicted class
        colors = ['red', 'blue', 'gold', 'm', 'plum', 'orange', 'black']
        x_test = x_test.copy()
        x_test['predicted'] = y_pred
        classes = x_test['predicted'].unique()
        for i, class_value in enumerate(classes):
            class_samples = x_test.loc[x_test['predicted'] == class_value, :]
            plt.scatter(class_samples['pca1'], class_samples['pca2'],
                        color=colors[i], s=10, marker='o',
                        label="class %s" % class_value)

        plt.title('Classification with PCA')
        plt.legend(loc='upper right')
        plt.show()


df = pd.read_csv("car.csv")

# show the first 5 instances, a statistical summary and the class distribution
print('First 5 instances', df.head(), sep='\n')
print('\n\n')
print('Data description', df.describe(), sep='\n')
print('\n\n')
print('Classes:', df["class"].value_counts(), sep='\n')
print('\n\n')

# features: all columns except the first; target: the "class" column
features = df.columns[1:]
x = df[features]
y = df["class"]
num_features = x.shape[1]

# standardize the data
scaler = preprocessing.StandardScaler().fit(x)
x = pd.DataFrame(scaler.transform(x))
x.columns = features

# apply PCA
pca = PCA()
# pca = PCA(n_components=2)
pca.fit(x)
x_pca = pd.DataFrame(pca.transform(x))

# rename the columns of the PCA-transformed set
pca_columns = ['pca%d' % i for i in range(1, pca.n_components_ + 1)]
x_pca.columns = pca_columns

# each principal component as a linear combination of the original features
print('components_')
for i, component in zip(range(1, pca.n_components_ + 1), pca.components_):
    pca_desc = "pca%d = " % i
    pca_desc += " + ".join("%.2f*%s" % (value, features[j]) for j, value in enumerate(component))
    print(pca_desc)
print()

print('explained_variance_')
for i, ev in zip(range(1, num_features + 1), pca.explained_variance_):
    print("pca%d: %.10f" % (i, ev))
print()
print()

print('explained_variance_ratio_')
for i, evr in zip(range(1, num_features + 1), pca.explained_variance_ratio_):
    print("pca%d: %.10f" % (i, evr))
print()

print('mean_', pca.mean_, sep='\n')
print()
print('n_components_', pca.n_components_, sep='\n')
print()
print('noise_variance_', pca.noise_variance_, sep='\n')
print()

# run the classification on the original features and on the PCA projection
classification('Original', x, y)
classification('PCA', x_pca, y)
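
# Optional sketch (an addition, not part of the original script): plotting the
# cumulative explained variance ratio is one common way to decide how many
# principal components are worth keeping before rerunning the classification
# on a reduced projection. It uses only the fitted `pca` object from above.
cumulative_ratio = pca.explained_variance_ratio_.cumsum()
plt.plot(range(1, pca.n_components_ + 1), cumulative_ratio, marker='o')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance ratio')
plt.title('Cumulative explained variance')
plt.show()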