import subprocess
import sys

import matplotlib.pyplot as plt
import pandas as pd
import sklearn.metrics as met
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering


def make_tree(children, instances, labels, file_name):
    """Write a hierarchical-clustering merge tree to ``dt.dot`` and render it.

    The tree is described in Graphviz DOT format: one labelled node per
    original instance, then two edges per merge from the newly created
    internal node to the nodes it joins.  The ``dot`` command-line tool is
    then invoked to render ``dt.dot`` into ``<file_name>.png``.

    Args:
        children: Iterable of pairs; the k-th pair holds the ids of the two
            nodes merged into internal node ``instances + k``.
        instances: Id given to the first internal node.  The caller in this
            file passes the number of rows in the data set.
        labels: Leaf labels, one per instance (e.g. a pandas Series).  Leaf
            ``i`` is drawn with ``labels[i]`` as its caption.
        file_name: Output image path without the ``.png`` extension.

    Returns:
        None.  ``dt.dot`` in the working directory is overwritten on every
        call.
    """
    leaf_labels = labels.tolist() if hasattr(labels, "tolist") else list(labels)

    dot_lines = [
        """
digraph G {
nodesep=0.3;
ranksep=0.2;
margin=0.1;
node [shape=circle];
edge [arrowsize=0.8];
"""
    ]

    for node_id, label in enumerate(leaf_labels):
        # str() tolerates non-string labels (numbers, NaN); escaping the
        # double quote keeps the DOT quoted string well-formed.
        caption = str(label).replace('"', '\\"')
        dot_lines.append(f'{node_id}[label="{caption}" color=blue];\n')

    for parent_id, pair in enumerate(children, start=instances):
        # int() yields the same text as numpy's .item() and also accepts
        # plain Python integers.
        dot_lines.append(f"{parent_id}->{int(pair[0])};\n")
        dot_lines.append(f"{parent_id}->{int(pair[1])};\n")

    dot_lines.append("}")

    # Explicit UTF-8 so non-ASCII labels do not depend on the platform locale.
    with open('dt.dot', 'w', encoding='utf-8') as f:
        f.writelines(dot_lines)

    # The file is closed (and flushed) before the renderer reads it.  An
    # argument list avoids the shell, so file names containing spaces or
    # shell metacharacters are passed through verbatim.
    try:
        subprocess.call(["dot", "-Tpng", "dt.dot", "-o", file_name + ".png"])
    except FileNotFoundError:
        # Without the shell a missing executable raises instead of returning
        # a non-zero status; keep the script going as it did before.
        print("Graphviz 'dot' executable not found; skipped rendering "
              + file_name + ".png", file=sys.stderr)


df = pd.read_csv("dogs.csv")

# Show the column names together with the first 5 instances.
print('Prvih 5 instanci', df.head(), sep='\n')
print('\n\n')

num_instances = df.shape[0]

# The second and third CSV columns are used as clustering features.
featurs = df.columns[1:3].tolist()
x_original = df[featurs]

# Standardize the attributes and restore the column names.
# Alternative (min-max normalization):
#   preprocessing.MinMaxScaler().fit_transform(x_original)
x = pd.DataFrame(preprocessing.scale(x_original), columns=featurs)

# AgglomerativeClustering
#   parameters:
#     n_clusters   : number of clusters, default 2
#     connectivity : connectivity matrix, default None
#     affinity     : measure used to compute proximity, default "euclidean";
#                    may be e.g. "euclidean", "l1", "l2", "manhattan",
#                    "cosine"
#                    NOTE: newer scikit-learn releases name this parameter
#                    `metric` -- check the installed version.
#     linkage      : linkage criterion, default "ward";
#                    may be "ward", "complete", "average"
#   attributes:
#     labels_   : cluster label of every instance
#     n_leaves_ : number of leaves in the hierarchical tree
#     children_ : children of the non-leaf nodes.  Values smaller than the
#                 number of instances in the data set are instance indices.

colors = ['red', 'green', 'gold', 'blue', 'black']

fig = plt.figure()
plt_ind = 1

# 3 cluster counts x 2 linkage types fill the 3x2 subplot grid.
for i in range(2, 5):
    for link in ['complete', 'average']:
        est = AgglomerativeClustering(n_clusters=i, linkage=link)
        est.fit(x)

        make_tree(est.children_, num_instances, df['breed'],
                  'clus_' + str(i) + '_' + link)

        df['labels'] = est.labels_

        # add_subplot makes the new axes current, so the plt.* calls below
        # draw into it.
        fig.add_subplot(3, 2, plt_ind)
        for j in range(i):
            cluster = df[df['labels'] == j]
            # Plotted on the raw 'height'/'weight' columns -- presumably the
            # same two columns selected as features above; verify against
            # the CSV layout.
            plt.scatter(cluster['height'], cluster['weight'],
                        color=colors[j], s=30, marker='o',
                        label="cluster %d" % j)

        plt.legend(loc='lower right')
        plt.title('Broj klastera: %d, veza: %s' % (i, link))
        plt_ind += 1

plt.tight_layout()
plt.show()