"""Loan-status baseline: preprocess the loan CSVs and export predictions.

The script loads the train and test CSV files, concatenates them so that
imputation and encoding are applied uniformly, fills missing values,
derives log-transformed features, encodes non-numeric columns as category
codes, calls ``classification_model`` with a logistic regression on
``Credit_History`` and writes the thresholded result to a CSV file.
"""
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from drugi_cas.modeli import classification_model

# Load the data.
df_train = pd.read_csv(r'loan_podaci/train_ctrUa4K.csv')
df_test = pd.read_csv(r'loan_podaci/test_lAUu6dG.csv')

# Merge into one shared dataframe (train rows first, then test rows).
df = pd.concat([df_train, df_test])

# Use the loan identifier as the index.
df = df.set_index('Loan_ID')

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form does not update ``df`` when
# pandas Copy-on-Write is active.
df['Self_Employed'] = df['Self_Employed'].fillna('No')

# Median LoanAmount per (Self_Employed, Education) group.  Computed once here
# rather than inside the row-wise helper, where it was rebuilt for every row.
loan_amount_medians = df.pivot_table(
    values='LoanAmount',
    index='Self_Employed',
    columns='Education',
    aggfunc='median',
)


def zamena_nedostajucih(x):
    """Return the group-median LoanAmount for one row.

    Args:
        x: A row (Series) providing the 'Self_Employed' and 'Education' keys.

    Returns:
        The median LoanAmount of the row's (Self_Employed, Education) group,
        looked up in the precomputed ``loan_amount_medians`` table.
    """
    return loan_amount_medians.loc[x['Self_Employed'], x['Education']]


# Impute missing loan amounts with the median of the matching group.
missing_amount = df['LoanAmount'].isnull()
if missing_amount.any():
    # Guarded because apply() on an empty frame does not return a Series
    # that fillna() can align against.
    df['LoanAmount'] = df['LoanAmount'].fillna(
        df[missing_amount].apply(zamena_nedostajucih, axis=1)
    )

# Create new columns.
df['LoanAmount_log'] = np.log(df['LoanAmount'])
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['TotalIncome_log'] = np.log(df['TotalIncome'])

# Fill the remaining gaps with each column's most frequent value.
for kolona in ('Gender', 'Married', 'Dependents',
               'Loan_Amount_Term', 'Credit_History'):
    df[kolona] = df[kolona].fillna(df[kolona].mode()[0])

# Encode the non-numeric (categorical) variables as integer category codes.
# The public dtype check replaces the private ``_is_numeric_mixed_type``
# attribute, which is not part of the pandas API.
for kolona in df.columns:
    if not pd.api.types.is_numeric_dtype(df[kolona]):
        df[kolona] = df[kolona].astype('category').cat.codes.astype('category')

# Split back into train and test by position: the first len(df_train) rows
# of the concatenated frame are the training rows.
n_train = df_train.shape[0]
x_train = df.iloc[:n_train].drop('Loan_Status', axis=1)
# NOTE: these are the encoded category codes, not the raw values found in
# df_train['Loan_Status'].
y_train = df.iloc[:n_train].Loan_Status
x_test = df.iloc[n_train:].drop('Loan_Status', axis=1)

model = LogisticRegression(solver='lbfgs')
predictor_var = ['Credit_History']
# NOTE(review): classification_model is defined in drugi_cas.modeli;
# presumably it fits ``model`` on the predictor columns and returns scores
# for x_test -- confirm against that module.
rezultat = classification_model(model, x_train, predictor_var, y_train, x_test)

# Threshold the result into the 'Y' / 'N' labels and export them.
prag = 0.5
x_test['Loan_Status'] = np.where(rezultat > prag, 'Y', 'N')
x_test['Loan_Status'].reset_index().to_csv('loan_podaci/primer_izvoza_logisticka.csv', index=False)