from sklearn.model_selection import StratifiedKFold, KFold  # K-fold cross validation
from sklearn import metrics
import pandas as pd
import numpy as np


def classification_model(model, data, predictors, outcome, test_set):
    """Fit a classifier, report train and 5-fold CV accuracy, and predict on a test set.

    Parameters
    ----------
    model : sklearn estimator
        Any object exposing ``fit``, ``predict`` and ``score``.
    data : pandas.DataFrame
        Training rows; must contain all columns named in *predictors*.
    predictors : list of str
        Column names used as features.
    outcome : pandas.Series
        Target labels, positionally aligned with *data* (indexed with ``.iloc``).
    test_set : pandas.DataFrame
        Rows to predict on; must contain the *predictors* columns.

    Returns
    -------
    numpy.ndarray
        Predictions of *model* (refit on all of *data*) for *test_set*.
    """
    X = data[predictors]  # hoist the repeated column subset

    # Fit on the full training data and report in-sample accuracy.
    # NOTE: this is an optimistic estimate; the CV score below is the honest one.
    model.fit(X, outcome)
    predictions = model.predict(X)
    # y_true comes first in accuracy_score (the original call had the arguments
    # swapped; the value is unchanged only because accuracy is symmetric).
    accuracy = metrics.accuracy_score(outcome, predictions)
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))

    # Stratified 5-fold CV preserves the class proportions in every fold,
    # which matters for imbalanced targets.
    kf = StratifiedKFold(n_splits=5)
    fold_scores = []  # per-fold held-out accuracy (model.score), not error
    for train_idx, test_idx in kf.split(X, outcome):
        # Train on this fold's training partition only.
        model.fit(X.iloc[train_idx, :], outcome.iloc[train_idx])
        # Score on the held-out partition.
        fold_scores.append(model.score(X.iloc[test_idx, :], outcome.iloc[test_idx]))
    print("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(fold_scores)))

    # Refit on all the data so the caller gets a fully-trained model and the
    # test-set predictions use every training row.
    model.fit(X, outcome)
    return model.predict(test_set[predictors])