From 0ee3fb2c8e9a7c09cebea95404d1844a30437d93 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nu=C3=B1o=20Sempere?=
Date: Wed, 9 Oct 2019 20:42:03 +0200
Subject: [PATCH] Create AlgorithmsRegression,py

---
 .../AlgorithmsRegression,py | 196 ++++++++++++++++++
 1 file changed, 196 insertions(+)
 create mode 100644 maths-prog/MachineLearningDemystified/AlgorithmsRegression,py

diff --git a/maths-prog/MachineLearningDemystified/AlgorithmsRegression,py b/maths-prog/MachineLearningDemystified/AlgorithmsRegression,py
new file mode 100644
index 0000000..dab02fe
--- /dev/null
+++ b/maths-prog/MachineLearningDemystified/AlgorithmsRegression,py
@@ -0,0 +1,196 @@
directory = '/home/nuno/Documents/Jobs/IDInsight'

import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt


## Load the dataset
insuranceDataFrame = pd.read_csv(directory + '/insurance_clean_continuous.csv')
insuranceDataFrame['charges']  ## sanity peek at the target column (only displays in a notebook)

## We divide our data into training, test and check sets, and normalize
dfTrain = insuranceDataFrame[:1000]
dfTest = insuranceDataFrame[1000:1300]
dfCheck = insuranceDataFrame[1300:]

## Normalize with training-set statistics only, to avoid leaking test information
means = np.mean(dfTrain, axis=0)
stds = np.std(dfTrain, axis=0)

dfTrain = (dfTrain - means) / stds
dfTest = (dfTest - means) / stds
dfCheck = (dfCheck - means) / stds

## We don't want this part, only when classifying:
## dfTrain['charges'] = (dfTrain['charges'] >= 0).astype('int')
## dfTest['charges'] = (dfTest['charges'] >= 0).astype('int')
## dfCheck['charges'] = (dfCheck['charges'] >= 0).astype('int')

## Convert our data to numpy arrays
trainLabel = np.asarray(dfTrain['charges'])
trainData = np.asarray(dfTrain.drop('charges', axis=1))

testLabel = np.asarray(dfTest['charges'])
testData = np.asarray(dfTest.drop('charges', axis=1))

## We reuse the old function: fit, score on the test set, record the result,
## and persist the fitted model together with the normalization constants.

def runAlgorithm(model, algorithmName):
    global scores, algorithms  ## global-scope accumulators, filled in below

    model.fit(trainData, trainLabel)
    scoreInternal = model.score(testData, testLabel)  ## R^2 on the test set
    print("score = ", scoreInternal * 100, "/100")

    scores = np.append(scores, scoreInternal)
    algorithms = np.append(algorithms, algorithmName)

    joblib.dump([model, means, stds],
                directory + '/insuranceModel-' + algorithmName + '.pkl')

algorithms = []
scores = []

## And we are ready!
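## A baseline for context (not in the original): sklearn's DummyRegressor always
## predicts the training mean, so its test R^2 should hover around 0. Any model
## below ought to comfortably beat this for its score to mean anything.
from sklearn.dummy import DummyRegressor
runAlgorithm(DummyRegressor(strategy="mean"), "DummyBaseline")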
+ +## ****************************************************************** + +## Linear Regression +algorithmName = "LinearRegression" +from sklearn.linear_model import LinearRegression +insuranceCheck = LinearRegression() +runAlgorithm(insuranceCheck, algorithmName) + +### We also want to visualize some stuff this time: + +coeff = list(insuranceCheck.coef_) +labels = list(dfTrain.drop('charges',1).columns) +features = pd.DataFrame() +features['Features'] = labels +features['importance'] = coeff +features.sort_values(by=['importance'], ascending=True, inplace=True) +features['positive'] = features['importance'] > 0 +features.set_index('Features', inplace=True) +features.importance.plot(kind='barh', figsize=(11, 6),color = features.positive.map({True: 'blue', False: 'red'})) + +## ****************************************************************** + +## Lasso +algorithmName = "Lasso" +from sklearn.linear_model import LassoCV +insuranceCheck = LassoCV(cv=5, n_alphas=20, tol=0.0001, n_jobs=-1) +runAlgorithm(insuranceCheck, algorithmName) + +## ****************************************************************** + +## Nearest Neighbours Regression +algorithmName = "NearestNeighboursRegression" + +from sklearn.neighbors import KNeighborsRegressor + +### Let's do some hyperparameter tuning: +score=0 +for n_neighbors in range(1,100): + for leaf_size in range(1,100): + insuranceCheck = KNeighborsRegressor(n_jobs=-1, n_neighbors=n_neighbors, weights="distance", algorithm="brute", leaf_size=leaf_size, p=2) + insuranceCheck.fit(trainData, trainLabel) + + score_temp = insuranceCheck.score(testData, testLabel) + + if(score_temp >score): + print("\nscore = ", score_temp * 100, "/100") + print("n_neighbors = ", n_neighbors) + print("leaf_size = ", leaf_size) + score=score_temp + +insuranceCheck = KNeighborsRegressor(n_jobs=-1, n_neighbors=8, weights="distance", algorithm="brute", leaf_size=1, p=2) +runAlgorithm(insuranceCheck, algorithmName) + +## ****************************************************************** + +## Linear SVR +algorithmName = "LinearSVR" +from sklearn.svm import LinearSVR + +n=4000 +insuranceCheck = LinearSVR(max_iter=n) +runAlgorithm(insuranceCheck, algorithmName) + +## ****************************************************************** + +## SVR different kernels. +algorithmName = "SVR_RBF" +from sklearn.svm import SVR +insuranceCheck = SVR(gamma='auto', C=1, epsilon=0.2, kernel='rbf') + ## There are different kernels available, including polynomial. + ## After some tinkering, RBF seems to be the best +runAlgorithm(insuranceCheck, algorithmName) + +## ****************************************************************** + +## Tree regression. +algorithmName = "TreeRegression" +from sklearn.tree import DecisionTreeRegressor + +## Hyper parameter optimization. 
score = 0

## Note: "mse" and "mae" were renamed "squared_error" and "absolute_error", and
## max_features="auto" was removed, in later scikit-learn versions.
for criterion in ["mse", "friedman_mse", "mae"]:
    for splitter in ["best", "random"]:
        for max_depth in list(range(1, 10)) + [None]:
            print("max_depth = ", max_depth)
            for min_samples_split in range(2, 27):
                # print("min_samples_split = ", min_samples_split)
                for min_samples_leaf in range(1, 26):
                    for max_features in ["log2", "auto", None]:
                        insuranceCheck = DecisionTreeRegressor(
                            random_state=0, criterion=criterion, splitter=splitter,
                            max_depth=max_depth, min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf, max_features=max_features)
                        insuranceCheck.fit(trainData, trainLabel)
                        score_temp = insuranceCheck.score(testData, testLabel)
                        if score_temp > score:
                            print("score = ", score_temp * 100, "/100")
                            print("\nNEW BEST\ncriterion =", criterion,
                                  "\nsplitter =", splitter,
                                  "\nmax_depth =", max_depth,
                                  "\nmin_samples_split =", min_samples_split,
                                  "\nmin_samples_leaf =", min_samples_leaf,
                                  "\nmax_features =", max_features)
                            score = score_temp

insuranceCheck = DecisionTreeRegressor(random_state=0, criterion="mse",
                                       splitter="random", min_samples_split=14,
                                       min_samples_leaf=4, max_features="auto",
                                       max_depth=9)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

## Random forest regression.
algorithmName = "RandomForestRegression"

from sklearn.ensemble import RandomForestRegressor
insuranceCheck = RandomForestRegressor(n_estimators=500, max_depth=None,
                                       random_state=0, n_jobs=-1)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

## Extremely randomized trees regression.
algorithmName = "ExtraRandomTreesRegression"
from sklearn.ensemble import ExtraTreesRegressor
insuranceCheck = ExtraTreesRegressor(n_estimators=500, max_depth=None,
                                     min_samples_split=20, min_samples_leaf=20,
                                     n_jobs=-1)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************
## MultiLayerPerceptron Regression (NN)

from sklearn.neural_network import MLPRegressor
algorithmName = "MLP_Regression"

## Try a few random seeds; MLP training is sensitive to initialization.
for randomstate in range(10):
    insuranceCheck = MLPRegressor(max_iter=1200, hidden_layer_sizes=(30, 30, 30),
                                  random_state=randomstate,
                                  learning_rate="adaptive", activation="relu")
    insuranceCheck.fit(trainData, trainLabel)
    score = insuranceCheck.score(testData, testLabel)
    print("score = ", score * 100, "/100")

## random_state=5 is the seed kept from the loop above.
insuranceCheck = MLPRegressor(max_iter=1200, hidden_layer_sizes=(30, 30, 30),
                              random_state=5, learning_rate="adaptive",
                              activation="relu")
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

for i in range(len(scores)):
    print(algorithms[i], ": ", scores[i])

## And that's it. Most of our algorithms do pretty well, with R^2 scores hovering
## around .81. Dishonorable mention to LinearSVR.

## Overall, trees do pretty great.
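## ******************************************************************

## dfCheck was split off and normalized above but never used. A minimal closing
## sketch (not in the original): reload one of the models that runAlgorithm saved
## and score it on this untouched slice. The stored means/stds are not needed here,
## since dfCheck was already normalized with them.

checkLabel = np.asarray(dfCheck['charges'])
checkData = np.asarray(dfCheck.drop('charges', axis=1))

loadedModel, loadedMeans, loadedStds = joblib.load(
    directory + '/insuranceModel-TreeRegression.pkl')
print("TreeRegression on dfCheck: R^2 = ", loadedModel.score(checkData, checkLabel))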