nunosempere.github.io/maths-prog/MachineLearningDemystified/AlgorithmsRegression,py

directory = '/home/nuno/Documents/Jobs/IDInsight'

import pandas as pd
import numpy as np
import joblib
import matplotlib


## Install the dataframe
insuranceDataFrame = pd.read_csv(directory +'insurance_clean_continuous.csv')
insuranceDataFrame['charges']
## We divide our data into training and test sets, and normalize
dfTrain = insuranceDataFrame[:1000]
dfTest = insuranceDataFrame[1000:1300]
dfCheck = insuranceDataFrame[1300:]

means = np.mean(dfTrain, axis=0)
stds = np.std(dfTrain, axis=0)

dfTrain = (dfTrain - means) / stds
dfTest = (dfTest - means) / stds
dfCheck = (dfCheck - means) / stds

## We don't want this part, only when classifying:
## dfTrain['charges'] = (dfTrain['charges']>=0).astype('int')
## dfTest['charges'] = (dfTest['charges']>=0).astype('int')
## dfCheck['charges'] = (dfCheck['charges']>=0).astype('int')

## Convert our stuff to arrays
trainLabel = np.asarray(dfTrain['charges'])
trainData = np.asarray(dfTrain.drop('charges',1))

testLabel = np.asarray(dfTest['charges'])
testData = np.asarray(dfTest.drop('charges',1))

## We reuse the old function

def runAlgorithm(Classifier, algorithmName):
    global scores, algorithms

    Classifier.fit(trainData, trainLabel)
    scoreInternal = Classifier.score(testData, testLabel)
    print("score = ", scoreInternal * 100, "/100")

    scores = np.append(scores, scoreInternal) ## These are global scope variables
    algorithms = np.append(algorithms, algorithmName)

    joblib.dump([Classifier, means, stds],
                directory +'insuranceModel-' + algorithmName + '.pkl')

algorithms = []
scores = []

## And we are ready!

## ******************************************************************

## Linear Regression
algorithmName = "LinearRegression"
from sklearn.linear_model import LinearRegression
insuranceCheck = LinearRegression()
runAlgorithm(insuranceCheck, algorithmName)

### We also want to visualize some stuff this time:

coeff = list(insuranceCheck.coef_)
labels = list(dfTrain.drop('charges',1).columns)
features = pd.DataFrame()
features['Features'] = labels
features['importance'] = coeff
features.sort_values(by=['importance'], ascending=True, inplace=True)
features['positive'] = features['importance'] > 0
features.set_index('Features', inplace=True)
features.importance.plot(kind='barh', figsize=(11, 6),color = features.positive.map({True: 'blue', False: 'red'}))

## ******************************************************************

## Lasso
algorithmName = "Lasso"
from sklearn.linear_model import LassoCV
insuranceCheck = LassoCV(cv=5, n_alphas=20, tol=0.0001, n_jobs=-1)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

## Nearest Neighbours Regression
algorithmName = "NearestNeighboursRegression"

from sklearn.neighbors import KNeighborsRegressor

### Let's do some hyperparameter tuning:
score=0
for n_neighbors in range(1,100):
    for leaf_size in range(1,100):
        insuranceCheck = KNeighborsRegressor(n_jobs=-1, n_neighbors=n_neighbors, weights="distance", algorithm="brute", leaf_size=leaf_size, p=2)
        insuranceCheck.fit(trainData, trainLabel)

        score_temp = insuranceCheck.score(testData, testLabel)

        if(score_temp >score):
            print("\nscore = ", score_temp * 100, "/100")
            print("n_neighbors = ", n_neighbors)
            print("leaf_size = ", leaf_size)
            score=score_temp

insuranceCheck = KNeighborsRegressor(n_jobs=-1, n_neighbors=8, weights="distance", algorithm="brute", leaf_size=1, p=2)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

## Linear SVR
algorithmName = "LinearSVR"
from sklearn.svm import LinearSVR

n=4000
insuranceCheck = LinearSVR(max_iter=n)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

## SVR different kernels.
algorithmName = "SVR_RBF"
from sklearn.svm import SVR
insuranceCheck = SVR(gamma='auto', C=1, epsilon=0.2, kernel='rbf')
    ## There are different kernels available, including polynomial.
    ## After some tinkering, RBF seems to be the best
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

## Tree regression.
algorithmName = "TreeRegression"
from sklearn.tree import DecisionTreeRegressor

## Hyper parameter optimization.
score = 0

for criterion in np.array(["mse", "friedman_mse", "mae"]):
    for splitter in np.array(["best", "random"]):
        for max_depth in np.append(range(1,10), [None]):
            print("max_depth=",max_depth)
            for min_samples_split in (np.asarray(range(25))+2):
                #print("min_samples_split=", min_samples_split)
                for min_samples_leaf in (np.asarray(range(25))+1):
                    for max_features in np.array(["log2", "auto", None]):
                        insuranceCheck = DecisionTreeRegressor(random_state=0, criterion=criterion, splitter = splitter, max_depth = max_depth, min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf, max_features = max_features)
                        insuranceCheck.fit(trainData, trainLabel)
                        score_temp = insuranceCheck.score(testData, testLabel)
                        if(score_temp > score):
                            print("score = ", score_temp * 100, "/100")
                            print("\nNEW BEST\ncriterion=", criterion, "\nsplitter =", splitter, "\nmax_depth =", max_depth, "\nmin_samples_split =", min_samples_split, "\nmin_samples_leaf =", min_samples_leaf, "\nmax_features =", max_features)
                            score = score_temp

insuranceCheck = DecisionTreeRegressor(random_state=0, criterion="mse", splitter = "random", min_samples_split=14, min_samples_leaf=4, max_features = "auto", max_depth= 9)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

## Random forest regression.
algorithmName = "RandomForestRegression"

from sklearn.ensemble import RandomForestRegressor
insuranceCheck = RandomForestRegressor(n_estimators=500, max_depth=None, random_state=0, n_jobs=-1)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

## Extra random forest regression.
algorithmName = "ExtraRandomTreesRegression"
from sklearn.ensemble import ExtraTreesRegressor
insuranceCheck = ExtraTreesRegressor(n_estimators=500, max_depth=None, min_samples_split=20, min_samples_leaf=20, n_jobs=-1)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************
## MultiLayerPerceptron Regression(NN)

from sklearn.neural_network import MLPRegressor
algorithmName = "MLP_Regresion"

for randomstate in range(10):
    insuranceCheck = MLPRegressor(max_iter=1200, hidden_layer_sizes=(30,30,30), random_state=randomstate, learning_rate="adaptive", activation="relu")
    insuranceCheck.fit(trainData, trainLabel)
    score = insuranceCheck.score(testData, testLabel)
    print("score = ", score * 100, "/100")

insuranceCheck = MLPRegressor(max_iter=1200, hidden_layer_sizes=(30,30,30), random_state=5, learning_rate="adaptive", activation="relu")
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

for i in range(len(scores)):
    print(algorithms[i], ": ", scores[i])

## And that's it. We notice that most of our algorithms do pretty well, hovering around .81. Dishonorable mention to LinearSVR.

## Overall, Trees do pretty great.
Create AlgorithmsRegression,py 2019-10-09 18:42:03 +00:00			`directory = '/home/nuno/Documents/Jobs/IDInsight'`

			`import pandas as pd`
			`import numpy as np`
			`import joblib`
			`import matplotlib`


			`## Install the dataframe`
			`insuranceDataFrame = pd.read_csv(directory +'insurance_clean_continuous.csv')`
			`insuranceDataFrame['charges']`
			`## We divide our data into training and test sets, and normalize`
			`dfTrain = insuranceDataFrame[:1000]`
			`dfTest = insuranceDataFrame[1000:1300]`
			`dfCheck = insuranceDataFrame[1300:]`

			`means = np.mean(dfTrain, axis=0)`
			`stds = np.std(dfTrain, axis=0)`

			`dfTrain = (dfTrain - means) / stds`
			`dfTest = (dfTest - means) / stds`
			`dfCheck = (dfCheck - means) / stds`

			`## We don't want this part, only when classifying:`
			`## dfTrain['charges'] = (dfTrain['charges']>=0).astype('int')`
			`## dfTest['charges'] = (dfTest['charges']>=0).astype('int')`
			`## dfCheck['charges'] = (dfCheck['charges']>=0).astype('int')`

			`## Convert our stuff to arrays`
			`trainLabel = np.asarray(dfTrain['charges'])`
			`trainData = np.asarray(dfTrain.drop('charges',1))`

			`testLabel = np.asarray(dfTest['charges'])`
			`testData = np.asarray(dfTest.drop('charges',1))`

			`## We reuse the old function`

			`def runAlgorithm(Classifier, algorithmName):`
			`global scores, algorithms`

			`Classifier.fit(trainData, trainLabel)`
			`scoreInternal = Classifier.score(testData, testLabel)`
			`print("score = ", scoreInternal * 100, "/100")`

			`scores = np.append(scores, scoreInternal) ## These are global scope variables`
			`algorithms = np.append(algorithms, algorithmName)`

			`joblib.dump([Classifier, means, stds],`
			`directory +'insuranceModel-' + algorithmName + '.pkl')`

			`algorithms = []`
			`scores = []`

			`## And we are ready!`

			`## ******************************************************************`

			`## Linear Regression`
			`algorithmName = "LinearRegression"`
			`from sklearn.linear_model import LinearRegression`
			`insuranceCheck = LinearRegression()`
			`runAlgorithm(insuranceCheck, algorithmName)`

			`### We also want to visualize some stuff this time:`

			`coeff = list(insuranceCheck.coef_)`
			`labels = list(dfTrain.drop('charges',1).columns)`
			`features = pd.DataFrame()`
			`features['Features'] = labels`
			`features['importance'] = coeff`
			`features.sort_values(by=['importance'], ascending=True, inplace=True)`
			`features['positive'] = features['importance'] > 0`
			`features.set_index('Features', inplace=True)`
			`features.importance.plot(kind='barh', figsize=(11, 6),color = features.positive.map({True: 'blue', False: 'red'}))`

			`## ******************************************************************`

			`## Lasso`
			`algorithmName = "Lasso"`
			`from sklearn.linear_model import LassoCV`
			`insuranceCheck = LassoCV(cv=5, n_alphas=20, tol=0.0001, n_jobs=-1)`
			`runAlgorithm(insuranceCheck, algorithmName)`

			`## ******************************************************************`

			`## Nearest Neighbours Regression`
			`algorithmName = "NearestNeighboursRegression"`

			`from sklearn.neighbors import KNeighborsRegressor`

			`### Let's do some hyperparameter tuning:`
			`score=0`
			`for n_neighbors in range(1,100):`
			`for leaf_size in range(1,100):`
			`insuranceCheck = KNeighborsRegressor(n_jobs=-1, n_neighbors=n_neighbors, weights="distance", algorithm="brute", leaf_size=leaf_size, p=2)`
			`insuranceCheck.fit(trainData, trainLabel)`

			`score_temp = insuranceCheck.score(testData, testLabel)`

			`if(score_temp >score):`
			`print("\nscore = ", score_temp * 100, "/100")`
			`print("n_neighbors = ", n_neighbors)`
			`print("leaf_size = ", leaf_size)`
			`score=score_temp`

			`insuranceCheck = KNeighborsRegressor(n_jobs=-1, n_neighbors=8, weights="distance", algorithm="brute", leaf_size=1, p=2)`
			`runAlgorithm(insuranceCheck, algorithmName)`

			`## ******************************************************************`

			`## Linear SVR`
			`algorithmName = "LinearSVR"`
			`from sklearn.svm import LinearSVR`

			`n=4000`
			`insuranceCheck = LinearSVR(max_iter=n)`
			`runAlgorithm(insuranceCheck, algorithmName)`

			`## ******************************************************************`

			`## SVR different kernels.`
			`algorithmName = "SVR_RBF"`
			`from sklearn.svm import SVR`
			`insuranceCheck = SVR(gamma='auto', C=1, epsilon=0.2, kernel='rbf')`
			`## There are different kernels available, including polynomial.`
			`## After some tinkering, RBF seems to be the best`
			`runAlgorithm(insuranceCheck, algorithmName)`

			`## ******************************************************************`

			`## Tree regression.`
			`algorithmName = "TreeRegression"`
			`from sklearn.tree import DecisionTreeRegressor`

			`## Hyper parameter optimization.`
			`score = 0`

			`for criterion in np.array(["mse", "friedman_mse", "mae"]):`
			`for splitter in np.array(["best", "random"]):`
			`for max_depth in np.append(range(1,10), [None]):`
			`print("max_depth=",max_depth)`
			`for min_samples_split in (np.asarray(range(25))+2):`
			`#print("min_samples_split=", min_samples_split)`
			`for min_samples_leaf in (np.asarray(range(25))+1):`
			`for max_features in np.array(["log2", "auto", None]):`
			`insuranceCheck = DecisionTreeRegressor(random_state=0, criterion=criterion, splitter = splitter, max_depth = max_depth, min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf, max_features = max_features)`
			`insuranceCheck.fit(trainData, trainLabel)`
			`score_temp = insuranceCheck.score(testData, testLabel)`
			`if(score_temp > score):`
			`print("score = ", score_temp * 100, "/100")`
			`print("\nNEW BEST\ncriterion=", criterion, "\nsplitter =", splitter, "\nmax_depth =", max_depth, "\nmin_samples_split =", min_samples_split, "\nmin_samples_leaf =", min_samples_leaf, "\nmax_features =", max_features)`
			`score = score_temp`

			`insuranceCheck = DecisionTreeRegressor(random_state=0, criterion="mse", splitter = "random", min_samples_split=14, min_samples_leaf=4, max_features = "auto", max_depth= 9)`
			`runAlgorithm(insuranceCheck, algorithmName)`

			`## ******************************************************************`

			`## Random forest regression.`
			`algorithmName = "RandomForestRegression"`

			`from sklearn.ensemble import RandomForestRegressor`
			`insuranceCheck = RandomForestRegressor(n_estimators=500, max_depth=None, random_state=0, n_jobs=-1)`
			`runAlgorithm(insuranceCheck, algorithmName)`

			`## ******************************************************************`

			`## Extra random forest regression.`
			`algorithmName = "ExtraRandomTreesRegression"`
			`from sklearn.ensemble import ExtraTreesRegressor`
			`insuranceCheck = ExtraTreesRegressor(n_estimators=500, max_depth=None, min_samples_split=20, min_samples_leaf=20, n_jobs=-1)`
			`runAlgorithm(insuranceCheck, algorithmName)`

			`## ******************************************************************`
			`## MultiLayerPerceptron Regression(NN)`

			`from sklearn.neural_network import MLPRegressor`
			`algorithmName = "MLP_Regresion"`

			`for randomstate in range(10):`
			`insuranceCheck = MLPRegressor(max_iter=1200, hidden_layer_sizes=(30,30,30), random_state=randomstate, learning_rate="adaptive", activation="relu")`
			`insuranceCheck.fit(trainData, trainLabel)`
			`score = insuranceCheck.score(testData, testLabel)`
			`print("score = ", score * 100, "/100")`

			`insuranceCheck = MLPRegressor(max_iter=1200, hidden_layer_sizes=(30,30,30), random_state=5, learning_rate="adaptive", activation="relu")`
			`runAlgorithm(insuranceCheck, algorithmName)`

			`## ******************************************************************`

			`for i in range(len(scores)):`
			`print(algorithms[i], ": ", scores[i])`

			`## And that's it. We notice that most of our algorithms do pretty well, hovering around .81. Dishonorable mention to LinearSVR.`

			`## Overall, Trees do pretty great.`