directory = '/home/nuno/Documents/Jobs/IDInsight/'

import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt


## Load the dataset into a dataframe
insuranceDataFrame = pd.read_csv(directory + 'insurance_clean_continuous.csv')
insuranceDataFrame['charges']  ## quick look at the target column (only displays in an interactive session)

## We split the data into training, test, and hold-out sets, and normalize
## everything with the training-set mean and standard deviation.
dfTrain = insuranceDataFrame[:1000]
dfTest = insuranceDataFrame[1000:1300]
dfCheck = insuranceDataFrame[1300:]

means = np.mean(dfTrain, axis=0)
stds = np.std(dfTrain, axis=0)

dfTrain = (dfTrain - means) / stds
dfTest = (dfTest - means) / stds
dfCheck = (dfCheck - means) / stds

## Only needed when classifying (to binarize the target); not used for regression:
## dfTrain['charges'] = (dfTrain['charges'] >= 0).astype('int')
## dfTest['charges'] = (dfTest['charges'] >= 0).astype('int')
## dfCheck['charges'] = (dfCheck['charges'] >= 0).astype('int')
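
## Optional sanity check: after normalizing, every training column should have
## mean ~0 and standard deviation ~1.
## print(dfTrain.mean().round(3))
## print(dfTrain.std().round(3))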

## Convert the features and the target ('charges') to NumPy arrays
trainLabel = np.asarray(dfTrain['charges'])
trainData = np.asarray(dfTrain.drop('charges', axis=1))

testLabel = np.asarray(dfTest['charges'])
testData = np.asarray(dfTest.drop('charges', axis=1))

## We reuse the helper function from before: fit the model, score it on the test
## set, record the result, and save the fitted model together with the
## normalization constants.

def runAlgorithm(Classifier, algorithmName):
    global scores, algorithms

    Classifier.fit(trainData, trainLabel)
    scoreInternal = Classifier.score(testData, testLabel)
    print("score = ", scoreInternal * 100, "/100")

    scores = np.append(scores, scoreInternal)  ## These are global-scope variables
    algorithms = np.append(algorithms, algorithmName)

    joblib.dump([Classifier, means, stds],
                directory + 'insuranceModel-' + algorithmName + '.pkl')

algorithms = []
scores = []

## And we are ready!

## ******************************************************************

## Linear Regression
algorithmName = "LinearRegression"
from sklearn.linear_model import LinearRegression
insuranceCheck = LinearRegression()
runAlgorithm(insuranceCheck, algorithmName)

### We also visualize the coefficients this time. Since the features were
### standardized, the coefficients are directly comparable as importances.

coeff = list(insuranceCheck.coef_)
labels = list(dfTrain.drop('charges', axis=1).columns)
features = pd.DataFrame()
features['Features'] = labels
features['importance'] = coeff
features.sort_values(by=['importance'], ascending=True, inplace=True)
features['positive'] = features['importance'] > 0
features.set_index('Features', inplace=True)
features.importance.plot(kind='barh', figsize=(11, 6),
                         color=features.positive.map({True: 'blue', False: 'red'}))
plt.show()

## ******************************************************************

## Lasso
algorithmName = "Lasso"
from sklearn.linear_model import LassoCV
insuranceCheck = LassoCV(cv=5, n_alphas=20, tol=0.0001, n_jobs=-1)
runAlgorithm(insuranceCheck, algorithmName)
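
## LassoCV chooses the regularization strength by cross-validation; for
## reference, the selected value can be read off the fitted model:
print("chosen alpha = ", insuranceCheck.alpha_)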

## ******************************************************************

## Nearest Neighbours Regression
algorithmName = "NearestNeighboursRegression"

from sklearn.neighbors import KNeighborsRegressor

### Let's do some hyperparameter tuning.
### Note: leaf_size only matters for the ball_tree/kd_tree algorithms, so with
### algorithm="brute" the inner loop does not actually change the model.
score = 0
for n_neighbors in range(1, 100):
    for leaf_size in range(1, 100):
        insuranceCheck = KNeighborsRegressor(n_jobs=-1, n_neighbors=n_neighbors,
                                             weights="distance", algorithm="brute",
                                             leaf_size=leaf_size, p=2)
        insuranceCheck.fit(trainData, trainLabel)

        score_temp = insuranceCheck.score(testData, testLabel)

        if score_temp > score:
            print("\nscore = ", score_temp * 100, "/100")
            print("n_neighbors = ", n_neighbors)
            print("leaf_size = ", leaf_size)
            score = score_temp

insuranceCheck = KNeighborsRegressor(n_jobs=-1, n_neighbors=8, weights="distance",
                                     algorithm="brute", leaf_size=1, p=2)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

## Linear SVR
algorithmName = "LinearSVR"
from sklearn.svm import LinearSVR

n = 4000
insuranceCheck = LinearSVR(max_iter=n)
runAlgorithm(insuranceCheck, algorithmName)
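
## LinearSVR is sensitive to C and epsilon; an optional sweep in the same style
## as the other tuning loops could look like this (the values are illustrative,
## not something tuned above):
## for C in [0.1, 1, 10, 100]:
##     model = LinearSVR(max_iter=n, C=C)
##     model.fit(trainData, trainLabel)
##     print("C = ", C, " score = ", model.score(testData, testLabel))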

## ******************************************************************

## SVR with different kernels.
algorithmName = "SVR_RBF"
from sklearn.svm import SVR
insuranceCheck = SVR(gamma='auto', C=1, epsilon=0.2, kernel='rbf')
## There are different kernels available, including polynomial.
## After some tinkering, RBF seems to be the best.
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

## Tree regression.
algorithmName = "TreeRegression"
from sklearn.tree import DecisionTreeRegressor

## Hyperparameter optimization. (The criterion names below follow the older
## scikit-learn API; newer releases call "mse" and "mae" "squared_error" and
## "absolute_error".)
score = 0

for criterion in ["mse", "friedman_mse", "mae"]:
    for splitter in ["best", "random"]:
        for max_depth in list(range(1, 10)) + [None]:
            print("max_depth=", max_depth)
            for min_samples_split in range(2, 27):
                # print("min_samples_split=", min_samples_split)
                for min_samples_leaf in range(1, 26):
                    for max_features in ["log2", "auto", None]:
                        insuranceCheck = DecisionTreeRegressor(
                            random_state=0, criterion=criterion, splitter=splitter,
                            max_depth=max_depth, min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf, max_features=max_features)
                        insuranceCheck.fit(trainData, trainLabel)
                        score_temp = insuranceCheck.score(testData, testLabel)
                        if score_temp > score:
                            print("score = ", score_temp * 100, "/100")
                            print("\nNEW BEST\ncriterion =", criterion,
                                  "\nsplitter =", splitter,
                                  "\nmax_depth =", max_depth,
                                  "\nmin_samples_split =", min_samples_split,
                                  "\nmin_samples_leaf =", min_samples_leaf,
                                  "\nmax_features =", max_features)
                            score = score_temp

insuranceCheck = DecisionTreeRegressor(random_state=0, criterion="mse", splitter="random",
                                       min_samples_split=14, min_samples_leaf=4,
                                       max_features="auto", max_depth=9)
runAlgorithm(insuranceCheck, algorithmName)
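
## The exhaustive loop above fits a very large number of trees. For reference, a
## sketch of the same grid written with scikit-learn's GridSearchCV; note that it
## scores by cross-validation on the training data rather than on the test split,
## so it is an alternative, not a drop-in replacement:
## from sklearn.model_selection import GridSearchCV
## paramGrid = {"criterion": ["mse", "friedman_mse", "mae"],
##              "splitter": ["best", "random"],
##              "max_depth": list(range(1, 10)) + [None],
##              "min_samples_split": list(range(2, 27)),
##              "min_samples_leaf": list(range(1, 26)),
##              "max_features": ["log2", "auto", None]}
## gridSearch = GridSearchCV(DecisionTreeRegressor(random_state=0), paramGrid, cv=5, n_jobs=-1)
## gridSearch.fit(trainData, trainLabel)
## print(gridSearch.best_params_, gridSearch.best_score_)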

## ******************************************************************

## Random forest regression.
algorithmName = "RandomForestRegression"

from sklearn.ensemble import RandomForestRegressor
insuranceCheck = RandomForestRegressor(n_estimators=500, max_depth=None, random_state=0, n_jobs=-1)
runAlgorithm(insuranceCheck, algorithmName)
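
## Like the coefficient plot above, the fitted forest exposes relative feature
## importances; reusing the column labels from earlier for a quick look:
for label, importance in zip(labels, insuranceCheck.feature_importances_):
    print(label, ": ", round(float(importance), 3))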

## ******************************************************************

## Extra-trees regression (extremely randomized trees).
algorithmName = "ExtraRandomTreesRegression"
from sklearn.ensemble import ExtraTreesRegressor
insuranceCheck = ExtraTreesRegressor(n_estimators=500, max_depth=None, min_samples_split=20,
                                     min_samples_leaf=20, n_jobs=-1)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************
## MultiLayerPerceptron regression (NN)

from sklearn.neural_network import MLPRegressor
algorithmName = "MLP_Regression"

### MLP results vary with the random initialization, so try a few seeds:
for randomstate in range(10):
    insuranceCheck = MLPRegressor(max_iter=1200, hidden_layer_sizes=(30, 30, 30),
                                  random_state=randomstate, learning_rate="adaptive",
                                  activation="relu")
    insuranceCheck.fit(trainData, trainLabel)
    score = insuranceCheck.score(testData, testLabel)
    print("score = ", score * 100, "/100")

insuranceCheck = MLPRegressor(max_iter=1200, hidden_layer_sizes=(30, 30, 30),
                              random_state=5, learning_rate="adaptive", activation="relu")
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

## Compare all the recorded test scores:
for i in range(len(scores)):
    print(algorithms[i], ": ", scores[i])
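
## As a final sketch of how the saved bundles can be reused: reload one of the
## dumps from above and score it on the held-out dfCheck rows (already
## normalized with the training-set means and stds). savedMeans/savedStds are
## what brand-new raw rows would need to be normalized with first.
model, savedMeans, savedStds = joblib.load(directory + 'insuranceModel-RandomForestRegression.pkl')
checkLabel = np.asarray(dfCheck['charges'])
checkData = np.asarray(dfCheck.drop('charges', axis=1))
print("hold-out score = ", model.score(checkData, checkLabel) * 100, "/100")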

## And that's it. Most of our algorithms do pretty well, with test R^2 scores
## hovering around 0.81. Dishonorable mention to LinearSVR.

## Overall, the tree-based models do pretty great.