Create AlgorithmsRegression,py

parent a17172db7b
commit 0ee3fb2c8e

maths-prog/MachineLearningDemystified/AlgorithmsRegression,py | 196 | Normal file

@@ -0,0 +1,196 @@
directory = '/home/nuno/Documents/Jobs/IDInsight/'  ## trailing slash matters: paths below are built by plain concatenation

import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt


## Load the dataframe
insuranceDataFrame = pd.read_csv(directory + 'insurance_clean_continuous.csv')
insuranceDataFrame['charges']  ## quick peek at the target column (only displays in a notebook)
## We divide our data into training and test sets, and normalize
dfTrain = insuranceDataFrame[:1000]
dfTest = insuranceDataFrame[1000:1300]
dfCheck = insuranceDataFrame[1300:]

means = np.mean(dfTrain, axis=0)
stds = np.std(dfTrain, axis=0)

## Normalize with training-set statistics only, so the test data stay unseen
dfTrain = (dfTrain - means) / stds
dfTest = (dfTest - means) / stds
dfCheck = (dfCheck - means) / stds

## We don't want this part, only when classifying:
## dfTrain['charges'] = (dfTrain['charges']>=0).astype('int')
## dfTest['charges'] = (dfTest['charges']>=0).astype('int')
## dfCheck['charges'] = (dfCheck['charges']>=0).astype('int')

## Convert the data frames to NumPy arrays
trainLabel = np.asarray(dfTrain['charges'])
trainData = np.asarray(dfTrain.drop('charges', axis=1))

testLabel = np.asarray(dfTest['charges'])
testData = np.asarray(dfTest.drop('charges', axis=1))

## We reuse the old function

def runAlgorithm(Classifier, algorithmName):
    global scores, algorithms

    Classifier.fit(trainData, trainLabel)
    scoreInternal = Classifier.score(testData, testLabel)  ## .score() is R^2 for regressors
    print("score = ", scoreInternal * 100, "/100")

    scores = np.append(scores, scoreInternal)  ## These are global-scope variables
    algorithms = np.append(algorithms, algorithmName)

    joblib.dump([Classifier, means, stds],
                directory + 'insuranceModel-' + algorithmName + '.pkl')

algorithms = []
scores = []

## And we are ready!

## ******************************************************************

## Linear Regression
algorithmName = "LinearRegression"
from sklearn.linear_model import LinearRegression
insuranceCheck = LinearRegression()
runAlgorithm(insuranceCheck, algorithmName)

### We also want to visualize the fitted coefficients this time:

coeff = list(insuranceCheck.coef_)
labels = list(dfTrain.drop('charges', axis=1).columns)
features = pd.DataFrame()
features['Features'] = labels
features['importance'] = coeff
features.sort_values(by=['importance'], ascending=True, inplace=True)
features['positive'] = features['importance'] > 0
features.set_index('Features', inplace=True)
features.importance.plot(kind='barh', figsize=(11, 6),
                         color=features.positive.map({True: 'blue', False: 'red'}))
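
## When this runs as a plain script rather than in a notebook, the figure
## needs an explicit show() call to appear (uses the pyplot import above):
plt.tight_layout()
plt.show()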

## ******************************************************************

## Lasso
algorithmName = "Lasso"
from sklearn.linear_model import LassoCV
insuranceCheck = LassoCV(cv=5, n_alphas=20, tol=0.0001, n_jobs=-1)
runAlgorithm(insuranceCheck, algorithmName)
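
## LassoCV picks the regularization strength by cross-validation; the chosen
## value is exposed on the fitted estimator as alpha_, worth printing to see
## how much shrinkage the data actually called for:
print("chosen alpha = ", insuranceCheck.alpha_)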

## ******************************************************************

## Nearest Neighbours Regression
algorithmName = "NearestNeighboursRegression"

from sklearn.neighbors import KNeighborsRegressor

### Let's do some hyperparameter tuning
### (note: leaf_size only affects the tree-based search structures, so with
### algorithm="brute" the inner loop cannot change the results):
score = 0
for n_neighbors in range(1, 100):
    for leaf_size in range(1, 100):
        insuranceCheck = KNeighborsRegressor(n_jobs=-1, n_neighbors=n_neighbors,
                                             weights="distance", algorithm="brute",
                                             leaf_size=leaf_size, p=2)
        insuranceCheck.fit(trainData, trainLabel)

        score_temp = insuranceCheck.score(testData, testLabel)

        if score_temp > score:
            print("\nscore = ", score_temp * 100, "/100")
            print("n_neighbors = ", n_neighbors)
            print("leaf_size = ", leaf_size)
            score = score_temp

insuranceCheck = KNeighborsRegressor(n_jobs=-1, n_neighbors=8, weights="distance",
                                     algorithm="brute", leaf_size=1, p=2)
runAlgorithm(insuranceCheck, algorithmName)
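
## The nested loops above are a hand-rolled grid search. As a sketch of an
## alternative (not what the original did), scikit-learn's GridSearchCV runs
## the same sweep with built-in cross-validation on the training set:
from sklearn.model_selection import GridSearchCV
paramGrid = {"n_neighbors": list(range(1, 100))}  ## leaf_size dropped: unused with brute force
gridSearch = GridSearchCV(KNeighborsRegressor(weights="distance", algorithm="brute", p=2),
                          paramGrid, cv=5, n_jobs=-1)
gridSearch.fit(trainData, trainLabel)
print(gridSearch.best_params_, gridSearch.best_score_)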

## ******************************************************************

## Linear SVR
algorithmName = "LinearSVR"
from sklearn.svm import LinearSVR

n = 4000
insuranceCheck = LinearSVR(max_iter=n)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************

## SVR with different kernels.
algorithmName = "SVR_RBF"
from sklearn.svm import SVR
insuranceCheck = SVR(gamma='auto', C=1, epsilon=0.2, kernel='rbf')
## There are different kernels available, including polynomial;
## after some tinkering, RBF seems to be the best.
runAlgorithm(insuranceCheck, algorithmName)
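
## A quick sketch of that tinkering (an assumption, not the exact procedure):
## loop over the built-in kernels and compare test scores directly.
for kernel in ["linear", "poly", "rbf", "sigmoid"]:
    svrCheck = SVR(gamma='auto', C=1, epsilon=0.2, kernel=kernel)
    svrCheck.fit(trainData, trainLabel)
    print(kernel, ": ", svrCheck.score(testData, testLabel))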

## ******************************************************************

## Tree regression.
algorithmName = "TreeRegression"
from sklearn.tree import DecisionTreeRegressor

## Hyperparameter optimization.
score = 0

## (newer scikit-learn releases rename these criteria to "squared_error",
## "friedman_mse" and "absolute_error")
for criterion in ["mse", "friedman_mse", "mae"]:
    for splitter in ["best", "random"]:
        for max_depth in np.append(range(1, 10), [None]):
            print("max_depth = ", max_depth)
            for min_samples_split in (np.asarray(range(25)) + 2):
                #print("min_samples_split = ", min_samples_split)
                for min_samples_leaf in (np.asarray(range(25)) + 1):
                    for max_features in ["log2", "auto", None]:
                        insuranceCheck = DecisionTreeRegressor(
                            random_state=0, criterion=criterion, splitter=splitter,
                            max_depth=max_depth, min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf, max_features=max_features)
                        insuranceCheck.fit(trainData, trainLabel)
                        score_temp = insuranceCheck.score(testData, testLabel)
                        if score_temp > score:
                            print("score = ", score_temp * 100, "/100")
                            print("\nNEW BEST\ncriterion =", criterion,
                                  "\nsplitter =", splitter,
                                  "\nmax_depth =", max_depth,
                                  "\nmin_samples_split =", min_samples_split,
                                  "\nmin_samples_leaf =", min_samples_leaf,
                                  "\nmax_features =", max_features)
                            score = score_temp

insuranceCheck = DecisionTreeRegressor(random_state=0, criterion="mse",
                                       splitter="random", min_samples_split=14,
                                       min_samples_leaf=4, max_features="auto",
                                       max_depth=9)
runAlgorithm(insuranceCheck, algorithmName)
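
## Once the final tree is refitted by runAlgorithm, its learned rules can be
## dumped as plain text with sklearn.tree's export_text; `labels` still holds
## the feature column names from the linear-regression section above.
from sklearn.tree import export_text
print(export_text(insuranceCheck, feature_names=labels))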

## ******************************************************************

## Random forest regression.
algorithmName = "RandomForestRegression"

from sklearn.ensemble import RandomForestRegressor
insuranceCheck = RandomForestRegressor(n_estimators=500, max_depth=None,
                                       random_state=0, n_jobs=-1)
runAlgorithm(insuranceCheck, algorithmName)
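
## Forests expose impurity-based importances through feature_importances_;
## pairing them with the column names is a quick check of what drives charges.
for name, importance in sorted(zip(labels, insuranceCheck.feature_importances_),
                               key=lambda pair: -pair[1]):
    print(name, ": ", importance)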

## ******************************************************************

## Extra trees (extremely randomized trees) regression.
algorithmName = "ExtraRandomTreesRegression"
from sklearn.ensemble import ExtraTreesRegressor
insuranceCheck = ExtraTreesRegressor(n_estimators=500, max_depth=None,
                                     min_samples_split=20, min_samples_leaf=20,
                                     n_jobs=-1)
runAlgorithm(insuranceCheck, algorithmName)

## ******************************************************************
## Multi-layer perceptron regression (NN)

from sklearn.neural_network import MLPRegressor
algorithmName = "MLP_Regression"

## MLPs are sensitive to their random initialization, so try a few seeds:
for randomstate in range(10):
    insuranceCheck = MLPRegressor(max_iter=1200, hidden_layer_sizes=(30, 30, 30),
                                  random_state=randomstate,
                                  learning_rate="adaptive", activation="relu")
    insuranceCheck.fit(trainData, trainLabel)
    score = insuranceCheck.score(testData, testLabel)
    print("score = ", score * 100, "/100")

insuranceCheck = MLPRegressor(max_iter=1200, hidden_layer_sizes=(30, 30, 30),
                              random_state=5, learning_rate="adaptive",
                              activation="relu")
runAlgorithm(insuranceCheck, algorithmName)
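
## MLPRegressor records its training loss per iteration in loss_curve_;
## plotting it is a quick convergence check (pyplot imported at the top):
plt.plot(insuranceCheck.loss_curve_)
plt.xlabel("iteration")
plt.ylabel("training loss")
plt.show()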

## ******************************************************************

for i in range(len(scores)):
    print(algorithms[i], ": ", scores[i])
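
## Pulling out the winner (np.argmax works because runAlgorithm turned the
## global lists into NumPy arrays on the first append):
best = np.argmax(scores)
print("best: ", algorithms[best], " with R^2 = ", scores[best])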

## And that's it. We notice that most of our algorithms do pretty well,
## hovering around an R^2 of .81. Dishonorable mention to LinearSVR.

## Overall, trees do pretty great.