From e4570449a7c9a0c676e62ec1a336b5a8ee947dd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nu=C3=B1o=20Sempere?= Date: Sat, 12 Oct 2019 18:55:49 +0200 Subject: [PATCH] Create Clustering.py --- .../MachineLearningDemystified/Clustering.py | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 maths-prog/MachineLearningDemystified/Clustering.py diff --git a/maths-prog/MachineLearningDemystified/Clustering.py b/maths-prog/MachineLearningDemystified/Clustering.py new file mode 100644 index 0000000..04dd956 --- /dev/null +++ b/maths-prog/MachineLearningDemystified/Clustering.py @@ -0,0 +1,52 @@ +directory = '/home/nuno/Documents/Jobs/IDInsight' + +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns + +# Import the algorithms +from sklearn.cluster import KMeans +from sklearn.cluster import AffinityPropagation +from sklearn.cluster import MeanShift +from sklearn.cluster import SpectralClustering +from sklearn.cluster import AgglomerativeClustering +from sklearn.cluster import DBSCAN +from sklearn.cluster import Birch +from sklearn.mixture import GaussianMixture + +## Install the dataframe +insuranceDataFrame = pd.read_csv(directory + '/insurance_clean_continuous.csv') + +## Gather the algorithms +clusterings = [KMeans(n_clusters=3, random_state=0).fit(insuranceDataFrame), AffinityPropagation(affinity='euclidean', convergence_iter=100, copy=True, damping=.9, max_iter=200, preference=None, verbose=False).fit(insuranceDataFrame), MeanShift().fit(insuranceDataFrame), SpectralClustering(n_clusters=3, affinity='nearest_neighbors', n_neighbors = 20, random_state=0).fit(insuranceDataFrame), AgglomerativeClustering(n_clusters=3).fit(insuranceDataFrame), DBSCAN(eps=500, min_samples=3).fit(insuranceDataFrame), Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True).fit(insuranceDataFrame)] + # This takes a while +names = ["KMeans", "AffinityPropagation", "MeanShift", "SpectralClustering", "AgglomerativeClustering", "DBSCAN", "Birch"] +assert(len(names)==len(clusterings)) + +## Plot the plots +sns.set() + +## This produces *a lot* of plots! +for k in range(len(names)): + + insuranceDataFrame["cluster"] = clusterings[k].labels_ + + n= len(np.unique(clusterings[k].labels_)) + + for variable in insuranceDataFrame.columns: + sns_plot = sns.relplot(x="charges", y=variable, hue="cluster", data=insuranceDataFrame, palette = sns.color_palette("hls", n)) + plt.subplots_adjust(top=0.9) ## Makes more room at the top + plt.title(names[k] + ": charges ~ " + variable, y=1.3) + sns_plot.savefig("figures/" + names[k] + "-" + variable + ".png") + +## Gaussian Requires different syntax +clustering = GaussianMixture(n_components=3).fit(insuranceDataFrame) +n= len(np.unique(clustering.predict(insuranceDataFrame))) +insuranceDataFrame["cluster"] = clustering.predict(insuranceDataFrame) + +for variable in insuranceDataFrame.columns: + sns_plot = sns.relplot(x="charges", y=variable, hue="cluster", data=insuranceDataFrame,palette=sns.color_palette("hls", n)) + plt.subplots_adjust(top=0.9) ## Makes more room at the top + plt.title("GaussianMixture" + ": charges ~ " + variable, y=1.3) + sns_plot.savefig("figures/" +"GaussianMixture" + "-" + variable + ".png")