Create Clustering.py

2019-10-12 18:55:49 +02:00 · 2019-10-12 18:55:49 +02:00 · e4570449a7
commit e4570449a7
parent 2dedde124c
1 changed files with 52 additions and 0 deletions
--- a/maths-prog/MachineLearningDemystified/Clustering.py
+++ b/maths-prog/MachineLearningDemystified/Clustering.py
@ -0,0 +1,52 @@
 # Directory that holds insurance_clean_continuous.csv (read further down with pd.read_csv).
 # NOTE(review): machine-specific absolute path -- adjust before running on another machine.
 directory = '/home/nuno/Documents/Jobs/IDInsight'
 import os

 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
 # Import the algorithms
 from sklearn.cluster import KMeans
 from sklearn.cluster import AffinityPropagation
 from sklearn.cluster import MeanShift
 from sklearn.cluster import SpectralClustering
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.cluster import DBSCAN
 from sklearn.cluster import Birch
 from sklearn.mixture import GaussianMixture
 ## Load the cleaned insurance dataset
 insuranceDataFrame = pd.read_csv(directory + '/insurance_clean_continuous.csv')
 ## Gather the algorithms: each display name is paired with its (still unfitted) estimator
 named_estimators = [
     ("KMeans", KMeans(n_clusters=3, random_state=0)),
     (
         "AffinityPropagation",
         AffinityPropagation(
             affinity='euclidean',
             convergence_iter=100,
             copy=True,
             damping=.9,
             max_iter=200,
             preference=None,
             verbose=False,
         ),
     ),
     ("MeanShift", MeanShift()),
     (
         "SpectralClustering",
         SpectralClustering(n_clusters=3, affinity='nearest_neighbors', n_neighbors=20, random_state=0),
     ),
     ("AgglomerativeClustering", AgglomerativeClustering(n_clusters=3)),
     ("DBSCAN", DBSCAN(eps=500, min_samples=3)),
     ("Birch", Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True)),
 ]
 names = [label for label, _ in named_estimators]
 ## Fit every estimator on the full dataframe, in the order listed above.
 ## fit() returns the estimator itself, so the list holds the fitted models.
 # This takes a while
 clusterings = [model.fit(insuranceDataFrame) for _, model in named_estimators]
 assert len(names) == len(clusterings)
 ## Plot the plots
 sns.set()
 ## savefig does not create missing directories, so make sure the output folder exists
 os.makedirs("figures", exist_ok=True)
 ## This produces *a lot* of plots! One figure per (algorithm, column) pair,
 ## each saved as figures/<algorithm>-<column>.png
 for name, fitted in zip(names, clusterings):
     # Overwrites the "cluster" column with this algorithm's labels; the column
     # is left on insuranceDataFrame after the loop (labels of the last algorithm).
     insuranceDataFrame["cluster"] = fitted.labels_
     # One colour per distinct label; only depends on the algorithm, so build it once here
     palette = sns.color_palette("hls", len(np.unique(fitted.labels_)))
     for variable in insuranceDataFrame.columns:
         sns_plot = sns.relplot(
             x="charges",
             y=variable,
             hue="cluster",
             data=insuranceDataFrame,
             palette=palette,
         )
         plt.subplots_adjust(top=0.9)  ## Makes more room at the top
         plt.title(name + ": charges ~ " + variable, y=1.3)
         sns_plot.savefig("figures/" + name + "-" + variable + ".png")
         # relplot opens a new figure every call; close it once saved so that
         # open figures do not accumulate over the whole run
         plt.close()
 ## Gaussian Requires different syntax: cluster assignments come from predict()
 ## rather than from a labels_ attribute.
 # By this point insuranceDataFrame may already carry a "cluster" column written
 # by the plotting loop above; drop it so earlier labels are not used as a feature.
 features = insuranceDataFrame.drop(columns="cluster", errors="ignore")
 clustering = GaussianMixture(n_components=3).fit(features)
 # Predict once and reuse the result for both the palette size and the column
 labels = clustering.predict(features)
 n = len(np.unique(labels))
 insuranceDataFrame["cluster"] = labels
 palette = sns.color_palette("hls", n)
 ## savefig does not create missing directories, so make sure the output folder exists
 os.makedirs("figures", exist_ok=True)
 for variable in insuranceDataFrame.columns:
     sns_plot = sns.relplot(
         x="charges",
         y=variable,
         hue="cluster",
         data=insuranceDataFrame,
         palette=palette,
     )
     plt.subplots_adjust(top=0.9)  ## Makes more room at the top
     plt.title("GaussianMixture" + ": charges ~ " + variable, y=1.3)
     sns_plot.savefig("figures/" + "GaussianMixture" + "-" + variable + ".png")
     # Close the figure relplot opened so figures do not pile up in memory
     plt.close()