nunosempere.github.io/maths-prog/MachineLearningDemystified/Clustering.py

53 lines
2.6 KiB
Python
Raw Normal View History

2019-10-12 16:55:49 +00:00
directory = '/home/nuno/Documents/Jobs/IDInsight'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import the algorithms
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture
## Install the dataframe
insuranceDataFrame = pd.read_csv(directory + '/insurance_clean_continuous.csv')
## Gather the algorithms
clusterings = [KMeans(n_clusters=3, random_state=0).fit(insuranceDataFrame), AffinityPropagation(affinity='euclidean', convergence_iter=100, copy=True, damping=.9, max_iter=200, preference=None, verbose=False).fit(insuranceDataFrame), MeanShift().fit(insuranceDataFrame), SpectralClustering(n_clusters=3, affinity='nearest_neighbors', n_neighbors = 20, random_state=0).fit(insuranceDataFrame), AgglomerativeClustering(n_clusters=3).fit(insuranceDataFrame), DBSCAN(eps=500, min_samples=3).fit(insuranceDataFrame), Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True).fit(insuranceDataFrame)]
# This takes a while
names = ["KMeans", "AffinityPropagation", "MeanShift", "SpectralClustering", "AgglomerativeClustering", "DBSCAN", "Birch"]
assert(len(names)==len(clusterings))
## Plot the plots
sns.set()
## This produces *a lot* of plots!
for k in range(len(names)):
insuranceDataFrame["cluster"] = clusterings[k].labels_
n= len(np.unique(clusterings[k].labels_))
for variable in insuranceDataFrame.columns:
sns_plot = sns.relplot(x="charges", y=variable, hue="cluster", data=insuranceDataFrame, palette = sns.color_palette("hls", n))
plt.subplots_adjust(top=0.9) ## Makes more room at the top
plt.title(names[k] + ": charges ~ " + variable, y=1.3)
sns_plot.savefig("figures/" + names[k] + "-" + variable + ".png")
## Gaussian Requires different syntax
clustering = GaussianMixture(n_components=3).fit(insuranceDataFrame)
n= len(np.unique(clustering.predict(insuranceDataFrame)))
insuranceDataFrame["cluster"] = clustering.predict(insuranceDataFrame)
for variable in insuranceDataFrame.columns:
sns_plot = sns.relplot(x="charges", y=variable, hue="cluster", data=insuranceDataFrame,palette=sns.color_palette("hls", n))
plt.subplots_adjust(top=0.9) ## Makes more room at the top
plt.title("GaussianMixture" + ": charges ~ " + variable, y=1.3)
sns_plot.savefig("figures/" +"GaussianMixture" + "-" + variable + ".png")