53 lines
2.6 KiB
Python
53 lines
2.6 KiB
Python
|
# Folder that holds the cleaned insurance CSV read further down.
# NOTE: machine-specific absolute path; adjust when running elsewhere.
directory = "/home/nuno/Documents/Jobs/IDInsight"
|
||
|
|
||
|
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Import the algorithms
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture
|
||
|
|
||
|
## Load the dataframe
# The CSV lives directly inside `directory`; every clustering below is fitted
# on this frame as-is.
insurance_csv_path = f"{directory}/insurance_clean_continuous.csv"
insuranceDataFrame = pd.read_csv(insurance_csv_path)
|
||
|
|
||
|
## Gather the algorithms
# Each estimator is declared next to its display name, so the two lists
# derived below (`names`, `clusterings`) are aligned by construction.
_named_estimators = [
    ("KMeans", KMeans(n_clusters=3, random_state=0)),
    (
        "AffinityPropagation",
        AffinityPropagation(
            affinity='euclidean',
            convergence_iter=100,
            copy=True,
            damping=.9,
            max_iter=200,
            preference=None,
            verbose=False,
        ),
    ),
    ("MeanShift", MeanShift()),
    (
        "SpectralClustering",
        SpectralClustering(
            n_clusters=3,
            affinity='nearest_neighbors',
            n_neighbors=20,
            random_state=0,
        ),
    ),
    ("AgglomerativeClustering", AgglomerativeClustering(n_clusters=3)),
    ("DBSCAN", DBSCAN(eps=500, min_samples=3)),
    (
        "Birch",
        Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True),
    ),
]

# This takes a while
# The estimators are fitted one after another, in the order listed above;
# fit() hands back the fitted estimator, which is what gets stored.
clusterings = [
    estimator.fit(insuranceDataFrame) for _, estimator in _named_estimators
]
names = [label for label, _ in _named_estimators]
assert len(names) == len(clusterings)
|
||
|
|
||
|
## Plot the plots
sns.set()

# All figures are written below ./figures; create it up front so the first
# savefig does not fail on a fresh checkout.
os.makedirs("figures", exist_ok=True)

## This produces *a lot* of plots!
# One scatter plot of "charges" against every column, for every fitted
# clustering algorithm, coloured by that algorithm's cluster labels.
for name, clustering in zip(names, clusterings):
    labels = clustering.labels_

    # NOTE: this overwrites the "cluster" column on the shared dataframe and
    # leaves it there after the loop (labels of the last algorithm).
    insuranceDataFrame["cluster"] = labels

    # One palette colour per distinct label found by this algorithm.
    n = len(np.unique(labels))
    palette = sns.color_palette("hls", n)

    # The column list includes "charges" and "cluster" themselves, so those
    # two plots are produced as well.
    for variable in insuranceDataFrame.columns:
        sns_plot = sns.relplot(
            x="charges",
            y=variable,
            hue="cluster",
            data=insuranceDataFrame,
            palette=palette,
        )
        plt.subplots_adjust(top=0.9)  ## Makes more room at the top
        plt.title(name + ": charges ~ " + variable, y=1.3)
        sns_plot.savefig("figures/" + name + "-" + variable + ".png")

        # relplot opens a new figure on every call; close it once saved so
        # open figures do not pile up in memory over the whole run.
        plt.close(sns_plot.fig)
|
||
|
|
||
|
## Gaussian requires different syntax
# The mixture model's labels are obtained through predict() rather than read
# from a `labels_` attribute as for the estimators above.

# Fit on the feature columns only. By this point the dataframe carries a
# "cluster" column left behind by the previous algorithm; feeding it to the
# mixture model would make another algorithm's output one of its features.
features = insuranceDataFrame.drop(columns=["cluster"], errors="ignore")

# random_state pinned for reproducible figures, matching the KMeans and
# SpectralClustering settings used earlier in this script.
clustering = GaussianMixture(n_components=3, random_state=0).fit(features)

# Predict once and reuse the result for both the colour count and the column.
labels = clustering.predict(features)
n = len(np.unique(labels))
insuranceDataFrame["cluster"] = labels

# Figures are written below ./figures; make sure it exists before saving.
os.makedirs("figures", exist_ok=True)

palette = sns.color_palette("hls", n)

# One scatter plot of "charges" against every column (including "charges"
# and "cluster" themselves), coloured by the mixture component.
for variable in insuranceDataFrame.columns:
    sns_plot = sns.relplot(
        x="charges",
        y=variable,
        hue="cluster",
        data=insuranceDataFrame,
        palette=palette,
    )
    plt.subplots_adjust(top=0.9)  ## Makes more room at the top
    plt.title("GaussianMixture" + ": charges ~ " + variable, y=1.3)
    sns_plot.savefig("figures/" + "GaussianMixture" + "-" + variable + ".png")

    # relplot opens a new figure on every call; close it once saved so open
    # figures do not accumulate.
    plt.close(sns_plot.fig)
|