Create Clustering.py

This commit is contained in:
Nuño Sempere 2019-10-12 18:55:49 +02:00 committed by GitHub
parent 2dedde124c
commit e4570449a7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -0,0 +1,52 @@
directory = '/home/nuno/Documents/Jobs/IDInsight'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import the algorithms
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture
## Install the dataframe
insuranceDataFrame = pd.read_csv(directory + '/insurance_clean_continuous.csv')
## Gather the algorithms
clusterings = [KMeans(n_clusters=3, random_state=0).fit(insuranceDataFrame), AffinityPropagation(affinity='euclidean', convergence_iter=100, copy=True, damping=.9, max_iter=200, preference=None, verbose=False).fit(insuranceDataFrame), MeanShift().fit(insuranceDataFrame), SpectralClustering(n_clusters=3, affinity='nearest_neighbors', n_neighbors = 20, random_state=0).fit(insuranceDataFrame), AgglomerativeClustering(n_clusters=3).fit(insuranceDataFrame), DBSCAN(eps=500, min_samples=3).fit(insuranceDataFrame), Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True).fit(insuranceDataFrame)]
# This takes a while
names = ["KMeans", "AffinityPropagation", "MeanShift", "SpectralClustering", "AgglomerativeClustering", "DBSCAN", "Birch"]
assert(len(names)==len(clusterings))
## Plot the plots
sns.set()
## This produces *a lot* of plots!
for k in range(len(names)):
insuranceDataFrame["cluster"] = clusterings[k].labels_
n= len(np.unique(clusterings[k].labels_))
for variable in insuranceDataFrame.columns:
sns_plot = sns.relplot(x="charges", y=variable, hue="cluster", data=insuranceDataFrame, palette = sns.color_palette("hls", n))
plt.subplots_adjust(top=0.9) ## Makes more room at the top
plt.title(names[k] + ": charges ~ " + variable, y=1.3)
sns_plot.savefig("figures/" + names[k] + "-" + variable + ".png")
## Gaussian Requires different syntax
clustering = GaussianMixture(n_components=3).fit(insuranceDataFrame)
n= len(np.unique(clustering.predict(insuranceDataFrame)))
insuranceDataFrame["cluster"] = clustering.predict(insuranceDataFrame)
for variable in insuranceDataFrame.columns:
sns_plot = sns.relplot(x="charges", y=variable, hue="cluster", data=insuranceDataFrame,palette=sns.color_palette("hls", n))
plt.subplots_adjust(top=0.9) ## Makes more room at the top
plt.title("GaussianMixture" + ": charges ~ " + variable, y=1.3)
sns_plot.savefig("figures/" +"GaussianMixture" + "-" + variable + ".png")