From 7f500e8250f703338e1d212da343c6f6ed4fe915 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nu=C3=B1o=20Sempere?=
Date: Wed, 9 Oct 2019 20:40:56 +0200
Subject: [PATCH] Create CleaningUpData.py

---
 .../CleaningUpData.py                         | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 maths-prog/MachineLearningDemystified/CleaningUpData.py

diff --git a/maths-prog/MachineLearningDemystified/CleaningUpData.py b/maths-prog/MachineLearningDemystified/CleaningUpData.py
new file mode 100644
index 0000000..6723042
--- /dev/null
+++ b/maths-prog/MachineLearningDemystified/CleaningUpData.py
@@ -0,0 +1,46 @@
+"""Clean up the insurance dataset: encode categorical columns as numbers."""
+
+import pandas as pd
+import numpy as np
+
+directory = '/home/nuno/Documents/Jobs/IDInsight'
+
+## Load the dataframe
+insuranceDataFrame = pd.read_csv(directory + '/insurance.csv')
+
+## Some functions for cleaning up, inspired by R's ifelse function
+
+def ifelse1(x, listOfChecks, yesLabel, noLabel):
+    """Return yesLabel if x is in listOfChecks, otherwise noLabel."""
+    if x in listOfChecks:
+        return yesLabel
+    return noLabel
+
+def ifelse2(x, listOfChecks, listOfLabels):
+    """Return the label paired with the first check equal to x.
+
+    listOfChecks and listOfLabels are matched by position. Returns None
+    when x equals none of the checks.
+    """
+    for i, check in enumerate(listOfChecks):
+        if x == check:
+            return listOfLabels[i]
+    return None
+
+## Binary columns: 1 for 'male' / 'yes', 0 for anything else
+insuranceDataFrame['sex_numeric'] = insuranceDataFrame['sex'].apply(lambda x: ifelse1(x, ['male'], 1, 0))
+
+insuranceDataFrame['smoker_numeric'] = insuranceDataFrame['smoker'].apply(lambda x: ifelse1(x, ['yes'], 1, 0))
+
+## Regions are numbered 0..k-1 in sorted order. The unique values are computed
+## once (not once per row), and the labels are sized to match them rather than
+## assuming there are exactly four regions.
+regions = np.unique(insuranceDataFrame['region'])
+regionLabels = np.arange(len(regions))
+insuranceDataFrame['region_numeric'] = insuranceDataFrame['region'].apply(lambda x: ifelse2(x, regions, regionLabels))
+
+insuranceDataFrame = insuranceDataFrame.drop(["sex", "smoker", "region"], axis=1)
+
+## Save the cleaned dataset. The '/' matters: without it the file lands next
+## to the data directory under the name 'IDInsightinsurance_clean_continuous.csv'.
+insuranceDataFrame.to_csv(directory + '/insurance_clean_continuous.csv', index=False)