nunosempere.github.io/maths-prog/MachineLearningDemystified/CleaningUpData.py

directory = '/home/nuno/Documents/Jobs/IDInsight'

import pandas as pd
import numpy as np

## Install the dataframe
insuranceDataFrame = pd.read_csv(directory + '/insurance.csv')

## Some functions for cleaning up, inspired by R's ifelse function

def ifelse1(x, listOfChecks, yesLabel, noLabel):
    if x in listOfChecks:
        return (yesLabel)
    else:
        return (noLabel)

def ifelse2(x,listOfChecks, listOfLabels):
    n = len(listOfChecks)
    for i in range(n):
        if x == listOfChecks[i]:
            return (listOfLabels[i])
    return None

insuranceDataFrame['sex_numeric'] =insuranceDataFrame['sex'].apply(lambda x: ifelse1(x, np.array(['male']),1,0))

insuranceDataFrame['smoker_numeric'] =insuranceDataFrame['smoker'].apply(lambda x: ifelse1(x, np.array(['yes']),1,0))

insuranceDataFrame['region_numeric'] =insuranceDataFrame['region'].apply(lambda x: ifelse2(x, np.unique(insuranceDataFrame['region']), np.array([0,1,2,3])))

insuranceDataFrame = insuranceDataFrame.drop(["sex", "smoker", "region"],axis=1)

## We save the database

insuranceDataFrame.to_csv(directory +'insurance_clean_continuous.csv', index=False)