119: Optimising scikit-learn machine learning models with grid search or randomized search

Machine learning models have many hyper-parameters (parameters set before a model is fitted, and which remain constant throughout model fitting). Optimising model hyper-parameters may involve many model runs with alternative hyper-parameters. In SciKit-Learn, this may be performed in an automated fashion using Grid Search (which explores all combinations of provided hyper-parameters) or Randomized Search (which randomly selects combinations to test).

Grid search and randomized search perform this optimisation using k-fold cross-validation, which reduces the potential bias of relying on a single training/test split.

Here we will revisit a previous example of machine learning, using Random Forests to predict whether a person has breast cancer. We will then use Grid Search to optimise performance. Among the performance measures we report is the ‘f1’ score (https://en.wikipedia.org/wiki/F1_score), which balances the importance of false negatives and false positives.

First we will look at how we previously built the Random Forests model.

(See https://pythonhealthcare.org/2018/04/17/72-machine-learning-random-forests/ for previous post on Random Forest method)

# import required modules

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

def calculate_diagnostic_performance (actual_predicted):
    """ Calculate diagnostic performance.
    
    Takes a NumPy array of ones and zeros with two columns: actual and predicted
    
    Note that some statistics are repeats with different names
    (precision = positive_predictive_value and recall = sensitivity).
    Both names are returned
    
    Returns a dictionary of results:
        
    1) accuracy: proportion of test results that are correct    
    2) sensitivity: proportion of true +ve identified
    3) specificity: proportion of true -ve identified
    4) positive likelihood: increased probability of true +ve if test +ve
    5) negative likelihood: reduced probability of true +ve if test -ve
    6) false positive rate: proportion of false +ves in true -ve patients
    7) false negative rate:  proportion of false -ves in true +ve patients
    8) positive predictive value: chance of true +ve if test +ve
    9) negative predictive value: chance of true -ve if test -ve
    10) precision = positive predictive value 
    11) recall = sensitivity
    12) f1 = (2 * precision * recall) / (precision + recall)
    13) positive rate = rate of true +ve (not strictly a performance measure)
    """
    # Calculate results
    actual_positives = actual_predicted[:, 0] == 1
    actual_negatives = actual_predicted[:, 0] == 0
    test_positives = actual_predicted[:, 1] == 1
    test_negatives = actual_predicted[:, 1] == 0
    test_correct = actual_predicted[:, 0] == actual_predicted[:, 1]
    accuracy = np.average(test_correct)
    true_positives = actual_positives & test_positives
    true_negatives = actual_negatives & test_negatives
    sensitivity = np.sum(true_positives) / np.sum(actual_positives)
    specificity = np.sum(true_negatives) / np.sum(actual_negatives)
    positive_likelihood = sensitivity / (1 - specificity)
    negative_likelihood = (1 - sensitivity) / specificity
    false_positive_rate = 1 - specificity
    false_negative_rate = 1 - sensitivity
    positive_predictive_value = np.sum(true_positives) / np.sum(test_positives)
    negative_predictive_value = np.sum(true_negatives) / np.sum(test_negatives)
    precision = positive_predictive_value
    recall = sensitivity
    f1 = (2 * precision * recall) / (precision + recall)
    positive_rate = np.mean(actual_predicted[:,1])
    
    # Add results to dictionary
    performance = {}
    performance['accuracy'] = accuracy
    performance['sensitivity'] = sensitivity
    performance['specificity'] = specificity
    performance['positive_likelihood'] = positive_likelihood
    performance['negative_likelihood'] = negative_likelihood
    performance['false_positive_rate'] = false_positive_rate
    performance['false_negative_rate'] = false_negative_rate
    performance['positive_predictive_value'] = positive_predictive_value
    performance['negative_predictive_value'] = negative_predictive_value
    performance['precision'] = precision
    performance['recall'] = recall
    performance['f1'] = f1
    performance['positive_rate'] = positive_rate

    return performance

def load_data ():
    """Load the data set. Here we load the Breast Cancer Wisconsin (Diagnostic)
    Data Set. Data could be loaded from other sources though the structure
    should be compatible with this data set, that is an object with the 
    following attributes:
        .data (holds feature data)
        .feature_names (holds feature titles)
        .target_names (holds outcome classification names)
        .target (holds classification as zero-based number)
        .DESCR (holds text-based description of data set)"""
    
    data_set = datasets.load_breast_cancer()
    return data_set

def normalise (X_train,X_test):
    """Normalise X data, so that training set has mean of zero and standard
    deviation of one"""
    
    # Initialise a new scaling object for normalising input data
    sc=StandardScaler() 
    # Set up the scaler just on the training set
    sc.fit(X_train)
    # Apply the scaler to the training and test sets
    X_train_std=sc.transform(X_train)
    X_test_std=sc.transform(X_test)
    return X_train_std, X_test_std


def print_diagnostic_results (performance):
    """Iterate through, and print, the performance metrics dictionary"""
    
    print('\nMachine learning diagnostic performance measures:')
    print('-------------------------------------------------')
    for key, value in performance.items():
        print (key,'= %0.3f' %value) # print 3 decimal places
    return

def print_feature_importances (model, features):
    print ()
    print ('Feature importances:')
    print ('--------------------')
    df = pd.DataFrame()
    df['feature'] = features
    df['importance'] = model.feature_importances_
    df = df.sort_values('importance', ascending = False)
    print (df)
    return

def split_data (data_set, split=0.25):
    """Extract X and y data from data_set object, and split into training and
    test data. Split defaults to 75% training, 25% test if no other value is
    passed to the function"""
    
    X=data_set.data
    y=data_set.target
    X_train,X_test,y_train,y_test=train_test_split(
        X,y,test_size=split, random_state=0)
    return X_train,X_test,y_train,y_test

def test_model(model, X, y):
    """Return predicted y given X (attributes)"""
    
    y_pred = model.predict(X)
    test_results = np.vstack((y, y_pred)).T
    return test_results

def train_model (X, y):
    """Train the model. Note n_jobs=-1 uses all cores on a computer"""
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_jobs=-1)
    model.fit (X,y)
    return model

###### Main code #######

# Load data
data_set = load_data()

# Split data into training and test sets
X_train,X_test,y_train,y_test = split_data(data_set, 0.25)

# Normalise data (not needed for Random Forests)
# X_train_std, X_test_std = normalise(X_train,X_test)

# Train model
model = train_model(X_train, y_train)

# Produce results for test set
test_results = test_model(model, X_test, y_test)

# Measure performance of test set predictions
performance = calculate_diagnostic_performance(test_results)

# Print performance metrics
print_diagnostic_results(performance)
Out:

Machine learning diagnostic performance measures:

accuracy = 0.951
sensitivity = 0.944
specificity = 0.962
positive_likelihood = 25.028
negative_likelihood = 0.058
false_positive_rate = 0.038
false_negative_rate = 0.056
positive_predictive_value = 0.977
negative_predictive_value = 0.911
precision = 0.977
recall = 0.944
f1 = 0.960
positive_rate = 0.608
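
As an optional cross-check (not part of the original code), scikit-learn’s own metric functions can be applied to the same two columns of the test_results array to confirm the hand-calculated values:

# Optional cross-check of the hand-calculated metrics using sklearn.metrics
from sklearn.metrics import accuracy_score, f1_score

y_actual = test_results[:, 0]
y_predicted = test_results[:, 1]
print ('accuracy (sklearn) = %0.3f' %accuracy_score(y_actual, y_predicted))
print ('f1 (sklearn) = %0.3f' %f1_score(y_actual, y_predicted))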

Optimise with grid search

NOTE: Grid search may take considerable time to run!

Grid search enables us to perform an exhaustive search of hyper-parameters (those model parameters that are constant in any one model). We define which hyper-parameters we wish to change, and what values we wish to try. All combinations are tested. Tests are performed using k-fold cross-validation, which re-runs the model with different train/test splits (this avoids bias in our train/test split, but does increase the time required). You may wish to time a small grid search first, so you have a better idea of how many parameter combinations you can realistically explore.

We pass four arguments to the grid search method:

1) The range of values for the hyper-parameters, defined in a dictionary
2) The machine learning model to use
3) The number of k-fold splits to use (cv); a value of 5 will give five 80:20 training/test splits, with each sample being present in the test set once
4) The accuracy score to use. In a classification model ‘accuracy’ is common. For a regression model scoring='neg_mean_squared_error' is common (for grid search the score must be a ‘utility function’ rather than a ‘cost function’, that is, higher values are better). A sketch of an alternative scorer follows this list.
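
As an illustration of point 4, the optimisation could be driven by the f1 score mentioned earlier rather than plain accuracy. The sketch below is not part of the original code; scoring may name any built-in scorer (such as 'f1'), or a custom scorer may be built with make_scorer, and the small example_grid is only illustrative.

# Sketch: optimising on f1 rather than accuracy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

f1_scorer = make_scorer(f1_score)  # equivalent to scoring='f1' for binary labels
example_grid = {'n_estimators': [10, 30, 100]}
example_search = GridSearchCV(RandomForestClassifier(), example_grid,
                              cv=5, scoring=f1_scorer)
# example_search.fit(X_train, y_train) would then select the grid point with
# the highest cross-validated f1 score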

If the best model uses a value at one extreme of the provided hyper-parameter ranges, it is best to expand the range of that hyper-parameter to be sure an optimum has been found.

More info on grid search: https://scikit-learn.org/stable/modules/grid_search.html

An alternative approach is randomised hyper-parameter searching. See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

# Use Grid search to optimise
# n_jobs is set to -1 to use all cores on the CPU

from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [10, 30, 100, 300, 1000, 3000],
              'bootstrap': [True, False],
              'min_samples_split': [2, 4, 6, 8, 10],
              'n_jobs': [-1]}

# Grid search will use k-fold cross-validation (CV is number of splits)
# Grid search also needs a utility function (higher is better) rather than
# a cost function (lower is better); for classification we use 'accuracy'

from sklearn.ensemble import RandomForestClassifier
forest_grid = RandomForestClassifier()
grid_search = GridSearchCV(forest_grid, param_grid, cv=10,
                           scoring='accuracy')

grid_search.fit(X_train, y_train); #';' suppresses printed output

Show optimised model hyper-parameters:

# show best parameters
# If best parameters are at the extremes of the searches then extend the range

grid_search.best_params_

Out:

{'bootstrap': True, 'min_samples_split': 6, 'n_estimators': 30, 'n_jobs': -1}
# Or, show full description
grid_search.best_estimator_

Out:

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=6,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
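
It can also be useful to look at the best cross-validated score, and at the full table of results for every parameter combination. GridSearchCV exposes these as best_score_ and cv_results_; the short snippet below is an optional addition rather than part of the original post.

# Optional: examine the best cross-validated score and the full results table
print ('Best cross-validated accuracy: %0.3f' %grid_search.best_score_)
results_df = pd.DataFrame(grid_search.cv_results_)
print (results_df[['params', 'mean_test_score', 'rank_test_score']].head())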

Now we will use the optimised model. We could copy the text above (from the output of grid_search.best_estimator_), or we can use grid_search.best_estimator_ directly.

# Use optimised model
model = grid_search.best_estimator_
model.fit (X_train, y_train);

Test optimised model:

test_results = test_model(model, X_test, y_test)

# Measure performance of test set predictions
performance = calculate_diagnostic_performance(test_results)

# Print performance metrics
print_diagnostic_results(performance)

Out:

Machine learning diagnostic performance measures:

accuracy = 0.972
sensitivity = 0.967
specificity = 0.981
positive_likelihood = 51.233
negative_likelihood = 0.034
false_positive_rate = 0.019
false_negative_rate = 0.033
positive_predictive_value = 0.989
negative_predictive_value = 0.945
precision = 0.989
recall = 0.967
f1 = 0.978
positive_rate = 0.615

Our accuracy has now increased from 95.1% to 97.2%.

When the number of parameter combinations becomes unreasonably large for grid search, an alternative is to use random search, which selects parameter combinations at random from the ranges given. The number of combinations tried is given by the argument n_iter.

Below is an example where we expand the number of hyper-parameters varied (making the number of combinations too large for an exhaustive grid search) and use random search to test 50 different combinations.

For more details see https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

# Use Randomized search to optimise
# n_jobs is set to -1 to use all cores on the CPU

from sklearn.model_selection import RandomizedSearchCV

param_grid = {'n_estimators': [10, 30, 100, 300, 1000, 3000],
              'bootstrap': [True, False],
              'min_samples_split': range(2, 11),
              'max_depth': range(1, 30),
              'n_jobs': [-1]}

n_iter_search = 50

from sklearn.ensemble import RandomForestClassifier
forest_grid = RandomForestClassifier()
random_search = RandomizedSearchCV(forest_grid, param_grid, cv=10,
                           n_iter=n_iter_search, scoring='accuracy')

random_search.fit(X_train, y_train); #';' suppresses printed output
# show best parameters
# If best parameters are at the extremes of the searches then extend the range

random_search.best_params_

Out:

{'n_jobs': -1,
 'n_estimators': 100,
 'min_samples_split': 2,
 'max_depth': 29,
 'bootstrap': False}
# Or, show full description
random_search.best_estimator_

Out:

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=29, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Now we will train a model with the optimised hyper-parameters, and test it against the test set.

# Use optimised model
model = random_search.best_estimator_
model.fit (X_train, y_train);

test_results = test_model(model, X_test, y_test)

# Measure performance of test set predictions
performance = calculate_diagnostic_performance(test_results)

# Print performance metrics
print_diagnostic_results(performance)

Out:

Machine learning diagnostic performance measures:

accuracy = 0.986
sensitivity = 0.989
specificity = 0.981
positive_likelihood = 52.411
negative_likelihood = 0.011
false_positive_rate = 0.019
false_negative_rate = 0.011
positive_predictive_value = 0.989
negative_predictive_value = 0.981
precision = 0.989
recall = 0.989
f1 = 0.989
positive_rate = 0.629

So although random search does not explore all combinations, because we can increase the number of hyper-parameters explored compared with grid search, we have increased our accuracy to 98.6%.
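
In the example above the hyper-parameter values were supplied as explicit lists. RandomizedSearchCV will also accept distributions (anything with an rvs method, such as those in scipy.stats) in place of lists, so that values are sampled rather than chosen from a fixed set. The sketch below is an optional illustration, not part of the original code:

# Sketch: sampling hyper-parameter values from distributions rather than lists
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {'n_estimators': randint(10, 3000),
                       'min_samples_split': randint(2, 11),
                       'max_depth': randint(1, 30)}
sampled_search = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1),
                                    param_distributions, n_iter=50, cv=10,
                                    scoring='accuracy')
# sampled_search.fit(X_train, y_train) would then test 50 sampled combinations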

116. Random Forests regression

This tutorial provides an alternative regression method to a linear/multiple regression previously described at:

https://pythonhealthcare.org/2018/06/14/86-linear-regression-and-multiple-linear-regression/

Random Forests regression may provide a better predictor than multiple linear regression when the relationship between features (X) and dependent variable (y) is complex.

In regression we seek to predict the value of a continuous variable based on either a single variable, or a set of variables.

The example we will look at below seeks to predict life span based on weight, height, physical activity, BMI, gender, and whether the person has a history of smoking.

This example uses a synthetic data set, which will be downloaded.

Load common libraries and data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

filename = 'https://gitlab.com/michaelallen1966/1804_python_healthcare_wordpress/raw/master/jupyter_notebooks/life_expectancy.csv'
df = pd.read_csv(filename)
df.head()

Out:

weight 	smoker 	physical_activity_scale 	BMI 	height 	male 	life_expectancy
0 	51 	1 	6 	22 	152 	1 	57
1 	83 	1 	5 	34 	156 	1 	36
2 	78 	1 	10 	18 	208 	0 	78
3 	106 	1 	3 	28 	194 	0 	49
4 	92 	1 	7 	23 	200 	0 	67

Fit model

# Extract features (X) and target life expectancy (y)

X = df.values[:, :-1]
y = df.values[:, -1]

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X, y)

Out:

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

Predict values, calculate error, and show predicted vs. actual

# Predict values

predicted = model.predict(X)

# Show mean squared error

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y, predicted)
rmse = np.sqrt(mse)
print (rmse)

Out:
1.4628948576409964

# Plot actual vs predicted

plt.scatter(y,predicted, alpha = 0.5)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()

[Figure: scatter plot of actual vs. predicted values from the Random Forest regression]
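
Note that the RMSE above is measured on the same data used to fit the model, so it will flatter the model. A fairer estimate comes from a held-out test set. The sketch below (not part of the original code) reuses the X and y arrays defined above:

# Sketch: estimate error on a held-out test set rather than the training data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)
held_out_model = RandomForestRegressor()
held_out_model.fit(X_train, y_train)
held_out_rmse = np.sqrt(mean_squared_error(y_test, held_out_model.predict(X_test)))
print (held_out_rmse)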

115. A short function to replace (impute) missing numerical data in Pandas DataFrames with median of column values

When we import data into NumPy or Pandas, any empty cells of numerical data will be labelled np.NaN on import. In techniques such as machine learning we may wish to either 1) remove rows with any missing data, or 2) fill in the missing data with a set value, often the median of all other values in that data column. The latter has the advantage that the technique can be used both in training the machine learning model, and in predicting output when we are given examples with some missing data.

Here we define a function that goes through data columns in a Pandas DataFrame, looks to see if there is any missing data and, if there is, replaces np.NaN with the median of all other values in that data column.

import pandas as pd
import numpy as np

def impute_with_median (df):
    """Iterate through columns of Pandas DataFrame.
    Where NaNs exist replace with median"""
    
    # Get list of DataFrame column names
    cols = list(df)
    # Loop through columns
    for column in cols:
        # Transfer column to independent series
        col_data = df[column]
        # Look to see if there is any missing numerical data
        missing_data = sum(col_data.isna())
        if missing_data > 0:
            # Get median and replace missing numerical data with median
            col_median = col_data.median()
            df[column] = col_data.fillna(col_median)
    return df   

We will mimic importing data with missing numerical data.

name = ['Bob', 'Jim', 'Anne', 'Rosie', 'Ben', 'Tom']
colour = ['red', 'red', 'red', 'blue', 'red', 'blue']
age = [23, 45, np.NaN, 21, 18, 20]
height = [1.80, np.NaN, 1.65, 1.71, 1.61, 1.76] 

data =pd.DataFrame()
data['name'] = name
data['colour'] = colour
data['age'] = age
data['height'] = height

View the data with missing values.

print (data)

Out:

 	name 	colour 	age 	height
0 	Bob 	red 	23.0 	1.80
1 	Jim 	red 	45.0 	NaN
2 	Anne 	red 	NaN 	1.65
3 	Rosie 	blue 	21.0 	1.71
4 	Ben 	red 	18.0 	1.61
5 	Tom 	blue 	20.0 	1.76

Call the function to replace missing data with the median, and re-examine data.

data = impute_with_median(data)
print (data)

Out:

 	name 	colour 	age 	height
0 	Bob 	red 	23.0 	1.80
1 	Jim 	red 	45.0 	1.71
2 	Anne 	red 	21.0 	1.65
3 	Rosie 	blue 	21.0 	1.71
4 	Ben 	red 	18.0 	1.61
5 	Tom 	blue 	20.0 	1.76
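
As an alternative (not part of the original code), more recent versions of scikit-learn provide SimpleImputer, which performs the same median replacement on numerical columns and, usefully, can be fitted on training data and then applied to new data. A minimal sketch on the same example columns:

# Sketch: median imputation of numerical columns with scikit-learn
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
numeric_columns = ['age', 'height']
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])
print (data)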

113: Regression analysis with TensorFlow

This code comes from the TensorFlow tutorial here, with minor modifications (such as the addition of regularization to avoid over-fitting).

In a regression problem, we aim to predict the output of a continuous value, like a price or a probability. Contrast this with a classification problem, where we aim to predict a discrete label (for example, whether a picture contains an apple or an orange).

This notebook uses the classic Auto MPG Dataset and builds a model to predict the fuel efficiency of late-1970s and early 1980s automobiles. To do this, we’ll provide the model with a description of many models from that time period. This description includes attributes like: cylinders, displacement, horsepower, and weight.


# If needed install seaborn (conda install seaborn or pip install seaborn)

import pathlib
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers

###############################################################################
############################## LOAD DATA ######################################
###############################################################################

# Load data from web and save locally
dataset_path = keras.utils.get_file("auto-mpg.data", 
    "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")
dataset_path
column_names = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin'] 
raw_dataset = pd.read_csv(dataset_path, names=column_names,
                      na_values = "?", comment='\t',
                      sep=" ", skipinitialspace=True)

raw_dataset.to_csv('mpg.csv', index=False)

# Load data locally (the raw data was saved to mpg.csv above)
data = pd.read_csv('mpg.csv')

###############################################################################
############################## CLEAN DATA #####################################
###############################################################################

# Dataset contains some missing data (see by using print(data.isna().sum()))
# Drop rows with missing data
data = data.dropna()

# The "Origin" column is really categorical, not numeric. 
# So convert that to a one-hot:

origin = data.pop('Origin')
data['USA'] = (origin == 1)*1.0
data['Europe'] = (origin == 2)*1.0
data['Japan'] = (origin == 3)*1.0

###############################################################################
#################### SPLIT INTO TRAINING AND TEST SETS ########################
###############################################################################

train_dataset = data.sample(frac=0.8,random_state=0)
test_dataset = data.drop(train_dataset.index)

###############################################################################
############################# EXAMINE DATA ####################################
###############################################################################

# Have a quick look at the joint distribution of a few pairs of columns from
# the training set.

g = sns.pairplot(
        train_dataset[["MPG", "Cylinders", "Displacement", "Weight"]], 
        diag_kind="kde")

fig = g.fig # get the underlying matplotlib figure (for axes-level seaborn plots use fig = ax.get_figure())
fig.show()

# Look at overall stats
train_stats = train_dataset.describe()
train_stats.pop("MPG")
train_stats = train_stats.transpose()
print (train_stats)

###############################################################################
###################### SPLIT FEATURES FROM LABELS #############################
###############################################################################

train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')

###############################################################################
########################### NORMALISE THE DATA ################################
###############################################################################

# Normalise using the mean and standard deviation from the training set

def norm(x):
  return (x - train_stats['mean']) / train_stats['std']

normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

###############################################################################
############################### BUILD MODEL ###################################
###############################################################################

# Here, we'll use a Sequential model with two densely connected hidden layers,
# and an output layer that returns a single, continuous value. Regularisation
# helps prevent over-fitting (try adjusting the values; higher numbers = more
# regularisation. Regularisation may be type l1 or l2.)

def build_model():
  model = keras.Sequential([
    layers.Dense(64, kernel_regularizer=keras.regularizers.l1(0.01),
                 activation=tf.nn.relu, 
                 input_shape=[len(train_dataset.keys())]),
                                
    keras.layers.Dense(64, kernel_regularizer=keras.regularizers.l1(0.01),
                 activation=tf.nn.relu),
                       
    keras.layers.Dense(1)])

  optimizer = tf.train.RMSPropOptimizer(0.001)

  model.compile(loss='mse',
                optimizer=optimizer,
                metrics=['mae', 'mse'])
  return model

model = build_model()

# Print a summary of the model

print (model.summary())

###############################################################################
############################### TRAIN MODEL ###################################
###############################################################################

# Display training progress by printing a single dot for each completed epoch
class PrintDot(keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs):
    if epoch % 100 == 0: print('')
    print('.', end='')

EPOCHS = 1000

history = model.fit(
  normed_train_data, train_labels,
  epochs=EPOCHS, validation_split = 0.2, verbose=0,
  callbacks=[PrintDot()])

# Show last few epochs in history
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
print(hist.tail())

###############################################################################
############################### PLOT TRAINING #################################
###############################################################################

def plot_history(history):
  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Abs Error [MPG]')
  plt.plot(hist['epoch'], hist['mean_absolute_error'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mean_absolute_error'],
           label = 'Val Error')
  plt.legend()
  plt.ylim([0,5])
  
  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Square Error [$MPG^2$]')
  plt.plot(hist['epoch'], hist['mean_squared_error'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mean_squared_error'],
           label = 'Val Error')
  plt.legend()
  plt.ylim([0,20])
  plt.show()

plot_history(history)
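
###############################################################################
########################### EVALUATE ON TEST SET ##############################
###############################################################################

# (Added sketch, not in the original code above.) Keras models provide an
# evaluate method; with the metrics compiled above ('mae', 'mse') it returns
# the loss, mean absolute error and mean squared error on the held-out test set
loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=0)
print ('\nTesting set mean absolute error: %0.2f MPG' %mae)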

###############################################################################
############################# MAKE PREDICTIONS ################################
###############################################################################

# Make predictions from test-set

test_predictions = model.predict(normed_test_data).flatten()

# Scatter plot plot
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
plt.axis('equal')
plt.axis('square')
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
_ = plt.plot([-100, 100], [-100, 100])
plt.show()

# Error plot
error = test_predictions - test_labels
plt.hist(error, bins = 25)
plt.xlabel("Prediction Error [MPG]")
_ = plt.ylabel("Count")
plt.show()

# Copyright (c) 2017 François Chollet
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

112. Splitting data set into training and test sets using Pandas DataFrames methods

Note: this may also be performed using SciKit-Learn train_test_split method, but here we will use native Pandas methods.

Create a DataFrame

# Create pandas data frame

import pandas as pd

name = ['Sam', 'Bill', 'Bob', 'Ian', 'Jo', 'Anne', 'Carl', 'Toni']
age = [22, 34, 18, 34, 76, 54, 21, 8]
gender = ['f', 'm', 'm', 'm', 'f', 'f', 'm', 'f']
height = [1.64, 1.85, 1.70, 1.75, 1.63, 1.79, 1.70, 1.68]
passed_physical = [0, 1, 1, 1, 0, 1, 1, 0]

people = pd.DataFrame()
people['name'] = name
people['age'] = age
people['gender'] = gender
people['height'] = height
people['passed'] = passed_physical

print(people)

Out:

   name  age gender  height  passed
0   Sam   22      f    1.64       0
1  Bill   34      m    1.85       1
2   Bob   18      m    1.70       1
3   Ian   34      m    1.75       1
4    Jo   76      f    1.63       0
5  Anne   54      f    1.79       1
6  Carl   21      m    1.70       1
7  Toni    8      f    1.68       0

Split training and test sets

Here we take a random sample of 75% of the rows for the training set, and then form the test set from the remaining 25% by dropping the training set's index values from the copied DataFrame.

# Create a copy of the DataFrame to work from
# Omit random state to have different random split each run

people_copy = people.copy()
train_set = people_copy.sample(frac=0.75, random_state=0)
test_set = people_copy.drop(train_set.index)

print ('Training set')
print (train_set)
print ('\nTest set')
print (test_set)
print ('\nOriginal DataFrame')
print (people)

Out:

Training set
   name  age gender  height  passed
6  Carl   21      m    1.70       1
2   Bob   18      m    1.70       1
1  Bill   34      m    1.85       1
7  Toni    8      f    1.68       0
3   Ian   34      m    1.75       1
0   Sam   22      f    1.64       0

Test set
   name  age gender  height  passed
4    Jo   76      f    1.63       0
5  Anne   54      f    1.79       1

Original DataFrame
   name  age gender  height  passed
0   Sam   22      f    1.64       0
1  Bill   34      m    1.85       1
2   Bob   18      m    1.70       1
3   Ian   34      m    1.75       1
4    Jo   76      f    1.63       0
5  Anne   54      f    1.79       1
6  Carl   21      m    1.70       1
7  Toni    8      f    1.68       0

Use ‘pop’ to extract the labels

‘Pop’ will remove a column from the DataFrame, and transfer it to a new variable.

train_set_labels = train_set.pop('passed')
test_set_labels = test_set.pop('passed')

print ('Training set')
print (train_set)
print ('\nTraining set label (y)')
print (train_set_labels)

Out:

Training set
   name  age gender  height
6  Carl   21      m    1.70
2   Bob   18      m    1.70
1  Bill   34      m    1.85
7  Toni    8      f    1.68
3   Ian   34      m    1.75
0   Sam   22      f    1.64

Training set label (y)
6    1
2    1
1    1
7    0
3    1
0    0
Name: passed, dtype: int64
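
For comparison (and as noted at the start of this section), the same split-and-separate-labels step can be done with scikit-learn's train_test_split, which returns the four arrays directly. A minimal sketch:

# Sketch: the equivalent split using scikit-learn
from sklearn.model_selection import train_test_split

X = people.drop('passed', axis=1)
y = people['passed']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)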

110. TensorFlow text-based classification – from raw text to prediction

Download the py file here: tensorflow.py

If you need it, see our guide on installing and using a TensorFlow environment: https://pythonhealthcare.org/2018/12/19/106-installing-and-using-tensorflow-using-anaconda/

Below is a worked example that uses text to classify whether a movie reviewer likes a movie or not.

The code goes through the following steps:
1. import libraries
2. load data
3. clean data
4. convert words to numbers
5. process data for tensorflow
6. build model
7. train model
8. predict outcome (like movie or not) for previously unseen reviews

Please also see the TensorFlow tutorials where the TensorFlow model building code came from:

https://www.tensorflow.org/tutorials/keras/basic_text_classification

https://www.tensorflow.org/tutorials/keras/overfit_and_underfit

"""
This example starts with with raw text (movie reviews) and predicts whether the 
reviewer liked the movie.

The code goes through the following steps:
    1. import libraries
    2. load data
    3. clean data
    4. convert words to numbers
    5. process data for tensorflow
    6. build model
    7. train model
    8. predict outcome (like movie or not) for previously unseen reviews

For information on installing a tensorflow environment in Anaconda see:
https://pythonhealthcare.org/2018/12/19/106-installing-and-using-tensorflow-using-anaconda/

For installing anaconda see:
https://www.anaconda.com/download

We import necessary libraries.

If you are missing a library then, if using Anaconda, from a command line
(after activating the tensorflow environment) use:
    conda install library-name

If you find you are missing an nltk download then from a command line (after
activating the tensorflow environment) use:
    python (to begin a command line Python session)
    import nltk
    nltk.download('library name')
    or
    nltk.download() will open a dialogue box where you can install any/all
    nltk libraries
"""

###############################################################################
############################## IMPORT LIBRARIES ############################### 
###############################################################################

import numpy as np
import pandas as pd
import nltk
import tensorflow as tf

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow import keras


# If not previously performed:
# nltk.download('stopwords')

###############################################################################
################################## LOAD DATA ################################## 
###############################################################################

"""
Here we load up a csv file. Each line contains a text string and then a label.
An example is given to download the imdb dataset which contains 50,000 movie
reviews. The label is 0 or 1 depending on whether the reviewer liked the movie.
"""
print ('Loading data')

# If you do not already have the data locally you may download (and save) by:
file_location = ('https://gitlab.com/michaelallen1966/00_python_snippets' +
                 '_and_recipes/raw/master/machine_learning/data/IMDb.csv')
data = pd.read_csv(file_location)
# save to current directory
data.to_csv('imdb.csv', index=False)

# If you already have the data locally then you may run the following
# data = pd.read_csv('imdb.csv')

# Change headings of dataframe to make them more universal
data.columns=['text','label']

# We'll now hold back 5% of data for a final test that has not been used in
# training

number_of_records = data.shape[0]
number_to_hold_back = int(number_of_records * 0.05)
number_to_use = number_of_records - number_to_hold_back
data_held_back = data.tail(number_to_hold_back)
data = data.head(number_to_use)

###############################################################################
################################## CLEAN DATA ################################# 
###############################################################################

"""
Here we process the data in the following ways:
  1) change all text to lower case
  2) tokenize (breaks text down into a list of words)
  3) remove punctuation and non-word text
  4) find word stems (e.g. running, run and runner will be converted to run)
  5) removes stop words (commonly occurring words of little value, e.g. 'the')
"""

stemming = PorterStemmer()
stops = set(stopwords.words("english"))

def apply_cleaning_function_to_list(X):
    cleaned_X = []
    for element in X:
        cleaned_X.append(clean_text(element))
    return cleaned_X

def clean_text(raw_text):
    """This function works on a raw text string, and:
        1) changes to lower case
        2) tokenizes (breaks text down into a list of words)
        3) removes punctuation and non-word text
        4) finds word stems
        5) removes stop words
        6) rejoins meaningful stem words"""
    
    # Convert to lower case
    text = raw_text.lower()
    
    # Tokenize
    tokens = nltk.word_tokenize(text)
    
    # Keep only words (removes punctuation + numbers)
    # use .isalnum to keep also numbers
    token_words = [w for w in tokens if w.isalpha()]
    
    # Stemming
    stemmed_words = [stemming.stem(w) for w in token_words]
    
    # Remove stop words
    meaningful_words = [w for w in stemmed_words if not w in stops]
      
    # Return cleaned data
    return meaningful_words

print ('Cleaning text')
# Get text to clean
text_to_clean = list(data['text'])

# Clean text and add to data
data['cleaned_text'] = apply_cleaning_function_to_list(text_to_clean)

###############################################################################
######################## CONVERT WORDS TO NUMBERS ############################# 
###############################################################################

"""
The frequency of all words is counted. Words are then given an index number so
that the most commonly occurring words have the lowest number (so the
dictionary may then be truncated at any point to keep the most common words).
We avoid using the index number zero as we will use that later to 'pad' out
short text.
"""

def training_text_to_numbers(text, cutoff_for_rare_words = 1):
    """Function to convert text to numbers. Text must be tokenzied so that
    test is presented as a list of words. The index number for a word
    is based on its frequency (words occuring more often have a lower index).
    If a word does not occur as many times as cutoff_for_rare_words,
    then it is given a word index of zero. All rare words will be zero.
    """

    # Flatten list if sublists are present
    if len(text) > 1:
        flat_text = [item for sublist in text for item in sublist]
    else:
        flat_text = text
    
    # Get word frequency
    fdist = nltk.FreqDist(flat_text)

    # Convert to Pandas dataframe
    df_fdist = pd.DataFrame.from_dict(fdist, orient='index')
    df_fdist.columns = ['Frequency']

    # Sort by word frequency
    df_fdist.sort_values(by=['Frequency'], ascending=False, inplace=True)

    # Add word index
    number_of_words = df_fdist.shape[0]
    df_fdist['word_index'] = list(np.arange(number_of_words)+1)
    
    # Convert pandas to dictionary
    word_dict = df_fdist['word_index'].to_dict()
    
    # Use dictionary to convert words in text to numbers
    text_numbers = []
    for string in text:
        string_numbers = [word_dict[word] for word in string]
        text_numbers.append(string_numbers)
    
    return (text_numbers, df_fdist)

# Call function to convert training text to numbers
print ('Convert text to numbers')
numbered_text, dict_df = \
    training_text_to_numbers(data['cleaned_text'].values)

# Keep only the 10,000 most frequent words (word index 1 to 10,000)
def limit_word_count(numbered_text):
    max_word_count = 10000
    filtered_text = []
    for number_list in numbered_text:
        filtered_line = \
            [number for number in number_list if number <=max_word_count]
        filtered_text.append(filtered_line)
        
    return filtered_text
    
data['numbered_text'] = limit_word_count(numbered_text)

# Pickle dataframe and dictionary dataframe (for later use if required)
data.to_pickle('data_numbered.p')
dict_df.to_pickle('data_dictionary_dataframe.p')

###############################################################################
######################### PROCESS DATA FOR TENSORFLOW ######################### 
###############################################################################

"""
Here we extract data from the pandas DataFrame, make all text vectors the same
length (by padding short texts and truncating long ones). We then split into
training and test data sets.
"""

print ('Process data for TensorFlow model')

# At this point pickled data (processed in an earlier run) might be loaded with 
# data=pd.read_pickle(file_name)
# dict_df=pd.read_pickle(filename)

# Get data from dataframe and put in X and y lists
X = list(data.numbered_text.values)
y = data.label.values

## MAKE ALL X DATA THE SAME LENGTH
# We will use keras to make all X data a length of 512.
# Shorter data will be padded with 0, longer data will be truncated.
# We have previously kept the value zero free from use.

processed_X = \
    keras.preprocessing.sequence.pad_sequences(X,
                                               value=0,
                                               padding='post',
                                               maxlen=512)

## SPLIT DATA INTO TRAINING AND TEST SETS

X_train, X_test, y_train, y_test=train_test_split(
        processed_X,y,test_size=0.2,random_state=999)

###############################################################################
########################## BUILD MODEL AND OPTIMIZER ########################## 
###############################################################################

"""
Here we construct a four-layer neural network with keras/tensorflow.
The first layer is the input layer, then we have two hidden layers, and an
output layer.
"""

print ('Build model')

# BUILD MODEL

# input shape is the vocabulary count used for the text-to-number conversion
# (10,000 words plus one for our zero padding)
vocab_size = 10001

###############################################################################
# Info on neural network layers
#
# The layers are stacked sequentially to build the classifier:
#
# The first layer is an Embedding layer. This layer takes the integer-encoded 
# vocabulary and looks up the embedding vector for each word-index. These 
# vectors are learned as the model trains. The vectors add a dimension to the 
# output array. The resulting dimensions are: (batch, sequence, embedding).
#
# Next, a GlobalAveragePooling1D layer returns a fixed-length output vector for
# each example by averaging over the sequence dimension. This allows the model 
# to handle input of variable length, in the simplest way possible.
#
# This fixed-length output vector is piped through a fully-connected (Dense) 
# layer with 16 hidden units.
#
# The last layer is densely connected with a single output node. Using the 
# sigmoid activation function, this value is a float between 0 and 1, 
# representing a probability, or confidence level.
#
# The regularizers help prevent over-fitting. Over-fitting is evident when the
# training data fit is significantly better than the test data fit. The level
# and the type may be adjusted to maximise test accuracy
##############################################################################

model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))        

model.add(keras.layers.GlobalAveragePooling1D())

model.add(keras.layers.Dense(16, activation=tf.nn.relu, 
                             kernel_regularizer=keras.regularizers.l2(0.01)))

model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid,
                             kernel_regularizer=keras.regularizers.l2(0.01)))

model.summary()

# CONFIGURE OPTIMIZER

model.compile(optimizer=tf.train.AdamOptimizer(),
              loss='binary_crossentropy',
              metrics=['accuracy'])

###############################################################################
################################# TRAIN MODEL ################################# 
###############################################################################

"""
Here we train the model. Using more epochs may give higher accuracy.

In 'real life' you may wish to hold back other test data (e.g. 10% of the
original data), so that you may use the test set here to help optimise the
neural network parameters and then test the final model on an independent
data set.

When verbose is set to 1, the model will show accuracy and loss for training 
and test data sets

"""

print ('Train model')

# Train model (verbose = 1 shows training progress)
model.fit(X_train,
          y_train,
          epochs=100,
          batch_size=512,
          validation_data=(X_test, y_test),
          verbose=1)


results = model.evaluate(X_train, y_train)
print('\nTraining accuracy:', results[1])

results = model.evaluate(X_test, y_test)
print('\nTest accuracy:', results[1])

###############################################################################
######################### PREDICT RESULTS FOR NEW TEXT ######################## 
###############################################################################

"""
Here we make predictions from text that has never been applied before. As we
are using data that has been held back we may also check its accuracy against 
a known label
 """

print ('\nMake predictions')

# We held some data back from the original data set
# We will first clean the text

text_to_clean = list(data_held_back['text'].values)
X_clean = apply_cleaning_function_to_list(text_to_clean)
 
# Now we need to convert words to numbers.
# As these are new data it is possible that the word is not recognized so we
# will check the word is in the dictionary

# Convert pandas dataframe to dictionary
word_dict = dict_df['word_index'].to_dict()

# Use dictionary to convert words in text to numbers
text_numbers = []
for string in X_clean:
    string_numbers = []
    for word in string:
        if word in word_dict:
            string_numbers.append(word_dict[word])
    text_numbers.append(string_numbers)

# Keep only the top 10,000 words
# The function is repeated here for clarity (but would not usually be repeated)  

def limit_word_count(numbered_text):
    max_word_count = 10000
    filtered_text = []
    for number_list in numbered_text:
        filtered_line = \
            [number for number in number_list if number <=max_word_count]
        filtered_text.append(filtered_line)
        
    return filtered_text
    
text_numbers = limit_word_count(text_numbers)

# Process into fixed length arrays
    
processed_X = \
    keras.preprocessing.sequence.pad_sequences(text_numbers,
                                               value=0,
                                               padding='post',
                                               maxlen=512)

# Get prediction
predicted_classes = model.predict_classes(processed_X)
# predict_classes returns a nested array (one 0/1 element per example), so we
# 'flatten' it to a simple 1D array
predicted_classes = predicted_classes.flatten()

# Check prediction against known label
actual_classes = data_held_back['label'].values
accurate_prediction = predicted_classes == actual_classes
accuracy = accurate_prediction.mean()
print ('Accuracy on unseen data: %.2f' %accuracy)