Source code for foehnix.iwls_logit
import numpy as np
from scipy.stats import logistic, norm
import logging
import pandas as pd
import foehnix.foehnix_functions as func
# logger
log = logging.getLogger(__name__)
# TODO: Think about making iwls_logit a class ccmodel with an method iwls_logit
[docs]def iwls_logit(logitx, y, beta=None, standardize=True, maxit=100, tol=1e-8):
"""
Iterative weighted least squares solver for a logistic regression model.
Parameters
----------
logitx : dict
Must contain:
- ``'values'`` : :py:class:`pandas.DataFrame` the model matrix
- ``'center'`` : :py:class:`pandas.Series`, containing the mean of
each model matrix row
- ``'scale'`` : :py:class:`pandas:Series`, containing the standard
deviation of matrix rows
- ``'is_standardized'``: boolean if matrix is standardized
y : :py:class:`numpy.ndarray`
predictor values of shape(len(observations), 1)
beta : :py:class:`numpy.ndarray`
initial regression coefficients. If None will be initialized with 0.
standardize : bool
If True (default) the model matrix will be standardized
maxit : int
maximum number of iterations, default 100.
tol : float
tolerance for improvement between iterations, default 1e-8.
Returns
-------
: dict
"""
# do we have to standardize the model matrix?
if standardize is True:
func.standardize(logitx)
x = logitx['values'].values
if np.isnan(x).any():
raise ValueError('Input logitx.values contains NaN!')
if np.isnan(y).any():
raise ValueError('Input y contains NaN!')
# check if we have columns with constant values (except one intercept).
if [len(np.unique(x[:, col])) for col in range(x.shape[1])].count(1) > 1:
raise ValueError('Model matrix contains columns with constant values.')
# y must be within 0 and 1
if (min(y) < 0) or (max(y) > 1):
raise ValueError('Values of y must be within ]0, 1[.')
# Initialize regression coefficients if needed
if beta is None:
beta = np.zeros(x.shape[1])
eta = x.dot(beta)
eta = eta.reshape(len(eta), 1)
prob = logistic.cdf(eta)
# Lists to trace log-likelihood path and the development of
# the coefficients during optimization.
llpath = []
coefpath = []
i = 1 # iteration variable
delta = 1 # likelihood difference between to iteration: break criteria
converged = True # Set to False if we do not converge before maxit
eps = np.finfo(float).eps
while delta > tol:
# new weights
w = np.sqrt(prob * (1-prob)) + eps
beta = np.linalg.inv((x*w).T.dot(x*w)).dot((x*w).T).dot(
eta*w + (y-prob) / w)
# update latent response eta
eta = x.dot(beta)
# update response
prob = logistic.cdf(eta)
# update log-likelihood sum
llpath.append(np.sum(y * eta - np.log(1+np.exp(eta))))
coefpath.append(beta)
log.debug('Iteration %d, ll=%15.4f' % (i, llpath[-1]))
if i > 1:
delta = llpath[-1] - llpath[-2]
# check if we converged
if i == maxit:
converged = False
log.critical('IWLS solver for logistic model did not converge.')
break
i += 1
# If converged, remove last likelihood and coefficient entries
if converged:
llpath = llpath[:-1]
coefpath = coefpath[:-1]
# calculate standard error
if logitx['is_standardized'] is True:
xds = func.destandardized_values(logitx)
else:
xds = x
beta_se = pd.Series(np.sqrt(np.diag(np.linalg.inv((xds*w).T.dot(xds*w)))),
index=logitx['values'].columns)
del xds
beta = coefpath[-1]
# Effective degree of freedom
edf = np.sum(np.diag((x*w).T.dot(x*w).dot(
np.linalg.inv((x*w).T.dot(x*w)))))
# Keep coefficients destandardized
coef = pd.Series(beta.copy().squeeze(), index=logitx['values'].columns)
if standardize is True:
coef = func.destandardized_coefficients(coef, logitx)
# final logliklihood
ll = llpath[-1]
rval = {'edf': edf,
'loglik': ll,
'AIC': -2*ll + 2*edf,
'BIC': -2*ll + np.log(len(x)) * edf,
'converged': converged,
'beta': beta,
'beta_se': beta_se,
'coef': coef,
'iter': i-1}
return rval
[docs]def iwls_summary(ccmodel):
"""
Prints some statistics for a given concomitant model
Parameters
----------
ccmodel : dict
which is returned by :py:class:`foehnix.iwls_logit`
"""
tmp = pd.DataFrame([], columns=['Estimate', 'Std. Error',
'z_value', 'Pr(>|z|)'],
index=ccmodel['coef'].keys(),
dtype=float)
tmp.loc[:, 'Estimate'] = ccmodel['coef'].copy()
tmp.loc[:, 'Std. Error'] = ccmodel['beta_se'].copy()
tmp.loc[:, 'z_value'] = (tmp.loc[:, 'Estimate'] /
tmp.loc[:, 'Std. Error'])
tmp.loc[:, 'Pr(>|z|)'] = 2 * norm.cdf(0, loc=np.abs(tmp.loc[:, 'z_value']))
idx = ['cc.' + k for k in list(ccmodel['coef'].index)]
tmp.index = idx
print('\n------------------------------------------------------\n')
print('Concomitant model: z test of coefficients\n')
print(tmp)
print('\nNumber of IWLS iterations %d (%s)' %
(ccmodel['iter'],
('converged' if ccmodel['converged'] else 'not converged')))
print("Dispersion parameter for binomial family taken to be 1.")