28 March 2016

logistic regression

  1. the problem

    1. categorical variables

       # 0 no premium account
       # 1 premium account
      
    2. data [experience, salary, paid_account]

       # each element is [1, experience, salary]
       x = [[1] + row[:2] for row in data]
      
       # each element is paid_account
       y = [row[2] for row in data]
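      
       # for concreteness, `data` is assumed to hold rows like these
       # (made-up numbers, just to show the shape: years of experience,
       #  salary, and 0/1 for paid account)
       data = [[0.7, 48000, 1],
               [1.9, 48000, 0],
               [2.5, 60000, 1]]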
      
    3. use linear regression to find the best model

       paid_account = β0 + β1 · experience + β2 · salary + ε
      
       rescaled_x = rescale(x)
       # [0.26, 0.43, -0.43]
       beta = estimate_beta(rescaled_x, y)
       predictions = [predict(x_i, beta) for x_i in rescaled_x]
      
       import matplotlib.pyplot as plt
      
       plt.scatter(predictions, y)
       plt.xlabel('predicted')
       plt.ylabel('actual')
       plt.show()
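      
       # rescale, estimate_beta, and predict aren't defined in these notes;
       # rough sketches of what they're assumed to do (not the originals):
       def dot(v, w):
           """sum of element-wise products"""
           return sum(v_i * w_i for v_i, w_i in zip(v, w))
      
       def predict(x_i, beta):
           """linear-regression prediction for one row"""
           return dot(x_i, beta)
      
       # rescale is assumed to standardize each non-constant column
       # (subtract the mean, divide by the standard deviation), and
       # estimate_beta is assumed to pick the β minimizing the sum of squared errors
      
       # note that these linear predictions aren't confined to [0, 1],
       # which is what motivates the logistic function below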
      
  2. logistic function

    1. code

       import math
      
       def logistic(x):
           return 1.0 / (1 + math.exp(-x))
      
       # as the input gets large and positive, the output gets closer and closer to 1
       # as the input gets large and negative, the output gets closer and closer to 0
      
       def logistic_prime(x):
           return logistic(x) * (1 - logistic(x))
      
       # we use this to fit a model
       y_i = f(x_i β) + ε_i
       # where f is the logistic function
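      
       # quick check of the behavior described above (values are approximate)
       print(logistic(-10))   # 0.0000454, close to 0
       print(logistic(0))     # 0.5
       print(logistic(10))    # 0.9999546, close to 1
      
       # logistic_prime agrees with a numerical derivative
       h = 1e-6
       print((logistic(2 + h) - logistic(2 - h)) / (2 * h))   # roughly logistic_prime(2), about 0.105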
      
    2. fit model

       # linear regression => minimizing the sum of squared errors
       # ended up with the β that maximized the likelihood of the data
      
       # logistic regression => use gradient descent to maximize the likelihood directly
       # need to calculate the likelihood function and its gradient
      
    3. formula

       p(y_i | x_i, β) = f(x_i β)^y_i · (1 - f(x_i β))^(1 - y_i)
      
       # since if y_i is 0 this equals
       1 - f(x_i β)
      
       # and if y_i is 1 it equals
       f(x_i β)
      
       # it turns out that it's actually simpler
       # to maximize the `log likelihood`
       log L(β | x_i, y_i) = y_i log f(x_i β) + (1 - y_i) log(1 - f(x_i β))
      
    4. log is a strictly increasing function

       # any beta that maximizes the log likelihood
       # also maximizes the likelihood and vice versa
      
       def logistic_log_likelihood_i(x_i, y_i, beta):
           if y_i == 1:
               return math.log(logistic(dot(x_i, beta)))
           else:
               return math.log(1 - logistic(dot(x_i, beta)))
      
    5. if we assume the data points are independent of one another

       # the overall likelihood is just the product of the individual likelihoods,
       # which means the overall log likelihood is the sum of the individual log likelihoods
       def logistic_log_likelihood(x, y, beta):
           return sum(logistic_log_likelihood_i(x_i, y_i, beta)
                      for x_i, y_i in zip(x, y))
      
    6. a little bit of calculus gives us the gradient

       def logistic_log_partial_ij(x_i, y_i, beta, j):
           """here i is the index of the data point,
           j the index of the derivative"""
           return (y_i - logistic(dot(x_i, beta))) * x_i[j]
      
       def logistic_log_gradient_i(x_i, y_i, beta):
           """the gradient of the log likelihood
           corresponding to the ith data point"""
           return [logistic_log_partial_ij(x_i, y_i, beta, j)
                   for j, _ in enumerate(beta)]
      
       # reduce isn't a builtin in Python 3, and vector_add is an assumed
       # helper that adds two vectors element-wise
       from functools import reduce
      
       def logistic_log_gradient(x, y, beta):
           return reduce(vector_add,
                         [logistic_log_gradient_i(x_i, y_i, beta)
                          for x_i, y_i in zip(x, y)])
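      
       # one way to actually find β: plain gradient ascent on the log likelihood.
       # fit_logistic, the learning rate, and the step count are illustrative
       # choices for this sketch, not fixed by the notes above.
       import random
      
       def fit_logistic(x, y, learning_rate=0.01, num_steps=5000):
           beta = [random.random() for _ in x[0]]            # random starting point
           for _ in range(num_steps):
               gradient = logistic_log_gradient(x, y, beta)  # direction of steepest ascent
               beta = [b_j + learning_rate * g_j             # take a small step uphill
                       for b_j, g_j in zip(beta, gradient)]
           return beta
      
       beta_hat = fit_logistic(rescaled_x, y)
       # the fitted model's probability of a paid account for a rescaled row x_i
       # is logistic(dot(x_i, beta_hat))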
      
  3. applying the model


