multiple regression
-
hypothesize a linear model with more independent variables
minutes = α + β₁friends + β₂work hours + β₃phd + ε

# phd is a dummy variable (introduced in ch11): 1 for users with PhDs, 0 for users without
-
the model
-
in ch14 we fit a model of the form yᵢ = α + βxᵢ + εᵢ

now each input xᵢ is not a single number but a vector of k numbers xᵢ₁, ..., xᵢₖ

the multiple regression model assumes that

yᵢ = α + β₁xᵢ₁ + ... + βₖxᵢₖ + εᵢ
-
in multiple regression the vector of parameters is usually called β
```python
beta = [alpha, beta_1, ..., beta_k]
# and
x_i = [1, x_i1, ..., x_ik]

# then our model is just:
def predict(x_i, beta):
    """assumes that the first element of each x_i is 1"""
    return dot(x_i, beta)

# our independent variable x will be a list of vectors,
# each of which looks like this:
[1,    # constant term
 49,   # number of friends
 4,    # work hours per day
 0]    # doesn't have a PhD
```
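`predict` relies on `dot` from the book's linear algebra chapter (ch4); a minimal version, in case it isn't in scope:

```python
def dot(v, w):
    """v_1 * w_1 + ... + v_n * w_n"""
    return sum(v_i * w_i for v_i, w_i in zip(v, w))
```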
-
-
further assumptions of the least squares model
e.g. in ch14 we built a model that predicted that each additional friend was associated with an extra 0.90 daily minutes on the site
-
imagine:
1. people who work more hours spend less time on the site
2. people with more friends tend to work more hours

the "actual" model is

minutes = α + β₁friends + β₂work hours + ε

since work hours and friends are positively correlated, fitting only

minutes = α + β₁friends + ε

will underestimate β₁: the friends coefficient absorbs the negative effect of the omitted work hours
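a quick simulation of this bias (the coefficients and correlation structure below are made up for illustration, not taken from the book's data):

```python
import random

random.seed(0)

# "actual" model: minutes = 30 + 1.0*friends - 2.0*work_hours + noise,
# with work_hours positively correlated with friends
friends    = [random.randint(0, 100) for _ in range(1000)]
work_hours = [0.05 * f + random.uniform(0, 5) for f in friends]
minutes    = [30 + 1.0 * f - 2.0 * w + random.gauss(0, 1)
              for f, w in zip(friends, work_hours)]

# simple regression of minutes on friends alone:
# slope = cov(friends, minutes) / var(friends)
mean_f = sum(friends) / len(friends)
mean_m = sum(minutes) / len(minutes)
slope = (sum((f - mean_f) * (m - mean_m) for f, m in zip(friends, minutes)) /
         sum((f - mean_f) ** 2 for f in friends))
print(slope)   # roughly 0.9, noticeably below the true beta_1 = 1.0
```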
-
-
fitting the model
```python
def error(x_i, y_i, beta):
    return y_i - predict(x_i, beta)

def squared_error(x_i, y_i, beta):
    return error(x_i, y_i, beta) ** 2

# use calculus to get the gradient:
def squared_error_gradient(x_i, y_i, beta):
    """the gradient (with respect to beta)
    corresponding to the ith squared error term"""
    return [-2 * x_ij * error(x_i, y_i, beta)
            for x_ij in x_i]
```
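a quick sanity check on that gradient (the finite-difference helper below is my own addition, not from the book): the analytic gradient should match a numerical estimate

```python
def numerical_gradient(f, beta, h=1e-6):
    """estimate the gradient of f at beta with central differences"""
    grads = []
    for j in range(len(beta)):
        beta_plus  = beta[:j] + [beta[j] + h] + beta[j+1:]
        beta_minus = beta[:j] + [beta[j] - h] + beta[j+1:]
        grads.append((f(beta_plus) - f(beta_minus)) / (2 * h))
    return grads

x_i, y_i = [1, 49, 4, 0], 60
beta = [0.5, 0.5, 0.5, 0.5]
print(squared_error_gradient(x_i, y_i, beta))
print(numerical_gradient(lambda b: squared_error(x_i, y_i, b), beta))
# the two lists should agree to several decimal places
```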
-
find optimal beta using stochastic gradient descent
```python
import random

def estimate_beta(x, y):
    beta_initial = [random.random() for x_i in x[0]]
    return minimize_stochastic(squared_error,
                               squared_error_gradient,
                               x, y,
                               beta_initial,
                               0.001)

random.seed(0)
beta = estimate_beta(x, daily_minutes_good)   # [30.63, 0.972, -1.868, 0.911]
```

this means our model looks like

minutes = 30.63 + 0.972 friends − 1.868 work hours + 0.911 phd
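by the way, `minimize_stochastic` comes from the book's gradient descent chapter (ch8); roughly, it takes one gradient step per data point in random order, and shrinks the step size whenever a full pass fails to improve. a sketch from memory (details may differ from the book's exact code):

```python
def in_random_order(data):
    """yields the elements of data in random order"""
    indexes = list(range(len(data)))
    random.shuffle(indexes)
    for i in indexes:
        yield data[i]

def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01):
    data = list(zip(x, y))
    theta, alpha = theta_0, alpha_0              # initial guess, initial step size
    min_theta, min_value = None, float("inf")    # best seen so far
    iterations_with_no_improvement = 0
    while iterations_with_no_improvement < 100:
        value = sum(target_fn(x_i, y_i, theta) for x_i, y_i in data)
        if value < min_value:
            # found a new best; remember it and reset the step size
            min_theta, min_value = theta, value
            iterations_with_no_improvement = 0
            alpha = alpha_0
        else:
            # no improvement; shrink the step size and keep trying
            iterations_with_no_improvement += 1
            alpha *= 0.9
        # take a gradient step for each data point, in random order
        for x_i, y_i in in_random_order(data):
            gradient_i = gradient_fn(x_i, y_i, theta)
            theta = [t_j - alpha * g_j
                     for t_j, g_j in zip(theta, gradient_i)]
    return min_theta
```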
-
-
interpreting the model
-
goodness of fit
-
r-squared
```python
def multiple_r_squared(x, y, beta):
    sum_of_squared_errors = sum(error(x_i, y_i, beta) ** 2
                                for x_i, y_i in zip(x, y))
    return 1.0 - sum_of_squared_errors / total_sum_of_squares(y)
```

keep in mind: adding new variables to a regression will `necessarily` increase the r-squared
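`total_sum_of_squares` is the ch14 helper; a minimal version, in case it isn't in scope:

```python
def de_mean(xs):
    """translate xs by subtracting its mean (so the result has mean 0)"""
    x_bar = sum(xs) / len(xs)
    return [x - x_bar for x in xs]

def total_sum_of_squares(y):
    """the total squared variation of y_i's from their mean"""
    return sum(v ** 2 for v in de_mean(y))
```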
-
-
digression: the bootstrap
```python
# a sample of n data points,
# generated by some (unknown to us) distribution
data = get_sample(num_points=n)
```

in ch5 we wrote a function `median`:
1. if all the data are very close to 100, the actual median is close to 100
2. if half the data is close to 0 and the other half is close to 200, we can't be nearly as certain about the median
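a minimal version of the ch5 `median`, in case it isn't in scope:

```python
def median(v):
    """finds the 'middle-most' value of v"""
    n = len(v)
    sorted_v = sorted(v)
    midpoint = n // 2
    if n % 2 == 1:      # odd length: the middle element
        return sorted_v[midpoint]
    else:               # even length: average the two middle elements
        return (sorted_v[midpoint - 1] + sorted_v[midpoint]) / 2
```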
-
bootstrap
```python
def bootstrap_sample(data):
    """randomly samples len(data) elements with replacement"""
    return [random.choice(data) for _ in data]

def bootstrap_statistic(data, stats_fn, num_samples):
    """evaluates stats_fn on num_samples bootstrap samples from data"""
    return [stats_fn(bootstrap_sample(data))
            for _ in range(num_samples)]

# e.g.
# 101 points all very close to 100
close_to_100 = [99.5 + random.random() for _ in range(101)]

# 101 points, 50 near 0, 50 near 200
far_from_100 = ([99.5 + random.random()] +
                [random.random() for _ in range(50)] +
                [200 + random.random() for _ in range(50)])

# the median of each dataset is close to 100; however:
bootstrap_statistic(close_to_100, median, 100)   # medians all very close to 100
bootstrap_statistic(far_from_100, median, 100)   # medians near 0 or near 200
```
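to put a number on that difference, take the standard deviation of the bootstrap medians (using the standard library's `statistics.stdev` here rather than the book's own `standard_deviation`):

```python
from statistics import stdev

print(stdev(bootstrap_statistic(close_to_100, median, 100)))
# tiny: every bootstrap median is near 100

print(stdev(bootstrap_statistic(far_from_100, median, 100)))
# large: the bootstrap medians bounce between ~0 and ~200
```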
-
-
standard error of regression coefficients