recommender systems
-
e.g.
netflix => movies => to watch; amazon => products => to buy; twitter => users => to follow
-
data set
users_interests
# One list of interests per user; a user's id is its position in this list.
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"],
]
-
recommending what’s popular
-
easy approach is to simply recommend what’s popular
# NOTE: `Counter` is `collections.Counter`; no import is visible in these notes.
# (interest, number of users listing it) pairs, most common first.
popular_interests = Counter(
    interest
    for user_interests in users_interests
    for interest in user_interests
).most_common()
# [('Python', 4),
#  ('R', 4),
#  ('Java', 3),
#  ('regression', 3),
#  ('statistics', 3),
#  ('probability', 3),
#  # ...
# ]


# just suggest to a user
# the most popular interests
# that he's not already interested in
def most_popular_new_interests(user_interests, max_results=5):
    """Return the most popular interests the user does not already have.

    Args:
        user_interests: the interests the user already has.
        max_results: maximum number of suggestions to return.

    Returns:
        A list of at most `max_results` (interest, frequency) pairs taken
        from `popular_interests`, in the same most-common-first order.
    """
    suggestions = [(interest, frequency)
                   for interest, frequency in popular_interests
                   if interest not in user_interests]
    return suggestions[:max_results]


# e.g. user 1, with interests
# ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"]
# recommend
# most_popular_new_interests(users_interests[1], 5)
# [('Python', 4), ('R', 4), ('Java', 3), ('regression', 3), ('statistics', 3)]
-
-
user-based collaborative filtering
-
look for users who are somehow
similar
# to him
# need a way to measure how similar two users are


# cosine similarity
def consine_similarity(v, w):
    """Return the cosine of the angle between vectors `v` and `w`.

    The (misspelt) name is kept because other code in this file calls it.

    - If `v` and `w` point in the same direction, the numerator and the
      denominator are equal and the similarity is 1 (very similar).
    - If they point in opposite directions, the similarity is -1.
    - If `v` or `w` is the zero vector, the similarity is defined as 0
      (not very similar) instead of dividing by zero.

    Args:
        v, w: sequences of numbers; they are paired up element by element.

    Returns:
        The similarity as a float.
    """
    # NOTE: uses `math`; the dot products are computed inline because no
    # `dot` helper is defined in the visible source.
    dot_vw = sum(v_i * w_i for v_i, w_i in zip(v, w))
    dot_vv = sum(v_i * v_i for v_i in v)
    dot_ww = sum(w_i * w_i for w_i in w)

    norm_product = math.sqrt(dot_vv * dot_ww)
    if norm_product == 0:
        # At least one vector is all zeros: the angle is undefined, so
        # report "no similarity" as the notes above promise.
        return 0.0
    return dot_vw / norm_product
-
steps
# 1. unique_interests
unique_interests = sorted({interest
                           for user_interests in users_interests
                           for interest in user_interests})
# list
# ['Big Data',
#  'C++',
#  'Cassandra',
#  'HBase',
#  'Hadoop',
#  'Haskell',
#  # ...
# ]


# 2. produce an `interest` vector of 0s and 1s for each user
def make_user_interest_vector(user_interests):
    """Turn a list of interests into a 0/1 vector over `unique_interests`.

    Element i of the result is 1 if unique_interests[i] is in
    `user_interests`, and 0 otherwise.
    """
    return [1 if interest in user_interests else 0
            for interest in unique_interests]


# 3. create a matrix
# Built as a real list rather than a lazy `map` object: it is iterated in
# nested loops below and indexed by user id later, which a one-shot
# iterator (what `map` returns on Python 3) cannot support.
user_interest_matrix = [make_user_interest_vector(user_interests)
                        for user_interests in users_interests]
# user_interest_matrix[i][j] == 1 if user i specified interest j
#                               0 otherwise

# 4. compute pairwise similarities
user_similarities = [[consine_similarity(interest_vector_i, interest_vector_j)
                      for interest_vector_j in user_interest_matrix]
                     for interest_vector_i in user_interest_matrix]
# user_similarities[0][9] == 0.57  # share interests in `hadoop, java, big data`
# user_similarities[0][8] == 0.19  # share only one interest `big data`

# 5. user_similarities[i] is the vector of i's similarities to every other user


# finds the most similar users to a given user
# not to include the user herself
# nor any users with zero similarity
# sort the results from most similar to least similar
def most_similar_users_to(user_id):
    """Return (other_user_id, similarity) pairs, most similar first.

    The user herself and any user with a similarity of zero (or less) are
    left out.
    """
    pairs = [(other_user_id, similarity)
             for other_user_id, similarity in enumerate(user_similarities[user_id])
             if user_id != other_user_id and similarity > 0]
    # Index the pair instead of unpacking it in the lambda signature:
    # tuple parameters are a syntax error on Python 3.
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)


# e.g. most_similar_users_to(0)
# [(9, 0.5669467095138409),
#  (1, 0.3380617018914066),
#  (8, 0.1889822365046136),
#  (13, 0.1690308509457033),
#  (5, 0.1543033499620919)]

# 6.
# suggest new interests to a user
def user_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests to a user based on what similar users like.

    Each interest is weighted by the summed similarity of the similar users
    (as returned by `most_similar_users_to`) who list it.

    Args:
        user_id: index of the user in `users_interests`.
        include_current_interests: when true, interests the user already
            has are kept in the result; by default they are dropped.

    Returns:
        A list of (interest, weight) pairs sorted by descending weight.
    """
    # NOTE: `defaultdict` is `collections.defaultdict`; no import is visible
    # in these notes.
    # sum up the similarities
    weights = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            weights[interest] += similarity

    # convert them to a sorted list
    # (index the pair rather than unpacking it in the lambda signature,
    # which is a syntax error on Python 3)
    suggestions = sorted(weights.items(),
                         key=lambda pair: pair[1],
                         reverse=True)

    # and (maybe) exclude already-interests
    if include_current_interests:
        return suggestions
    return [(suggestion, weight)
            for suggestion, weight in suggestions
            if suggestion not in users_interests[user_id]]


# user_based_suggestions(0)
# [('MapReduce', 0.5669467095138409),
#  ('MongoDB', 0.50709255283711),
#  ('Postgres', 0.50709255283711),
#  ('NoSQL', 0.3380617018914066),
#  ('neural networks', 0.1889822365046136),
#  ('deep learning', 0.1889822365046136),
#  ('artificial intelligence', 0.1889822365046136),
#  # ...
# ]
-
-
item-based collaborative filtering
-
compute similarities between interests directly
# generate suggestions for each user by aggregating interests that are similar to her current interests
-
steps
# 1. transpose user-interest matrix
#    rows => interests
#    cols => users
# zip(*rows) yields one column per interest in a single pass, so this also
# works when `user_interest_matrix` is a one-shot iterable.
interest_user_matrix = [list(interest_column)
                        for interest_column in zip(*user_interest_matrix)]
# e.g. unique_interests[0] => 'Big Data', so interest_user_matrix[0] is
# [1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]
# i.e. users 0, 8 and 9

# 2. use cosine similarity
#    similarity 1 <= if precisely the same users are interested in two topics
#    similarity 0 <= if no user is interested in both topics
interest_similarities = [[consine_similarity(user_vector_i, user_vector_j)
                          for user_vector_j in interest_user_matrix]
                         for user_vector_i in interest_user_matrix]
# The matrix was originally bound to the singular name while the function
# below read the plural one; keep both names pointing at the same object.
interest_similarity = interest_similarities


# 3. e.g. find interests most similar to bigdata (interest 0)
def most_similar_interest_to(interest_id):
    """Return (interest_name, similarity) pairs, most similar first.

    The interest itself and any interest with a similarity of zero (or
    less) are left out.

    Args:
        interest_id: index of the interest in `unique_interests`.
    """
    similarities = interest_similarities[interest_id]
    pairs = [(unique_interests[other_interest_id], similarity)
             for other_interest_id, similarity in enumerate(similarities)
             if interest_id != other_interest_id and similarity > 0]
    # Index the pair instead of unpacking it in the lambda signature:
    # tuple parameters are a syntax error on Python 3.
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)


# most_similar_interest_to(0)
# [('Hadoop', 0.8164965809277261),
#  ('Java', 0.6666666666666666),
#  ('MapReduce', 0.5773502691896258),
#  ('Spark', 0.5773502691896258),
#  ('Storm', 0.5773502691896258),
#  ('Cassandra', 0.4082482904638631),
#  ('artificial intelligence', 0.4082482904638631),
#  ('deep learning', 0.4082482904638631),
#  ('neural networks', 0.4082482904638631),
#  ('HBase', 0.3333333333333333)]

# 4.
# create recommendations for a user
# by summing up the similarities of the interests
# similar to his
def item_based_suggestions(user_id, include_current_interest=False):
    """Suggest interests to a user based on interest-to-interest similarity.

    For every interest the user has, the similarities of the interests
    returned by `most_similar_interest_to` are added up per interest.

    Args:
        user_id: index of the user in `user_interest_matrix` and
            `users_interests`.
        include_current_interest: when true, interests the user already
            has are kept in the result; by default they are dropped.

    Returns:
        A list of (interest, weight) pairs sorted by descending weight.
    """
    # NOTE: `defaultdict` is `collections.defaultdict`; no import is visible
    # in these notes.
    # add up the similar interests
    weights = defaultdict(float)
    user_interest_vector = user_interest_matrix[user_id]
    for interest_id, is_interested in enumerate(user_interest_vector):
        if not is_interested:
            continue
        for interest, similarity in most_similar_interest_to(interest_id):
            weights[interest] += similarity

    # sort them by weight
    # (index the pair rather than unpacking it in the lambda signature,
    # which is a syntax error on Python 3)
    suggestions = sorted(weights.items(),
                         key=lambda pair: pair[1],
                         reverse=True)

    if include_current_interest:
        return suggestions
    return [(suggestion, weight)
            for suggestion, weight in suggestions
            if suggestion not in users_interests[user_id]]


# 5. e.g. user 0
# [('MapReduce', 1.861807319565799),
#  ('Postgres', 1.3164965809277263),
#  ('MongoDB', 1.3164965809277263),
#  ('NoSQL', 1.2844570503761732),
#  ('programming languages', 0.5773502691896258),
#  ('MySQL', 0.5773502691896258),
#  ('Haskell', 0.5773502691896258),
#  ('databases', 0.5773502691896258),
#  ('neural networks', 0.4082482904638631),
#  ('deep learning', 0.4082482904638631),
#  ('C++', 0.4082482904638631),
#  ('artificial intelligence', 0.4082482904638631),
#  ('Python', 0.2886751345948129),
#  ('R', 0.2886751345948129)]
-