nlp
-
word clouds
-
buzzword cloud
data = [ ("big data", 100, 15), ("Hadoop", 95, 25), ("Python", 75, 50), ("R", 50, 40), ("machine learning", 80, 20), ("statistics", 20, 60), ("data science", 60, 70), ("analytics", 90, 3), ("team player", 85, 85), ("dynamic", 2, 90), ("synergies", 70, 0), ("actionable insights", 40, 30), ("think out of the box", 45, 10), ("self-starter", 30, 50), ("customer focus", 65, 15), ("thought leadership", 35, 35)]
-
code
import matplotlib.pyplot as plt

def text_size(total):
    """equals 8 if total is 0, 28 if total is 200"""
    return 8 + total / 200.0 * 20    # float division, so Python 2 doesn't truncate

for word, job_popularity, resume_popularity in data:
    plt.text(job_popularity, resume_popularity, word,
             ha='center', va='center',
             size=text_size(job_popularity + resume_popularity))

plt.xlabel('popularity on job postings')
plt.ylabel('popularity on resumes')
plt.axis([0, 100, 0, 100])
plt.xticks([])
plt.yticks([])
plt.show()
-
-
n-gram models
-
retrieve the data (with the tools from ch09)
import re
import requests
from bs4 import BeautifulSoup

# the page uses curly apostrophes (u'\u2019'); replace them with plain ones
def fix_unicode(text):
    return text.replace(u"\u2019", "'")

url = 'http://radar.oreilly.com/2010/06/what-is-data-science.html'
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')

content = soup.find('div', 'entry-content')   # the article text lives in this div
regex = r"[\w']+|[\.]"                        # matches a word or a period

document = []
for paragraph in content('p'):
    words = re.findall(regex, fix_unicode(paragraph.text))
    document.extend(words)
-
zip stops when any of its inputs is done
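a tiny illustration of that (not from the book):

zip([1, 2, 3], ['a', 'b'])    # [(1, 'a'), (2, 'b')] in Python 2
                              # (wrap it in list() on Python 3 to see the pairs)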
import random
from collections import defaultdict

# bigrams: pairs of consecutive words
bigrams = zip(document, document[1:])

transitions = defaultdict(list)
for prev, current in bigrams:
    transitions[prev].append(current)

def generate_using_bigrams():
    current = '.'    # this means the next word will start a sentence
    result = []
    while True:
        next_word_candidates = transitions[current]    # bigrams (current, _)
        current = random.choice(next_word_candidates)  # choose one at random
        result.append(current)                         # append it to the result
        if current == '.':                             # if '.' we're done
            return ' '.join(result)
-
trigrams: triplets of consecutive words
# n-grams: n consecutive words
trigrams = zip(document, document[1:], document[2:])
trigram_transitions = defaultdict(list)
starts = []

for prev, current, next_word in trigrams:
    if prev == '.':               # the previous "word" was a period,
        starts.append(current)    # so this word starts a sentence
    trigram_transitions[(prev, current)].append(next_word)

def generate_using_trigrams():
    current = random.choice(starts)    # choose a random starting word
    prev = '.'                         # and precede it with a '.'
    result = [current]
    while True:
        next_word_candidates = trigram_transitions[(prev, current)]
        next_word = random.choice(next_word_candidates)
        prev, current = current, next_word
        result.append(current)
        if current == '.':
            return ' '.join(result)
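a usage sketch (hypothetical; the actual output depends on the fetched document and the random seed):

random.seed(0)
print generate_using_bigrams()     # gibberish, but each word pair occurs in the source
print generate_using_trigrams()    # usually a bit more coherent, since each word
                                   # is conditioned on the two words before it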
-
-
grammars
-
define
grammar = { "_S" : ["_NP _VP"], "_NP" : ["_N", "_A _NP _P _A _N"], "_VP" : ["_V", "_V _NP"], "_N" : ["data science", "Python", "regression"], "_A" : ["big", "linear", "logistic"], "_P" : ["about", "near"], "_V" : ["learns", "trains", "tests", "is"] } # _S sentence # _NP noun phrase # _VP verb phrase
-
how to generate sentences from this grammar
['_S']
['_NP','_VP']
['_N','_VP']
['Python','_VP']
['Python','_V','_NP']
['Python','trains','_NP']
['Python','trains','_A','_NP','_P','_A','_N']
['Python','trains','logistic','_NP','_P','_A','_N']
['Python','trains','logistic','_N','_P','_A','_N']
['Python','trains','logistic','data science','_P','_A','_N']
['Python','trains','logistic','data science','about','_A','_N']
['Python','trains','logistic','data science','about','logistic','_N']
['Python','trains','logistic','data science','about','logistic','Python']
-
code
def is_terminal(token):
    return token[0] != '_'

def expand(grammar, tokens):
    for i, token in enumerate(tokens):
        # skip over terminals
        if is_terminal(token):
            continue
        # we found a non-terminal, so choose a replacement at random
        replacement = random.choice(grammar[token])
        if is_terminal(replacement):
            tokens[i] = replacement
        else:
            tokens = tokens[:i] + replacement.split() + tokens[(i+1):]
        # now call expand on the new list of tokens
        return expand(grammar, tokens)
    # if we get here, all tokens were terminals and we're done
    return tokens

# generating sentences
def generate_sentence(grammar):
    return expand(grammar, ['_S'])
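a usage sketch: expand returns a list of terminal tokens, so join them to get a sentence (the example output is the one traced above)

print ' '.join(generate_sentence(grammar))
# e.g. "Python trains logistic data science about logistic Python"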
-
-
an aside: gibbs sampling
-
sampling from some distributions is easy
# uniform random variables with
random.random()

# and normal random variables with
inverse_normal_cdf(random.random())

# but some distributions are harder to sample from
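inverse_normal_cdf isn't defined in this section; a minimal sketch of one way to get it (binary search over the normal cdf, built on math.erf, roughly as in the book's probability chapter):

import math

def normal_cdf(x, mu=0, sigma=1):
    return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2

def inverse_normal_cdf(p, mu=0, sigma=1, tolerance=0.00001):
    """find an approximate inverse using binary search"""
    # if not standard, compute standard and rescale
    if mu != 0 or sigma != 1:
        return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance)
    low_z = -10.0                     # normal_cdf(-10) is (very close to) 0
    hi_z = 10.0                       # normal_cdf(10) is (very close to) 1
    while hi_z - low_z > tolerance:
        mid_z = (low_z + hi_z) / 2    # consider the midpoint
        mid_p = normal_cdf(mid_z)     # and the cdf's value there
        if mid_p < p:
            low_z = mid_z             # midpoint is too low, search above it
        else:
            hi_z = mid_z              # midpoint is too high, search below it
    return mid_z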
-
gibbs sampling is a technique
# for generating samples from multidimensional distributions
# when we only know some of the conditional distributions
-
e.g.
# rolling two dice
# x = value of the first die
# y = sum of both dice
# we want to generate lots of (x, y) samples

def roll_a_die():
    return random.choice([1, 2, 3, 4, 5, 6])

def direct_sample():
    d1 = roll_a_die()
    d2 = roll_a_die()
    return d1, d1 + d2

# we know the conditional distributions
def random_y_given_x(x):
    """equally likely to be x + 1, x + 2, ..., x + 6"""
    return x + roll_a_die()

def random_x_given_y(y):
    if y <= 7:
        # if the total is 7 or less, the first die
        # is equally likely to be 1, 2, ..., (total - 1)
        return random.randrange(1, y)
    else:
        # if the total is more than 7, the first die
        # is equally likely to be (total - 6), (total - 5), ..., 6
        return random.randrange(y - 6, 7)

# the way gibbs sampling works:
# start with any (valid) values for x and y,
# then repeatedly alternate replacing x with a random value picked conditional on y
# and replacing y with a random value picked conditional on x;
# after a number of iterations the resulting values of x and y
# represent a sample from the unconditional joint distribution

def gibbs_sample(num_iters=100):
    x, y = 1, 2    # starting values don't really matter
    for _ in range(num_iters):
        x = random_x_given_y(y)
        y = random_y_given_x(x)
    return x, y

# compare with direct sampling
def compare_distributions(num_samples=1000):
    counts = defaultdict(lambda: [0, 0])
    for _ in range(num_samples):
        counts[gibbs_sample()][0] += 1
        counts[direct_sample()][1] += 1
    return counts
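a quick sanity check (hypothetical, not from the book): with enough samples, the two count columns should be roughly equal for every (x, y) pair

counts = compare_distributions(10000)
for xy, (gibbs_count, direct_count) in sorted(counts.items()):
    print xy, gibbs_count, direct_count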
-
-
topic modeling
-
latent dirichlet allocation (lda)
# lda has some similarities to the naive bayes classifier
# it assumes a probabilistic model for documents
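the generative story lda assumes (paraphrased, not a quote from the book): each document is a mixture of K topics, each topic is a distribution over words, and every word in a document is produced by first picking a topic from the document's mixture and then picking a word from that topic. a hypothetical sketch of that direction, using the sample_from helper defined in the code below:

# illustration of the generative direction only
# (the code below runs the other way, inferring topics from observed words)
def generate_word(doc_topic_weights, topic_word_dists):
    # pick a topic for this word from the document's topic mixture
    topic = sample_from(doc_topic_weights)
    # then pick a word from that topic's word distribution
    words, word_weights = topic_word_dists[topic]
    return words[sample_from(word_weights)]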
-
code
from collections import Counter

K = 4    # the number of topics we'll try to find

def sample_from(weights):
    """returns i with probability weights[i] / sum(weights)"""
    total = sum(weights)
    rnd = total * random.random()    # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w                     # return the smallest i such that
        if rnd <= 0:                 # weights[0] + ... + weights[i] >= rnd
            return i

documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

# a list of counters, one for each document
document_topic_counts = [Counter() for _ in documents]

# a list of counters, one for each topic
topic_word_counts = [Counter() for _ in range(K)]

# a list of numbers, one for each topic
topic_counts = [0 for _ in range(K)]

# a list of numbers, one for each document
document_lengths = map(len, documents)

# the number of distinct words
distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)

D = len(documents)

def p_topic_given_document(topic, d, alpha=0.1):
    """the fraction of words in document d
    that are assigned to topic (plus some smoothing)"""
    return ((document_topic_counts[d][topic] + alpha) /
            (document_lengths[d] + K * alpha))

def p_word_given_topic(word, topic, beta=0.1):
    """the fraction of words assigned to topic
    that equal word (plus some smoothing)"""
    return ((topic_word_counts[topic][word] + beta) /
            (topic_counts[topic] + W * beta))

def topic_weight(d, word, k):
    """given a document and a word in that document,
    return the weight for the kth topic"""
    return p_word_given_topic(word, k) * p_topic_given_document(k, d)

def choose_new_topic(d, word):
    return sample_from([topic_weight(d, word, k)
                        for k in range(K)])

# start by assigning every word to a random topic
# and populating our counters accordingly
random.seed(0)
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]

for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

for iter in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):
            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1

# the most heavily weighted words for each topic
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count > 0:
            print k, word, count

# based on these, assign topic names
topic_names = ["Big Data and programming languages",
               "Python and statistics",
               "databases",
               "machine learning"]

# see how the model assigns topics to each user's interests
for document, topic_counts in zip(documents, document_topic_counts):
    print document
    for topic, count in topic_counts.most_common():
        if count > 0:
            print topic_names[topic], count,
    print

# which gives
['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
Big Data and programming languages 4
databases 3

['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
databases 5

['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas']
Python and statistics 5
machine learning 1
-