stdin and stdout
-
pipe data
-
code
# egrep.py
import sys, re

# sys.argv is the list of command-line arguments
# sys.argv[0] is the name of the program itself
# sys.argv[1] will be the regex specified at the command line
regex = sys.argv[1]

# for every line passed into the script
for line in sys.stdin:
    # if it matches the regex, write it to stdout
    if re.search(regex, line):
        sys.stdout.write(line)
-
line count
# line_count.py
import sys

count = 0
for line in sys.stdin:
    count += 1

# print goes to sys.stdout
print count
-
run
# on windows
> type demo.txt | python egrep.py "[0-9]" | python line_count.py

# unix
> cat demo.txt | python egrep.py "[0-9]" | python line_count.py
-
-
word count
-
code
# most_common_words.py
import sys
from collections import Counter

# pass in the number of words as the first argument
try:
    num_words = int(sys.argv[1])
except:
    print "usage: most_common_words.py num_words"
    sys.exit(1)   # non-zero exit code indicates error

counter = Counter(word.lower()                       # lowercase words
                  for line in sys.stdin
                  for word in line.strip().split()   # split on spaces
                  if word)                           # skip empty 'words'

for word, count in counter.most_common(num_words):
    sys.stdout.write(str(count))
    sys.stdout.write('\t')
    sys.stdout.write(word)
    sys.stdout.write('\n')
-
run
> type the_bible.txt | python most_common_words.py 10
-
reading files
-
text files
-
code
# 'r' = read-only, 'w' = write, 'a' = append
reading = open('demo.txt', 'r')
writing = open('demo.txt', 'w')
appending = open('demo.txt', 'a')

# close files when you're done with them
reading.close()
-
use
with
with open('demo.txt', 'r') as f:
    data = func_gets_data_from(f)

# at this point f has already been closed
process(data)
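-
read whole file
A minimal sketch (demo.txt is just an illustrative filename): f.read() returns the file's entire contents as a single string, which is handy for small files.
with open('demo.txt', 'r') as f:
    contents = f.read()

word_count = len(contents.split())   # rough word count of the whole file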
-
iterate
import re

starts_with_hash = 0

with open('demo.txt', 'r') as f:
    for line in f:                    # look at each line in the file
        if re.match("^#", line):      # use a regex to see if it starts with '#'
            starts_with_hash += 1     # if it does, add 1 to the count
-
delimited files
-
tab-separated comma-separated
-
code
import csv

with open('tab.txt', 'rb') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        date = row[0]
        symbol = row[1]
        price = float(row[2])
        process(date, symbol, price)
-
skip header row
# csv.DictReader uses the header row for the keys, so each row comes back as a dict
with open('tab.txt', 'rb') as f:
    reader = csv.DictReader(f, delimiter=':')
    for row in reader:
        date = row['date']
        symbol = row['symbol']
        price = row['price']
        process(date, symbol, price)
-
write
today_prices = { 'AAPL' : 90.91, 'MSFT' : 41.68, 'FB' : 64.5 }

with open('demo.txt', 'wb') as f:
    writer = csv.writer(f, delimiter=',')
    for stock, price in today_prices.items():
        writer.writerow([stock, price])
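-
write dict rows
If the rows are already dicts, csv.DictWriter writes a header row and then the fields in a fixed order; a minimal sketch (filename and field names are illustrative):
import csv

prices = [ {'symbol' : 'AAPL', 'price' : 90.91},
           {'symbol' : 'MSFT', 'price' : 41.68} ]

with open('prices_with_header.txt', 'wb') as f:
    writer = csv.DictWriter(f, fieldnames=['symbol', 'price'], delimiter=',')
    writer.writeheader()                 # writes "symbol,price" as the first row
    for row in prices:
        writer.writerow(row)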
-
scraping the web
-
html and parsing thereof
-
html parser
# python's built-in html parser is not that lenient, so install a better one:
#   $ pip install beautifulsoup4 requests html5lib

from bs4 import BeautifulSoup
import requests

html = requests.get('http://www.example.com').text
soup = BeautifulSoup(html, 'html5lib')

# find the first <p> tag
first_paragraph = soup.find('p')        # or just soup.p

# the `text` property gives the text contents of a tag
first_paragraph_text = soup.p.text
first_paragraph_words = soup.p.text.split()

# a tag's attributes work like a dict
first_paragraph_id = soup.p['id']       # raises KeyError if no 'id'
first_paragraph_id2 = soup.p.get('id')  # returns None if no 'id'

# get multiple tags at once
all_paragraphs = soup.find_all('p')     # or just soup('p')
paragraphs_with_ids = [p for p in soup('p') if p.get('id')]

# find tags with a specific class
important_paragraphs = soup('p', {'class' : 'important'})
important_paragraphs2 = soup('p', 'important')
important_paragraphs3 = [p for p in soup('p')
                         if 'important' in p.get('class', [])]

# combine these to implement more elaborate logic
spans_inside_divs = [span
                     for div in soup('div')   # for each <div> on the page
                     for span in div('span')] # find each <span> inside it
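-
collect links
The same calls work for any tag; a minimal sketch (reusing the soup object above) that collects the href of every <a> tag on the page:
all_links = [a.get('href')
             for a in soup('a')          # every <a> tag
             if a.get('href')]           # that actually has an href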
-
e.g. o’reilly books about data
-
e.g.
-
url
http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page=1
-
robots.txt
Crawl-delay: 30
Request-rate: 1/30

# 1. wait 30 seconds between requests
# 2. request only one page every 30 seconds
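-
check robots.txt in code
The standard library can check robots.txt rules programmatically; a minimal sketch using Python 2's robotparser module (urllib.robotparser in Python 3), which answers whether a given URL may be fetched. can_fetch only covers allow/disallow rules, so the crawl delay above still has to be honored by hand, as the sleep(30) in the scraping loop below does.
import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("http://shop.oreilly.com/robots.txt")
rp.read()

# True if the rules allow an anonymous crawler to fetch this page
print rp.can_fetch("*", "http://shop.oreilly.com/category/browse-subjects/data.do")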
-
code
import re
import requests
from bs4 import BeautifulSoup

url = "http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page=1"
soup = BeautifulSoup(requests.get(url).text, 'html5lib')

tds = soup('td', 'thumbtext')
print len(tds)   # 30

def is_video(td):
    """it's a video if it has exactly one pricelabel, and if
    the stripped text inside that pricelabel starts with 'Video'"""
    pricelabels = td('span', 'pricelabel')
    return (len(pricelabels) == 1 and
            pricelabels[0].text.strip().startswith('Video'))

print len([td for td in tds if not is_video(td)])   # 21

def book_info(td):
    """given a `td` tag representing a book, extract the book's
    details (title, authors, isbn, date) and return them as a dict"""
    title = td.find('div', 'thumbheader').a.text
    by_author = td.find('div', 'AuthorName').text
    authors = [x.strip() for x in re.sub('^By ', '', by_author).split(',')]
    isbn_link = td.find('div', 'thumbheader').a.get('href')
    isbn = re.match(r'product/(.*)\.do', isbn_link).groups()[0]
    date = td.find('span', 'directorydate').text.strip()

    return {
        "title" : title,
        "authors" : authors,
        "isbn" : isbn,
        "date" : date
    }
-
scrape
from bs4 import BeautifulSoup
import requests
from time import sleep

base_url = "http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page="

books = []
NUM_PAGES = 31   # at the time of writing, probably more by now

for page_num in range(1, NUM_PAGES + 1):
    print "souping page", page_num, ",", len(books), " found so far"
    url = base_url + str(page_num)
    soup = BeautifulSoup(requests.get(url).text, 'html5lib')

    for td in soup('td', 'thumbtext'):
        if not is_video(td):
            books.append(book_info(td))

    # now be a good citizen and respect the robots.txt!
    sleep(30)
-
plot
from collections import Counter
import matplotlib.pyplot as plt

def get_year(book):
    """book["date"] looks like 'November 2014' so we need to
    split on the space and then take the second piece"""
    return int(book["date"].split()[1])

# 2014 is the last complete year of data (when I ran this)
year_counts = Counter(get_year(book) for book in books
                      if get_year(book) <= 2014)

years = sorted(year_counts)
book_counts = [year_counts[year] for year in years]

plt.plot(years, book_counts)
plt.ylabel("# of data books")
plt.title("Data is Big!")
plt.show()
-
using apis
-
json (and xml)
-
code
import json

serialized = """{ "title" : "Data Science Book",
                  "author" : "Joel Grus",
                  "publicationYear" : 2014,
                  "topics" : [ "data", "science", "data science"] }"""

# parse the JSON to create a Python dict
deserialized = json.loads(serialized)
if "data science" in deserialized["topics"]:
    print deserialized
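-
serialize
Going the other way, json.dumps turns a Python object back into a JSON string (indent is optional and just makes the output readable):
reserialized = json.dumps(deserialized)

print json.dumps(deserialized, indent=2)   # pretty-printed JSON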
-
-
using an unauthenticated api
-
code
import requests, json

endpoint = "https://api.github.com/users/joelgrus/repos"

repos = json.loads(requests.get(endpoint).text)
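-
check the response
requests can decode the JSON itself; a minimal sketch that checks the status code before parsing (a failed or rate-limited request would otherwise give a confusing error later):
response = requests.get(endpoint)
if response.status_code == 200:
    repos = response.json()               # same result as json.loads(response.text)
else:
    print "request failed:", response.status_code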
-
python-dateutil
from collections import Counter
from dateutil.parser import parse

# repo["created_at"] is a timestamp string, so parse it into a datetime
dates = [parse(repo["created_at"]) for repo in repos]
month_counts = Counter(date.month for date in dates)
weekday_counts = Counter(date.weekday() for date in dates)

last_5_repositories = sorted(repos,
                             key=lambda r: r["created_at"],
                             reverse=True)[:5]
last_5_languages = [repo["language"] for repo in last_5_repositories]
-
-
finding apis
-
e.g. using the twitter apis
-
getting credentials
-
using twython
from twython import Twython

twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET)

# search for tweets containing the phrase "data science"
for status in twitter.search(q='"data science"')["statuses"]:
    user = status["user"]["screen_name"].encode('utf-8')
    text = status["text"].encode('utf-8')
    print user, ":", text
    print
-
streaming api
from collections import Counter
from twython import TwythonStreamer

# appending data to a global variable is pretty poor form
# but it makes the example much simpler
tweets = []

class MyStreamer(TwythonStreamer):
    """our own subclass of TwythonStreamer that specifies
    how to interact with the stream"""

    def on_success(self, data):
        """what do we do when twitter sends us data?
        here data will be a Python dict representing a tweet"""

        # only want to collect English-language tweets
        if data['lang'] == 'en':
            tweets.append(data)
            print "received tweet #", len(tweets)

        # stop when we've collected enough
        if len(tweets) >= 1000:
            self.disconnect()

    def on_error(self, status_code, data):
        print status_code, data
        self.disconnect()

stream = MyStreamer(CONSUMER_KEY, CONSUMER_SECRET,
                    ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# starts consuming public statuses that contain the keyword 'data'
stream.statuses.filter(track='data')

# if instead we wanted to start consuming a sample of *all* public statuses
# stream.statuses.sample()

top_hashtags = Counter(hashtag['text'].lower()
                       for tweet in tweets
                       for hashtag in tweet["entities"]["hashtags"])

print top_hashtags.most_common(5)
-