#---
# Excerpted from "Everyday Scripting in Ruby"
# We make no guarantees that this code is fit for any purpose.
# Visit http://www.pragmaticprogrammer.com/titles/bmsft for more book information.
#---
require 'uri'
require 'open-uri'
require 'csv'
### This does the bulk of the work
# This simple version has some problems, but is the
# framework for the more complete version.
# Simple walk: fetch the page at +url+, print its formatted book info, then
# move on to the first entry of that page's affinity list. Repeats +steps+
# times (default 10). Nothing stops this version from bouncing between the
# same books.
def trip(url, steps=10)
  steps.times do
    html = fetch(url)
    puts format_output(scrape_book_info(html))
    url = scrape_affinity_list(html).first[:url]
  end
end
# This more complete version replaces the simple version above.
# Walks from book to book starting at +url+, printing one formatted line per
# book visited.
#
# url   - address of the first page to fetch.
# steps - maximum number of pages to visit (default 10).
#
# After each page, the walk follows the first entry returned by
# scrape_affinity_list whose title has not been printed yet, so no book is
# visited twice. The walk ends early when a page offers no unvisited entry;
# without that guard the nil result of +find+ raised NoMethodError.
def trip(url, steps = 10)
  visited_titles = []
  steps.times do
    page = fetch(url)
    book_info = scrape_book_info(page)
    visited_titles << book_info[:title]
    puts format_output(book_info)
    next_book = scrape_affinity_list(page).find do |candidate|
      !visited_titles.include?(candidate[:title])
    end
    break unless next_book # nothing new to follow: stop instead of failing

    url = next_book[:url]
  end
end
### Fetching Amazon book pages
# Returns the Amazon product-page URL for the book identified by +isbn+
# (a String that is appended to the product path).
def url_for(isbn)
  product_base = "http://www.amazon.com/gp/product/"
  product_base + isbn
end
# Reads the resource at +url+ and returns its whole body.
#
# Goes through URI.open (provided by open-uri) instead of Kernel#open:
# since Ruby 3.0 Kernel#open no longer opens URLs, so the bare +open+ call
# would treat the address as a local file name and fail.
# The block form closes the underlying stream once the body has been read.
def fetch(url)
  URI.open(url, &:read)
end
### Scraping information out of Amazon book pages
# Collects what is known about the book on a page into a Hash:
#   :title   - the value returned by scrape_title
#   :authors - the value returned by scrape_authors
# Before scraping, +html+ is narrowed by restrict to the region between a
# starting and a stopping pattern.
def scrape_book_info(html)
retval = {}
# NOTE(review): the two regexp arguments passed to restrict below look
# truncated (only a lone "/" and a "\s*})" tail remain) - presumably the HTML
# markup inside the patterns was lost; restore them before running this code.
html = restrict(html,
/
\s*})
retval[:title] = scrape_title(html)
retval[:authors] = scrape_authors(html)
retval
end
# Extracts the title capture from +html+ and hands it to clean_title.
# When the pattern does not match, clean_title receives nil.
# NOTE(review): the pattern consists of nothing but its capture group -
# presumably the surrounding markup was lost; confirm against the original.
def scrape_title(html)
  title_pattern = %r{(.*?)}m
  found = title_pattern.match(html)
  clean_title(found && found[1])
end
# Returns an Array holding every author capture found in +html+, each one
# passed through clean_author.
# NOTE(review): the pattern consists of nothing but its capture group -
# presumably the surrounding anchor markup was lost; confirm against the
# original.
def scrape_authors(html)
  author_pattern = %r{(.+?)}m
  html.scan(author_pattern).flat_map do |captures|
    captures.map { |raw_name| clean_author(raw_name) }
  end
end
# Returns the part of +html+ that begins where +starting_regexp+ first
# matches and runs up to the position where +stopping_regexp+ first matches
# at or after that point.
#
# The slice is inclusive of the first character of the stop match (not the
# whole match), which is the behaviour callers have always received.
#
# If the stop pattern is not found, everything from the start match to the
# end of the string is returned.
#
# Raises ArgumentError when the start pattern is not found; previously that
# case surfaced as an unhelpful TypeError from passing nil to String#index.
def restrict(html, starting_regexp, stopping_regexp)
  start = html.index(starting_regexp)
  raise ArgumentError, "start marker #{starting_regexp.inspect} not found" unless start

  # -1 addresses the last character, so a missing stop marker keeps the rest.
  stop = html.index(stopping_regexp, start) || -1
  html[start..stop]
end
# Tidies a scraped title: drops a "(Paperback)" and then a "(Hardcover)"
# marker found at the end of a line, trims surrounding whitespace, and
# collapses runs of spaces into one.
def clean_title(amazon_title)
  # Applied one after the other, in this order; the patterns are kept
  # separate on purpose because they are easier to read that way.
  binding_markers = [
    /\(\s*Paperback\s*\)\s*$/m,
    /\(\s*Hardcover\s*\)\s*$/m
  ]
  without_binding = binding_markers.reduce(amazon_title) do |title, marker|
    title.gsub(marker, '')
  end
  without_binding.strip.squeeze(' ')
end
# Returns a copy of +amazon_author+ in which every run of consecutive
# spaces is collapsed into a single space. Leading and trailing spaces are
# kept (reduced to one).
def clean_author(amazon_author)
  single_space = ' '
  amazon_author.squeeze(single_space)
end
# Returns an Array of Hashes, one per related book found in +html+, each
# with the keys :url and :title.
#
# Returns an empty Array when the page has no "also bought" section, and
# skips list items that lack either an href or a title, instead of raising
# NoMethodError on the nil match as before.
#
# NOTE(review): several patterns below contain little or no literal text
# around their wildcards - presumably the HTML markup inside them was lost;
# confirm against the original before relying on the results.
def scrape_affinity_list(html)
  whole_list_matches = %r{also\s+bought.*?}m
  one_element_matches = %r{.*?}m
  html_affinity_list = html[whole_list_matches]
  return [] unless html_affinity_list

  entries = html_affinity_list.scan(one_element_matches).map do |item|
    link = /href\s*=\s*"(.*?)"/.match(item)
    title = %r{(.*?)}.match(item)
    { :url => link[1], :title => title[1] } if link && title
  end
  entries.compact # drop the items that could not be parsed
end
# How to print
# Formats +book_info+ by calling the formatter method named by FORMAT_STYLE
# (for example :normal_string or :csv_string) and returns its result.
#
# FORMAT_STYLE is only assigned when this file runs as a script, so when the
# file is loaded some other way the constant is missing; in that case the
# :normal_string formatter is used rather than raising NameError.
def format_output(book_info)
  style = defined?(FORMAT_STYLE) ? FORMAT_STYLE : :normal_string
  send(style, book_info)
end
# Plain output style: just the book's title. The authors in +book_info+
# are deliberately left out.
def normal_string(book_info)
  title_only = book_info[:title]
  title_only
end
# CSV output style: one CSV line with two fields - the title, and all
# authors joined into a single comma-separated field.
def csv_string(book_info)
  fields = [
    book_info[:title],
    book_info[:authors].join(', ')
  ]
  CSV.generate_line(fields)
end
# Script entry point; runs only when this file is executed directly.
# Usage: [--csv] [isbn]
#   --csv  print title and authors as CSV instead of the title alone
#   isbn   book to start from (a built-in default is used when omitted)
if $PROGRAM_NAME == __FILE__
  csv_requested = ARGV.first == '--csv'
  ARGV.shift if csv_requested # consume the flag so the ISBN is first
  FORMAT_STYLE = csv_requested ? :csv_string : :normal_string
  first_isbn = ARGV.first || '0974514055'
  trip(url_for(first_isbn))
end