#!/usr/bin/env python
"""Fetch every URL listed in ``urls.txt`` and save its stripped text.

For each URL the page is downloaded, converted to unicode with
``UnicodeDammit``, passed through ``striphtml`` and written as UTF-8 to
``corpus/<last path segment of the URL>``.

The ``corpus`` directory is not created by this script; it has to exist
before the script is run.
"""

import chardet
import sys
import os
import codecs
import urllib

from BeautifulSoup import UnicodeDammit
from striphtml import striphtml

# Encode everything printed to stdout as UTF-8.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# One URL per line. Blank lines are skipped so they are never handed to
# urlopen as an empty URL.
with open('urls.txt') as url_file:
    urls = [line.strip() for line in url_file if line.strip()]

for url in urls:
    print('spidering ' + url)

    # Close the connection as soon as the body has been read, even if the
    # read itself fails.
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()

    html = UnicodeDammit(html).unicode
    # NOTE(review): striphtml comes from a module not visible here;
    # presumably it removes the HTML markup -- confirm.
    text = striphtml(html)

    # Name the output after the last path segment. Trailing slashes are
    # dropped first, otherwise "http://host/dir/" would yield an empty name
    # and the script would try to open the corpus directory itself.
    filename = url.rstrip('/').split('/')[-1]
    print('saving as ' + filename)

    with codecs.open(os.path.join('corpus', filename),
                     encoding='utf-8', mode='w') as out:
        out.write(text)