#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Print the text content of an HTML document with markup removed.

Run as a script with one argument; that argument is handed to
``openAnything`` and whatever it returns is read, converted to unicode
and stripped of comments, scripts and tags.
"""

import codecs
import sys

import BeautifulSoup

from openanything import openAnything

# Route everything printed through a UTF-8 encoder so that unicode text
# can be written to stdout regardless of the terminal's default encoding.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)


def strip(html):
    """Return the text of *html* with comments, scripts and tags removed.

    The markup is parsed with BeautifulSoup, every comment node and every
    ``<script>`` element is detached from the tree, and the remaining text
    nodes are concatenated without a separator.

    Only text below ``<body>`` is returned when the document has a body.
    If there is no ``<body>`` element (for example an HTML fragment), the
    text of the whole parsed document is returned instead of failing.
    """
    soup = BeautifulSoup.BeautifulSoup(html)

    # Comments are text nodes in the tree, so they must be removed
    # explicitly or they would show up in the extracted text.
    comments = soup.findAll(
        text=lambda text: isinstance(text, BeautifulSoup.Comment))
    for comment in comments:
        comment.extract()

    # Script bodies are text nodes too; drop the whole element.
    for script in soup.findAll('script'):
        script.extract()

    # soup.body is None when the markup has no <body>; compare with
    # `is None` rather than relying on the truthiness of a tag object.
    root = soup.body
    if root is None:
        root = soup
    return ''.join(root.findAll(text=True))


def main():
    """Read the source named on the command line and print its text.

    Prints the raw content, a ``===`` separator, the type of the decoded
    content and finally the stripped text. Exits with status 2 and a
    usage message on stderr when no argument is given.
    """
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s SOURCE\n' % sys.argv[0])
        sys.exit(2)

    content = openAnything(sys.argv[1]).read()

    # NOTE(review): `content` is the raw, undecoded data. Under Python 2,
    # printing non-ASCII bytes through the UTF-8 writer installed above
    # looks like it can raise UnicodeDecodeError -- confirm, and consider
    # printing the decoded text instead.
    print('content: ')
    print(content)
    print('===')

    # Let BeautifulSoup guess the encoding and hand back unicode text.
    ucontent = BeautifulSoup.UnicodeDammit(content).unicode
    print(type(ucontent))
    print(strip(ucontent))


if __name__ == "__main__":
    main()