# -*- coding: utf-8 -*-
"""Print aggregate frequencies for the pages listed in italianurls.txt.

Each URL in the list is printed, fetched, decoded to unicode using the
encoding guessed by chardet, and converted from HTML to plain text.  The
per-document results of ``freq`` are then merged into one table, which is
printed sorted by key, one "key value" pair per line.
"""

from openAnything import openAnything
from html2text import html2text
from chardet import detect
from text import freq
import sys

# One URL per line.  Blank lines are skipped so that an empty string is
# never handed to the fetcher.
with open('italianurls.txt') as url_file:
    urls = [line.strip() for line in url_file if line.strip()]

for url in urls:
    print(url)


def url2unitext(url):
    """Fetch *url* and return its content as plain unicode text.

    The raw bytes are decoded with the encoding guessed by chardet and the
    resulting HTML is passed through html2text.
    """
    source = openAnything(url)
    try:
        raw = source.read()
    finally:
        # Release the underlying file/socket even if the read fails.
        source.close()

    # chardet reports an encoding of None when it cannot make a guess
    # (e.g. for an empty response); fall back to UTF-8 in that case.
    enc = detect(raw)['encoding'] or 'utf-8'
    # 'replace' keeps a single undecodable byte (or a wrong guess) from
    # aborting the whole run.
    html = raw.decode(enc, 'replace')
    return html2text(html)


docs = {}
for url in urls:
    docs[url] = url2unitext(url)

# Accumulate across documents.  A plain dict.update() would overwrite each
# key with the value from whichever document happened to be visited last.
# NOTE(review): assumes freq() yields additive per-document counts as a
# mapping or as (key, value) pairs -- confirm against text.freq.
letterfreqs = {}
for content in docs.values():
    for letter, count in dict(freq(content)).items():
        letterfreqs[letter] = letterfreqs.get(letter, 0) + count

for letter, count in sorted(letterfreqs.items()):
    # Single-argument print(...) gives the same "key value" output on
    # Python 2 and also parses on Python 3.
    print("%s %s" % (letter, count))