#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Small text helpers: n-grams, frequency counts and punctuation stripping."""

import sys
import os
import re
from collections import defaultdict
from operator import itemgetter

try:
    # Python 3 location of urlopen.
    from urllib.request import urlopen
except ImportError:
    # Python 2 location (the import this file originally used).
    from urllib import urlopen


def ngrams(seq, n):
    """Return every contiguous length-``n`` slice of ``seq``, in order.

    ``seq`` may be any sliceable sequence (string, list, ...); each n-gram
    has whatever type slicing ``seq`` produces.  When ``n`` is larger than
    ``len(seq)`` the result is an empty list.
    """
    return [seq[i:i + n] for i in range(len(seq) - n + 1)]


def freq(seq):
    """Count how many times each element occurs in ``seq``.

    Returns a ``defaultdict(int)`` mapping element -> occurrence count.
    """
    fq = defaultdict(int)
    for e in seq:
        fq[e] += 1
    return fq


def byfreq(seq, reversed=True):
    """Return ``(element, count)`` pairs for ``seq`` ordered by count.

    ``reversed`` (the name shadows the builtin, but is kept because callers
    may pass it by keyword) selects descending order when true, which is the
    default, and ascending order otherwise.  The sort is stable, so elements
    with equal counts keep the order in which the counting dict yields them.
    """
    return sorted(freq(seq).items(), key=itemgetter(1), reverse=reversed)


def depunc(text):
    """Replace each ASCII punctuation or whitespace character with a space.

    Characters are replaced one-for-one; runs are not collapsed.
    """
    # NOTE(review): the original referenced ``punctuation`` and ``whitespace``
    # without importing them anywhere; the ``string`` module constants are
    # assumed to be what was intended -- confirm.
    from string import punctuation, whitespace

    # re.escape keeps ']', '\\', '^' and '-' literal inside the character
    # class; unescaped, the backslash in string.punctuation would merely
    # escape the following ']' and never be matched itself.
    return re.sub('[' + re.escape(punctuation + whitespace) + ']', ' ', text)


def depunctuate(text):
    """Replace every Unicode punctuation character in ``text`` with a space.

    A character counts as punctuation when its ``unicodedata`` general
    category starts with ``'P'``.  All other characters are kept unchanged.
    """
    from unicodedata import category

    return ''.join(
        ' ' if category(c).startswith('P') else c
        for c in text
    )


# Disabled scratch code, kept as an inert string literal: it reads a local
# copy of UnicodeData.txt and prints the punctuation characters with their
# names (Python 2 syntax).
"""
unicode_data = open('/home/pat/amundo/misc/unicode/UnicodeData.txt')

punctuation = []
for line in unicode_data:
    line.split(';')
    if line.split(';')[2].startswith('P'):
        punctuation.append(unichr(int(line.split(';')[0],16)))

#punctuationRE = re.compile(
#print '['+'|'.join(punctuation)+']'

from unicodedata import name
for c in punctuation:
    try:
        n = name(c)
        print c, n
    except:
        continue
"""