#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
General utilities for text.

Small helpers for reading text files as Unicode, splitting text into
n-grams and word tokens, and counting frequencies.
"""
import io
import os
import re
import sys
import unicodedata
from collections import Counter

try:
    from BeautifulSoup import UnicodeDammit
except ImportError:
    # BeautifulSoup 3 is only needed by uread(); keep the rest of the
    # module importable when it is not installed.
    UnicodeDammit = None

try:
    _STRING_TYPES = (basestring,)  # Python 2
except NameError:
    _STRING_TYPES = (str,)  # Python 3

# A token counts as a word when it consists solely of word characters.
_WORD_RE = re.compile(r'^\w+$', re.UNICODE)


def uread(fname):
    """
    Read the file `fname` and return its content as Unicode, letting
    BeautifulSoup's UnicodeDammit guess the encoding.

    Raises ImportError if BeautifulSoup is not installed.
    """
    if UnicodeDammit is None:
        raise ImportError("uread() requires the BeautifulSoup package")
    with open(fname, 'rb') as handle:
        raw = handle.read()
    # The original opened the file in 'U' (universal newlines) mode, which
    # no longer exists in recent Python versions; apply the same newline
    # translation to the raw bytes by hand.
    raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
    return UnicodeDammit(raw).unicode


def ufile(filename):
    """
    Return the content of the UTF-8 encoded file `filename` as Unicode,
    with newlines normalised and surrounding whitespace stripped.

    Raises UnicodeDecodeError if the file is not valid UTF-8.
    """
    # io.open decodes and translates newlines identically on Python 2 and 3,
    # and the `with` block closes the handle (the original leaked it).
    with io.open(filename, encoding='utf-8') as handle:
        return handle.read().strip()


def sentences(s):
    """
    o fsm, forgive me.

    Naively split `s` into sentences at every period.

    >>> sentences("One. Two.")
    ['One', ' Two', '']
    """
    # The original split on the two-character string backslash-dot, which
    # practically never occurs in text; a plain '.' is what was meant.
    return s.split('.')


def ngrams(s, n):
    """
    Return every run of `n` consecutive items of `s`, joined into a string.

    >>> ngrams('abcddlagk', 3)
    ['abc', 'bcd', 'cdd', 'ddl', 'dla', 'lag', 'agk']
    """
    return [''.join(s[i:i + n]) for i in range(len(s) - n + 1)]


def bigrams(s):
    """Return the 2-grams of `s` (see ngrams)."""
    return ngrams(s, 2)


def trigrams(s):
    """Return the 3-grams of `s` (see ngrams)."""
    return ngrams(s, 3)


def depunctuate(textstr, replacement=" "):
    """
    Replace every character whose Unicode category begins with 'P'
    (punctuation) by `replacement`.

    Each punctuation character is replaced individually, so punctuation
    followed by a space leaves two spaces behind.

    >>> depunctuate("oh")
    'oh'
    >>> depunctuate("oh?")
    'oh '
    >>> depunctuate("oh. yeah?")
    'oh  yeah '
    >>> depunctuate("It's... no way!")
    'It s    no way '
    >>> depunctuate("oh. yeah?", "")
    'oh yeah'
    """
    if not textstr:
        return textstr
    category = unicodedata.category
    # Single pass over the text; the original ignored `replacement` and
    # re-scanned the whole string once per character.
    return ''.join(
        replacement if category(char).startswith('P') else char
        for char in textstr
    )


def letter2cat(textstr):
    """
    Map each distinct character of `textstr` to its Unicode category.

    >>> sorted(letter2cat("a1").items())
    [('1', 'Nd'), ('a', 'Ll')]
    """
    return {char: unicodedata.category(char) for char in alphabet(textstr)}


def tokenize(textstr):
    """
    Simple word tokenizer for a string 'textstr'

    Punctuation is removed first (see depunctuate), the text is split on
    any whitespace, and only tokens made entirely of word characters are
    kept.

    >>> tokenize('O that this too too sullied flesh would melt!')
    ['O', 'that', 'this', 'too', 'too', 'sullied', 'flesh', 'would', 'melt']
    >>> tokenize('one\\ntwo three')
    ['one', 'two', 'three']
    """
    # str.split() with no argument splits on every kind of whitespace.  The
    # original split only on space and tab, so words separated by a newline
    # were glued together and then silently discarded by the word filter.
    candidates = depunctuate(textstr).split()
    return [word for word in candidates if _WORD_RE.match(word)]


def alphabet(textstr):
    """
    returns a sorted list of all the letters in text

    >>> alphabet("returns a sorted list of all the letters in text")
    [' ', 'a', 'd', 'e', 'f', 'h', 'i', 'l', 'n', 'o', 'r', 's', 't', 'u', 'x']
    """
    return sorted(set(textstr))


def letters(textstr):
    """Return the characters of `textstr` as a list."""
    return list(textstr)


def wfreq(wordlist):
    """Return a dictionary counting the occurrences of each word in `wordlist`."""
    return freq(wordlist)


def windex(wordlist):
    """
    Return the set of distinct words.

    Can take a string (which is tokenized), a list of words or a count
    dictionary (whose keys are the words).

    >>> sorted(windex("to be or not to be"))
    ['be', 'not', 'or', 'to']
    >>> sorted(windex(['b', 'a', 'b']))
    ['a', 'b']
    """
    if isinstance(wordlist, _STRING_TYPES):
        return set(tokenize(wordlist))
    # Lists and dictionaries already hold words; tokenizing them would fail.
    return set(wordlist)


def get_content():
    """
    Return everything read through `fileinput` (the files named on the
    command line, or standard input) as one string.
    """
    import fileinput
    content = []
    for line in fileinput.input():
        # Python 2 yields byte strings that need decoding; Python 3 already
        # yields text, which has no decode() method.
        if isinstance(line, bytes):
            line = line.decode('utf-8', 'ignore')
        content.append(line)
    return ''.join(content)


def freq(seq):
    """
    Return a plain dictionary mapping each element of `seq` to the number
    of times it occurs.

    >>> sorted(freq('aab').items())
    [('a', 2), ('b', 1)]
    """
    return dict(Counter(seq))


def wf(seq):
    """ return word frequency dictionary """
    return freq(tokenize(seq))


def uniq(seq):
    """Return the distinct elements of `seq` as a list, in arbitrary order."""
    return list(set(seq))


def top(n, d):
    """
    return all keys of dictionary d which have one of the top n values
    of all values in d

    >>> d = {'ftw': 3, 'yo': 2, 'gnx': 3, 'nd': 2, 'rhug': 4, 'n': 1,
    ...      's': 1, 'y': 1, 'gq': 2, 'jfcx': 4}
    >>> sorted(top(2, d))
    ['ftw', 'gnx', 'jfcx', 'rhug']

    Handles ties:

    >>> points = {'abe': 1, 'ben': 2, 'bob': 2, 'chris': 3, 'dave': 4,
    ...           'evan': 5}
    >>> sorted(top(3, points))
    ['chris', 'dave', 'evan']
    >>> sorted(top(4, points))
    ['ben', 'bob', 'chris', 'dave', 'evan']

    Handing it a list assumes you want the top most frequent elements:

    >>> sorted(top(3, [1, 2, 3, 4, 5]))  # all have freq 1
    [1, 2, 3, 4, 5]
    >>> sorted(top(2, ['once', 'twice', 'twice', 'thrice', 'thrice', 'thrice']))
    ['thrice', 'twice']

    Asking for no values at all yields nothing:

    >>> top(0, points)
    []
    """
    counts = d if hasattr(d, 'values') else freq(d)
    if n <= 0:
        # Slicing with [-0:] would select every value instead of none.
        return []
    best = set(sorted(set(counts.values()))[-n:])
    return [key for key in counts if counts[key] in best]


def unicodify(resource):
    """
    tries to return a Unicode object containing the content of any uri.

    The encoding is guessed with chardet; when chardet cannot name one,
    the content is decoded as UTF-8 with undecodable bytes replaced.
    """
    from chardet import detect
    from openAnything import openAnything
    raw = openAnything(resource).read()
    encoding = detect(raw)['encoding']
    if encoding is None:
        # decode(None) would raise a TypeError; fall back to a lenient guess.
        return raw.decode('utf-8', 'replace')
    return raw.decode(encoding)


def _test():
    """Run the doctests embedded in this module."""
    import doctest
    doctest.testmod()


if __name__ == "__main__":
    _test()