#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Collect word, bigram and sentence statistics from a set of texts.

When executed, the module builds a Corpus from nl1.txt .. nl5.txt and prints
its bigrams and a sample of its sentences.
"""

import codecs
import re
import string
import sys
from collections import defaultdict

from openanything import openAnything

# On Python 2 stdout is byte-oriented, so route it through a UTF-8 writer.
# On Python 3 stdout is already a text stream and this wrapper would hand it
# bytes, so it is only installed on Python 2.
if sys.version_info[0] < 3:
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# learn dutch.

# Character class of every string.punctuation character. re.escape keeps
# "\", "]", "^" and "-" literal; unescaped, the "\]" sequence would hide the
# backslash from the class.
_PUNCTUATION_RE = re.compile('[' + re.escape(string.punctuation) + ']')

# A single space that follows a terminator (? ! .) which itself follows a
# non-space character. The lookbehind leaves the sentence text and its
# terminator untouched; only the space is consumed by the split.
_SENTENCE_BREAK_RE = re.compile(r'(?<=[^ ][?!.]) ')


def ngrams(text, n):
    """Return every window of n consecutive items of text as a tuple.

    text may be any sliceable sequence (a string or a list of words).
    The result has len(text) - n + 1 entries, and is empty when text is
    shorter than n.
    """
    return [tuple(text[i:i + n]) for i in range(len(text) - n + 1)]


def frequency(seq):
    """Count the items of seq.

    Returns a defaultdict(int) mapping each distinct item to the number of
    times it occurs; looking up an unseen item yields 0.
    """
    counts = defaultdict(int)
    for elem in seq:
        counts[elem] += 1
    return counts


def depunctuate(text):
    """Return text with every string.punctuation character replaced by a space."""
    return _PUNCTUATION_RE.sub(' ', text)


def sentence_split(text):
    """Split text into sentences, each keeping its terminating punctuation.

    A break is a single space preceded by one of "?", "!" or "." that in
    turn follows a non-space character.
    """
    # @@TODO handle non-spacing: a terminator followed by a newline or by the
    # end of the text does not produce a break.
    return _SENTENCE_BREAK_RE.split(text)


def tokenize(text):
    """Lower-case text, strip punctuation and split it on whitespace."""
    return depunctuate(text.lower()).split()


class Document:
    """
    A Document is a text with statistical information attached.
    """

    def __init__(self, text):
        """Compute all statistics for text eagerly."""
        # Raw text as given.
        self.content = text
        # Pairs of adjacent characters of the raw text.
        self.character_bigrams = self.bigrams(self.content)
        # Lower-cased, punctuation-free tokens.
        self.words = tokenize(self.content)
        # Pairs of adjacent tokens.
        self.word_bigrams = self.bigrams(self.words)
        # Occurrence counts for tokens and for token pairs.
        self.word_frequency = frequency(self.words)
        self.word_bigram_frequency = frequency(self.word_bigrams)
        # One Sentence object per piece returned by sentence_split.
        self.sentences = [Sentence(sent)
                          for sent in sentence_split(self.content)]

    def bigrams(self, text):
        """
        Build a bigram model of text
        """
        return ngrams(text, 2)


class Sentence:
    """A single sentence together with its tokens."""

    def __init__(self, sentence):
        # Sentence text as produced by sentence_split.
        self.sentence = sentence
        # Tokenise into words; splitting an already-split sentence into
        # sentences again would almost always give a single item.
        self.words = tokenize(sentence)
        self.word_count = len(self.words)
        # @@TODO handle non-spacing

    # Disabled in the original source and kept disabled here:
    #def __str__(self):
    #    return self.sentence


class Corpus:
    """A named collection of Documents with merged bigram/sentence lists."""

    def __init__(self, resources, name="corpus"):
        """Read every resource and build one Document per resource.

        resources -- iterable of sources, each passed to openAnything
        name      -- label for the corpus, stored as self.name
        """
        self.name = name
        # NOTE(review): the objects returned by openAnything are read but
        # never closed here; confirm whether they need an explicit close().
        self.texts = [openAnything(resource).read() for resource in resources]
        #self.texts = [text.decode('utf-8') for text in self.texts]
        self.documents = [Document(text) for text in self.texts]
        self.word_bigrams = self.merge_word_bigrams()
        self.sentences = self.merge_sentences()

    def merge_word_bigrams(self):
        """Return the word bigrams of all documents as one flat list."""
        word_bigrams = []
        for document in self.documents:
            word_bigrams.extend(document.word_bigrams)
        return word_bigrams

    def merge_sentences(self):
        """Return the Sentence objects of all documents as one flat list."""
        sentences = []
        for document in self.documents:
            sentences.extend(document.sentences)
        return sentences

    def bigram_sample(self, threshhold=1):
        """Return (bigram, count) pairs whose count is greater than threshhold.

        The parameter keeps its original spelling because callers pass it
        by keyword.
        """
        bigram_frequency = frequency(self.word_bigrams)
        return [(pair, fq) for pair, fq in bigram_frequency.items()
                if fq > threshhold]


# Demo run. Every print call takes exactly one argument so that it produces
# the same output whether parsed as a Python 2 print statement or as a
# Python 3 function call.
nl_files = ['nl' + str(i) + '.txt' for i in range(1, 6)]
nl = Corpus(nl_files, name='dutch')

for document in nl.documents:
    print(document.word_bigrams)
    for sentence in document.sentences:
        print(sentence)
        print('')

print(nl.word_bigrams)
print(nl.bigram_sample(threshhold=10))

for s in nl.sentences[9:20]:
    print('%s %s' % (type(s.sentence), s.sentence))

"""
for (a,b),fq in d.word_bigram_frequency.items():
    if fq > 1:
        print a, b, fq
"""