from glob import glob glob('*deu*') f = glob('*deu*')[0] deu = open(f).read().decode('utf-8') from collectoins import defaultdict from collections import defaultdict def freq(seq): f = defaultdict(int) for e in seq: f[e] += 1 return f def ngrams(seq): return [seq[i:i+n] for i in range(len(seq)-n+1] def ngrams(seq): return [seq[i:i+n] for i in range(len(seq)-n+1)] def ngrams(seq,n): return [seq[i:i+n] for i in range(len(seq)-n+1)] def bigrams(seq): return ngrams(seq,2) deu.split() words = deu.split() bigrams(words) def ngrams(seq,n): return [tuple(seq[i:i+n]) for i in range(len(seq)-n+1)] def bigrams(seq): return ngrams(seq,2) bigrams(words) freq(bigrams(words)) len(freq(bigrams(words))) len(words) byfreq = sorted(freq(words).items(), key=itemgetter(1)) from operator import itemgetter byfreq = sorted(freq(words).items(), key=itemgetter(1)) byfreq wordsbyfreq = byfreq del byfreq wordsbyfreq bgbyfreq = sorted(freq(bigrams(words)), key=itemgetter(1)) bgbyfreq bgbyfreq = sorted(freq(bigrams(words)).items(), key=itemgetter(1)) bgbyfreq len(wordsbyfreq) wordsbyfreq [b for a,b in wordsbyfreq] [w for w,f in wordsbyfreq if f>10] topwords = [w for w,f in wordsbyfreq if f>10] bg for bg in bgbyfreq [bg for bg in bgbyfreq] [bg for ((a,b),bg) in bgbyfreq] [a,b for ((a,b),f) in bgbyfreq] [(a,b) for ((a,b),f) in bgbyfreq] [(a,b) for ((a,b),f) in bgbyfreq if a in topwords and b in topwords] 'der die' in deu deu.index('der die') deu[deu.index('der die')-100:deu.index('der die')+100 ] print deu[deu.index('der die')-100:deu.index('der die')+100] [(f,a,b) for ((a,b),f) in bgbyfreq if a in topwords and b in topwords] for x,y,z in [(f,a,b) for ((a,b),f) in bgbyfreq if a in topwords and b in topwords]: print x,y,z from urllib import urlretrieve help(urlretrieve) urlretrieve('http://www.gutenberg.org/files/18232/18232-0.txt', filename="hofmann-de.txt') urlretrieve('http://www.gutenberg.org/files/18232/18232-0.txt', filename="hofmann-de.txt") deu = open('hofmann-de.txt').read().decode('utf-8') len(deu) readline.write_history_file('cliqueish.txt') open('23228-8.txt').read().decode('utf-8') open('23228-8.txt').read().decode('latin-1') text = open('23228-8.txt').read().decode('latin-1') open('arabische-de.txt','w').write(text.encode('utf-8')) open('all').read().decode('utf-8') from BeautifulSoup import UnicodeDammit text = UnicodeDammit(open('all').read()).unicode open("fixed",'w').write(text.encode('utf-8')) from BeautifulSoup import UnicodeDammit text = UnicodeDammit(open('all').read()).unicode open("fixed",'w').write(text)) open("fixed",'w').write(text) print text from chardet import detect detect(text) import chardet from urllib import urlopen hi = urlopen('http://www.cfilt.iitb.ac.in/commonwords/2000-hin.aci').read() chardet.detect(hi) hi print hi chardet.detect(hi) len(hi) print u"\ue000" print u"\u2000" c = u"\u2000" from unicodedata import name print c, name(c) c = u"\u2005" print c, name(c) c = u"\u203b" print c, name(c) unichr(u"\u2005") unichr(2005) unichr(int(2005,16)) help(unichr) hex(2005) unichr(hex(2005)) unichr(int(hex(2005))) help(range) range(0x1,0x20) import binascii help(binascii) binascii.hexlify('0x20') import os os.environ.get('FOO') full_path = os.path.split(os.path.abspath(os.path.dirname(sys.argv[0]))) sys.path.append(os.path.join(*(full_path[:full_path.index('admin')+1] + ['lib'])) ) import sys sys.path.append(os.path.join(*(full_path[:full_path.index('admin')+1] + ['lib'])) ) full_path = os.path.split(os.path.abspath(os.path.dirname(sys.argv[0]))) sys.path.append(os.path.join(*(full_path[:full_path.index('admin')+1] + ['lib']))) from text import * dir() def udhr(code): return open('/home/pat/udhr/udhr_'+code+'.txt').read().decode('utf-8') por = udhr('por') def udhr(code): return open('/home/pat/repo/udhr/udhr_'+code+'.txt').read().decode('utf-8') por = udhr('por') porwords = tokenize(por) porbigrams = tuplize(bigrams(porwords)) porbigramsbyfreq = byfreq(porbigrams) for a,b in porbigramsbyfreq: print a,b moby = open('/home/pat/repo/identify/samples/mobydick.txt').read().decode('utf-8') words = tokenize(moby) mbigrams = bigrams(words) mbigrams = tuplize(bigrams(words)) len(mbigrams) topbigrams = byfreq(mbigrams) for a,b in topbigrams: print a,b for a,b in topbigrams[-10:]: print a,b for a,b in topbigrams[:10]: print a,b topbigrams mbigrams[:10] freq(mbigrams[:10]) byfreq(mbigrams[:10]) def byfreq(seq): fq = freq(seq).items() return sorted(fq, key=itemgetter(1)) byfreq(mbigrams[:10]) topbigrams = byfreq(mbigrams) for a,b in topbigrams[:10]: print a,b for a,b in topbigrams[-10:]: print a,b for (a,b),f in topbigrams[-100:]: print a,b for (a,b),f in topbigrams[-100:]: print f, a,b for (a,b),f in topbigrams[-10:]: print f, a,b for (a,b),f in topbigrams[-15:]: print f, a,b readline.write_history_file('moby.py') from BeautifulSoup import BeautifulStoneSoup as Soup from urllib import urlopen xml = urlopen('http://news.google.com/news?ned=hi_in&hl=hi&ned=hi_in&q=%E0%A4%AD%E0%A4%BE%E0%A4%B0%E0%A4%A4&ie=UTF-8&output=atom').read() Soup(xml) soup = Soup(xml) import scipy help(scipy.io.read_array) dir(scipy) a1 = scipy.array([1,2,3,4]) a2 = scipy.array([4,3,2,1]) print a1 print a2 print a2 * a1 a3 = a2 * a1 print a3 a1 = scipy.zeros((4,5)) a1 print a1 print a2 a2 = scipy.empty((4,5)) print a2 a3 = scipy.zeros((4,5), dtype="f") a3 print a3 scipy.io open('bo2en.txt').read().decode('utf-8') print open('bo2en.txt').read().decode('utf-8') print open('bo2en.txt').read().decode('utf-8').splitlines() open('bo2en.txt').read().decode('utf-8').splitlines() open('bo2en.txt').read().decode('utf-8').splitlines() enbo = open('bo2en.txt').read().decode('utf-8').splitlines() enbo = [line.strip().split('\t') for line in enbo] for b,e in enbo: print b print e print [len(l) for l in enbo] print enbo[0] print enbo[0][0] boen = open('bo2en.txt').read().decode('utf-8').splitlines() boen print boen[0] boen = open('bo2en.txt').read().decode('utf-8') print boen[0] len(boen.splitlines()) boen[-1] print boen[-1] print boen.splitlines()[-1] print boen.splitlines()[-1].split('\t') boen = [line.strip() for line in open('bo2en.txt').read().decode('utf-8')] [len(line.split('\t')) for line in boen] set([len(line.split('\t')) for line in boen]) boen[0] type(boen) boen boen = [line.strip() for line in open('bo2en.txt').read().decode('utf-8').splitlines()] boen set([len(line.split('\t')) for line in boen]) [boen.index(line) for line in boen if len(line.split('\t') == 1] [boen.index(line) for line in boen if len(line.split('\t')) == 1] boen[0] printboen[0] print boen[0] print boen[1] boen = [line.strip() for line in open('bo2en.txt').read().decode('utf-8').splitlines()] set([len(line.split('\t')) for line in boen]) boen = [line.strip() for line in open('bo2en.txt').read().decode('utf-8').splitlines()] set([len(line.split('\t')) for line in boen]) try: line.split('\t') except: print line try: line.split('\t') except: x = line for line in boen: try: psas for line in boen: if '\t' not in line: print line boen = [line.strip() for line in open('bo2en.txt').read().decode('utf-8').splitlines()] for line in boen: if '\t' not in line: print line import subprocess subprocess.call('ls -l') subprocess.call('ls -l', shell=True) MESSAGES = 'tail /var/log/messages' SPACE = 'df -h' cmds = [MESSAGES, SPACE] count = 0 for cmd in cmds: count += 1 print "Running command #%s" % count subprocess.call(cmd, shell=True) readline.write_history_file('doer.py') a = set('abc') a b = set('bcd') a & b a or b a, b b or a b || a b | a b - a a -b a | b == b | a b | a a | b sorted(a | b ) a & b linkRE = """\[\[ # [[ ( # title [^\]\|\:]+ # several of anything but ]|: ) # end title \|? # bar? ( # anchor? [^\]\[\|\:]+ )? # end anchor? \]\] # ]] """ link = linkRE import re linkRE = re.compile(linkRE) help(re) linkRE = re.compile(linkRE, re.VERBOSE) readline.write_history_file('link.py') rennes = u""" |date-sans=[[2005]] |dens={{formatnum:4093}} |aire urbaine= 600 746 hab. (est. 2008) |nomhab=Rennais, Rennaises |siteweb=[http://www.ville-rennes.fr/ Site de la ville de Rennes] }} [[Rennes]] est une commune [[France|française]], le chef-lieu du département d'[[Ille-et-Vilaine]] et de la région [[Bretagne (région administrative)|Bretagne]], ainsi que l'une des capitales historiques du [[duché de Bretagne]] se situe à l'est de la [[Bretagne]] sur les bords de [[La Vilaine]]. Ses [[gentilé|habitants]] sont appelés les '''Rennais''' et '''Rennaises'''. Elle est appelée ''Resnn'' en [[gallo]] et ''Roazhon'' en [[breton]], Rennes vient des [[Redones]], nom du [[peuple gaulois]] peuplant cette partie d'Armorique au {{-s|II|e}} En [[langue des signes française]], la ville se signe comme un [[renne]]. Rennes est la huitième [[ville universitaire]] française, après [[Paris]], [[Lyon]], [[Toulouse]], [[Lille]], [[Marseille]]/[[Aix-en-Provence]], [[Montpellier]] et [[Bordeaux]][ftp://trf.education.gouv.fr/pub/edutel/dpd/atlas/atlas2006/atlas2006.pdf Site du gouvernement, Atlas 2006 de l'enseignement supérieur] (et la septième si l'on considère que Aix et Marseille forment une seule ville). """ from wikialign_outline import * extract_links(rennes) extract_wikitext_links(rennes) from wikialign_outline import * extract_wikitext_links(rennes) rennes = u""" |date-sans=[[2005]] |dens={{formatnum:4093}} |aire urbaine= 600 746 hab. (est. 2008) |nomhab=Rennais, Rennaises |siteweb=[http://www.ville-rennes.fr/ Site de la ville de Rennes] }} [[Rennes]] est une commune [[France|française]], le chef-lieu du département d'[[Ille-et-Vilaine]] et de la région [[Bretagne (région administrative)|Bretagne]], ainsi que l'une des capitales historiques du [[duché de Bretagne]] se situe à l'est de la [[Bretagne]] sur les bords de [[La Vilaine]]. Ses [[gentilé|habitants]] sont appelés les '''Rennais''' et '''Rennaises'''. Elle est appelée ''Resnn'' en [[gallo]] et ''Roazhon'' en [[breton]], Rennes vient des [[Redones]], nom du [[peuple gaulois]] peuplant cette partie d'Armorique au {{-s|II|e}} En [[langue des signes française]], la ville se signe comme un [[renne]]. Rennes est la huitième [[ville universitaire]] française, après [[Paris]], [[Lyon]], [[Toulouse]], [[Lille]], [[Marseille]]/[[Aix-en-Provence]], [[Montpellier]] et [[Bordeaux]][ftp://trf.education.gouv.fr/pub/edutel/dpd/atlas/atlas2006/atlas2006.pdf Site du gouvernement, Atlas 2006 de l'enseignement supérieur] (et la septième si l'on considère que Aix et Marseille forment une seule ville). """ linkRE extract_wikitext_links(rennes) rennes = u""" |date-sans=[[2005]] |dens={{formatnum:4093}} |aire urbaine= 600 746 hab. (est. 2008) |nomhab=Rennais, Rennaises |siteweb=[http://www.ville-rennes.fr/ Site de la ville de Rennes] }} [[Rennes]] est une commune [[France|française]], le chef-lieu du département d'[[Ille-et-Vilaine]] et de la région [[Bretagne (région administrative)|Bretagne]], ainsi que l'une des capitales historiques du [[duché de Bretagne]] se situe à l'est de la [[Bretagne]] sur les bords de [[La Vilaine]]. Ses [[gentilé|habitants]] sont appelés les '''Rennais''' et '''Rennaises'''. Elle est appelée ''Resnn'' en [[gallo]] et ''Roazhon'' en [[breton]], Rennes vient des [[Redones]], nom du [[peuple gaulois]] peuplant cette partie d'Armorique au {{-s|II|e}} En [[langue des signes française]], la ville se signe comme un [[renne]]. """ from wikialign_outline import * linkRE from link import * linkRE linkRE.findall(rennes) for x,y in linkRE.findall(rennes): print x,y for x,y in linkRE.findall(rennes): print x,' --- ',y dir() peuple_galois = spider_article('fr', 'Peuple_gaulois') peuple_galois from wikialign import download_article peuple_galois = download_article('fr', 'Peuple_gaulois') peuple_galois linkRE.findall(peuple_galois) peuple_galois linkRE.findall(peuple_galois, re.VERBOSE) peuples_galois = download_article('fr', 'Peuples_gaulois') peuples_galois linkRE.findall(peuples_galois) split_into_sentences(peuples_galois) rennes_fr = download_article('fr', 'Rennes') rennes_en = download_article('en', 'Rennes') len(rennes_fr) len(rennes_en) split_into_sentences(rennes_fr) fr = split_into_sentences(rennes_fr) en = split_into_sentences(rennes_en) for f in fr: f for f in fr: linkRE.findall(f) for f in fr: set(linkRE.findall(f)) set(linkRE.findall(f)), f for f in fr: set(linkRE.findall(f)), f [(f, set(linkRE.findall(f)) for f in fr] [(f, set(linkRE.findall(f))) for f in fr] linksent [(f, set(linkRE.findall(f))) for f in fr] linksent = [(f, set(linkRE.findall(f))) for f in fr] linksent[0] linksent[54] [ls for ls in linksent if len(ls[1]) > 1 ] linksent = [ls for ls in linksent if len(ls[1]) > 1 ] len(linksent) from random import choice choice(linksent) readline.write_history_file('whee.py') import whee whee.fr [whee.links(e) for e in whee.en] enset = [whee.links(e) for e in whee.en] frset = [whee.links(f) for e in whee.fr] frset = [whee.links(f) for f in whee.fr] frset from random import choice choice(frset) f = choice(frset) for e in enset: if e.intersection(f): for e in enset: if e.intersection(f): print e, f a = set('abc') b = set('bcd') a.intersection(b) if a.intersection(b): print 'hi' f\ f f = choice(frset) frset f [x[0] for x in f] [x for x in enset if 'Peu' in x] [x for x in enset] [x for x in enset if 'Peu' in x[0]] [x for x in list(enset)] [list(s) for x in enset] [list(x) for x in enset] for a in [list(x) for x in enset]: print a q = [('a','b'), ('c')] 'b' in q [y for y in q if 'b' in y] [a for a in [list(x) for x in enset] if 'Paris' in a] [a for a in enset] [list(a) for a in enset] [b for b in list(a) for a in enset] [ list(a) for a in enset] dir() dir(whee) whee.en open('en.txt','w').write(en.encode('utf-8') ) open('en.txt','w').write(whee.en.encode('utf-8')) open('en.txt','w').write('\n'.join(whee.en).encode('utf-8')) type(whee.en) type(whee.en[0]) en = '.'.join(whee.en) type(en) type(en.decode('utf-8')) uen = en.decode('utf-8') fr = '.'.join(whee.fr) ufr = fr.decode('utf-8') open('en.txt','w').write('\n'.join(uen).encode('utf-8')) open('fr.txt','w').write('\n'.join(ufr).encode('utf-8')) dir() dir(whee) whee.e whee.linked whee.extract_links whee.extract_links() whee.extract_links('[[foo|bar]]') whee.extract_wikitext_links('[[foo|bar]]') whee.extract_wikitext_links('[[foo|bar]] [[baz]]') whee.compare_sentences en whee.extract_wikitext_links(en) whee.extract_wikitext_links(frr) whee.extract_wikitext_links(fr) [e[0] for e in whee.extract_wikitext_links(en)] elinks = [e[0] for e in whee.extract_wikitext_links(en)] flinks = [e[0] for e in whee.extract_wikitext_links(fr)] set(elinks) es = set(elinks) fs = set(flinks) es.intersection(fs) shared = es.intersection(fs) dir(whee) for s in whee.en: s for s in whee.en: for e in es: if s in e: print s