from glob import glob
glob('*deu*')
f = glob('*deu*')[0]
deu = open(f).read().decode('utf-8')
from collectoins import defaultdict
from collections import defaultdict
def freq(seq):
    """Count how many times each element occurs in ``seq``.

    Args:
        seq: Any iterable whose elements are hashable.

    Returns:
        A ``defaultdict(int)`` mapping each distinct element to its number
        of occurrences. Because it is a defaultdict, looking up an element
        that never occurred yields 0 (and inserts that key).
    """
    counts = defaultdict(int)
    for element in seq:
        counts[element] += 1
    return counts
def ngrams(seq, n):
    """Return every contiguous run of ``n`` items in ``seq``.

    Args:
        seq: A sliceable sequence with a length (list, string, ...).
        n: Length of each run.

    Returns:
        A list of slices of ``seq``, in order of starting position. Each
        slice has the same type as ``seq``. The list is empty when ``n`` is
        greater than ``len(seq)``.
    """
    # There are len(seq) - n + 1 starting positions for a window of size n.
    return [seq[i:i + n] for i in range(len(seq) - n + 1)]
def bigrams(seq):
    """Return the n-grams of ``seq`` for n = 2, as produced by ``ngrams``."""
    pair_size = 2
    return ngrams(seq, pair_size)
deu.split()
words = deu.split()
bigrams(words)
def ngrams(seq, n):
    """Return every contiguous run of ``n`` items in ``seq`` as a tuple.

    Args:
        seq: A sliceable sequence with a length (list, string, ...).
        n: Length of each run.

    Returns:
        A list of tuples, in order of starting position. Tuples are
        hashable, so the result can be used as dictionary keys.
    """
    windows = []
    window_count = len(seq) - n + 1
    for start in range(window_count):
        windows.append(tuple(seq[start:start + n]))
    return windows
def bigrams(seq):
    """Return the n-grams of ``seq`` for n = 2, as produced by ``ngrams``."""
    pair_size = 2
    return ngrams(seq, pair_size)
bigrams(words)
freq(bigrams(words))
len(freq(bigrams(words)))
len(words)
byfreq = sorted(freq(words).items(), key=itemgetter(1))
from operator import itemgetter
byfreq = sorted(freq(words).items(), key=itemgetter(1))
byfreq
wordsbyfreq = byfreq
del byfreq
wordsbyfreq
bgbyfreq = sorted(freq(bigrams(words)), key=itemgetter(1))
bgbyfreq
bgbyfreq = sorted(freq(bigrams(words)).items(), key=itemgetter(1))
bgbyfreq
len(wordsbyfreq)
wordsbyfreq
[b for a,b in wordsbyfreq]
[w for w,f in wordsbyfreq if f>10]
topwords = [w for w,f in wordsbyfreq if f>10]
bg for bg in bgbyfreq
[bg for bg in bgbyfreq]
[bg for ((a,b),bg) in bgbyfreq]
[a,b for ((a,b),f) in bgbyfreq]
[(a,b) for ((a,b),f) in bgbyfreq]
[(a,b) for ((a,b),f) in bgbyfreq if a in topwords and b in topwords]
'der die' in deu
deu.index('der die')
deu[deu.index('der die')-100:deu.index('der die')+100
]
print deu[deu.index('der die')-100:deu.index('der die')+100]
[(f,a,b) for ((a,b),f) in bgbyfreq if a in topwords and b in topwords]
for x,y,z in [(f,a,b) for ((a,b),f) in bgbyfreq if a in topwords and b in topwords]: print x,y,z
from urllib import urlretrieve
help(urlretrieve)
urlretrieve('http://www.gutenberg.org/files/18232/18232-0.txt', filename="hofmann-de.txt')
urlretrieve('http://www.gutenberg.org/files/18232/18232-0.txt', filename="hofmann-de.txt")
deu = open('hofmann-de.txt').read().decode('utf-8')
len(deu)
readline.write_history_file('cliqueish.txt')
open('23228-8.txt').read().decode('utf-8')
open('23228-8.txt').read().decode('latin-1')
text = open('23228-8.txt').read().decode('latin-1')
open('arabische-de.txt','w').write(text.encode('utf-8'))
open('all').read().decode('utf-8')
from BeautifulSoup import UnicodeDammit
text = UnicodeDammit(open('all').read()).unicode
open("fixed",'w').write(text.encode('utf-8'))
from BeautifulSoup import UnicodeDammit
text = UnicodeDammit(open('all').read()).unicode
open("fixed",'w').write(text))
open("fixed",'w').write(text)
print text
from chardet import detect
detect(text)
import chardet
from urllib import urlopen
hi = urlopen('http://www.cfilt.iitb.ac.in/commonwords/2000-hin.aci').read()
chardet.detect(hi)
hi
print hi
chardet.detect(hi)
len(hi)
print u"\ue000"
print u"\u2000"
c = u"\u2000"
from unicodedata import name
print c, name(c)
c = u"\u2005"
print c, name(c)
c = u"\u203b"
print c, name(c)
unichr(u"\u2005")
unichr(2005)
unichr(int(2005,16))
help(unichr)
hex(2005)
unichr(hex(2005))
unichr(int(hex(2005)))
help(range)
range(0x1,0x20)
import binascii
help(binascii)
binascii.hexlify('0x20')
import os
os.environ.get('FOO')
import sys

# Absolute path of the directory that holds the running script.
script_dir = os.path.abspath(os.path.dirname(sys.argv[0]))
# os.path.split() only returns a (head, tail) pair, which cannot be searched
# component by component and cannot be concatenated with a list. Splitting on
# the separator gives every path component as a list instead.
full_path = script_dir.split(os.sep)
# Keep everything up to and including the 'admin' component, then add 'lib'.
# list.index() raises ValueError when no component is named 'admin'.
lib_dir = os.sep.join(full_path[:full_path.index('admin') + 1] + ['lib'])
sys.path.append(lib_dir)
from text import *
dir()
def udhr(code):
    """Read the file ``udhr_<code>.txt`` and return its text decoded as UTF-8.

    Args:
        code: String that completes the file name ``udhr_<code>.txt``.

    Returns:
        The whole file content, decoded from UTF-8.

    Raises:
        IOError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # NOTE(review): a later redefinition in this file reads from
    # /home/pat/repo/udhr/ instead -- confirm which directory is current.
    path = '/home/pat/udhr/udhr_' + code + '.txt'
    # Read raw bytes and decode explicitly; the with-block closes the handle
    # instead of leaving it to the garbage collector.
    with open(path, 'rb') as handle:
        return handle.read().decode('utf-8')
por = udhr('por')
def udhr(code):
    """Read the file ``udhr_<code>.txt`` and return its text decoded as UTF-8.

    Args:
        code: String that completes the file name ``udhr_<code>.txt``.

    Returns:
        The whole file content, decoded from UTF-8.

    Raises:
        IOError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    path = '/home/pat/repo/udhr/udhr_' + code + '.txt'
    # Read raw bytes and decode explicitly; the with-block closes the handle
    # instead of leaving it to the garbage collector.
    with open(path, 'rb') as handle:
        return handle.read().decode('utf-8')
por = udhr('por')
porwords = tokenize(por)
porbigrams = tuplize(bigrams(porwords))
porbigramsbyfreq = byfreq(porbigrams)
for a,b in porbigramsbyfreq: print a,b
moby = open('/home/pat/repo/identify/samples/mobydick.txt').read().decode('utf-8')
words = tokenize(moby)
mbigrams = bigrams(words)
mbigrams = tuplize(bigrams(words))
len(mbigrams)
topbigrams = byfreq(mbigrams)
for a,b in topbigrams: print a,b
for a,b in topbigrams[-10:]: print a,b
for a,b in topbigrams[:10]: print a,b
topbigrams
mbigrams[:10]
freq(mbigrams[:10])
byfreq(mbigrams[:10])
def byfreq(seq):
    """Return the distinct elements of ``seq`` with their counts, rarest first.

    Args:
        seq: Iterable of hashable elements, passed straight to ``freq``.

    Returns:
        A list of ``(element, count)`` pairs sorted by ascending count, so
        the most frequent elements are at the end of the list.
    """
    counts = freq(seq).items()
    # itemgetter(1) sorts on the count, the second field of each pair.
    return sorted(counts, key=itemgetter(1))
byfreq(mbigrams[:10])
topbigrams = byfreq(mbigrams)
for a,b in topbigrams[:10]: print a,b
for a,b in topbigrams[-10:]: print a,b
for (a,b),f in topbigrams[-100:]: print a,b
for (a,b),f in topbigrams[-100:]: print f, a,b
for (a,b),f in topbigrams[-10:]: print f, a,b
for (a,b),f in topbigrams[-15:]: print f, a,b
readline.write_history_file('moby.py')
from BeautifulSoup import BeautifulStoneSoup as Soup
from urllib import urlopen
xml = urlopen('http://news.google.com/news?ned=hi_in&hl=hi&ned=hi_in&q=%E0%A4%AD%E0%A4%BE%E0%A4%B0%E0%A4%A4&ie=UTF-8&output=atom').read()
Soup(xml)
soup = Soup(xml)
import scipy
help(scipy.io.read_array)
dir(scipy)
a1 = scipy.array([1,2,3,4])
a2 = scipy.array([4,3,2,1])
print a1
print a2
print a2 * a1
a3 = a2 * a1
print a3
a1 = scipy.zeros((4,5))
a1
print a1
print a2
a2 = scipy.empty((4,5))
print a2
a3 = scipy.zeros((4,5), dtype="f")
a3
print a3
scipy.io
open('bo2en.txt').read().decode('utf-8')
print open('bo2en.txt').read().decode('utf-8')
print open('bo2en.txt').read().decode('utf-8').splitlines()
open('bo2en.txt').read().decode('utf-8').splitlines()
open('bo2en.txt').read().decode('utf-8').splitlines()
enbo = open('bo2en.txt').read().decode('utf-8').splitlines()
enbo = [line.strip().split('\t') for line in enbo]
for b,e in enbo:
print b
print e
print
[len(l) for l in enbo]
print enbo[0]
print enbo[0][0]
boen = open('bo2en.txt').read().decode('utf-8').splitlines()
boen
print boen[0]
boen = open('bo2en.txt').read().decode('utf-8')
print boen[0]
len(boen.splitlines())
boen[-1]
print boen[-1]
print boen.splitlines()[-1]
print boen.splitlines()[-1].split('\t')
boen = [line.strip() for line in open('bo2en.txt').read().decode('utf-8')]
[len(line.split('\t')) for line in boen]
set([len(line.split('\t')) for line in boen])
boen[0]
type(boen)
boen
boen = [line.strip() for line in open('bo2en.txt').read().decode('utf-8').splitlines()]
boen
set([len(line.split('\t')) for line in boen])
[boen.index(line) for line in boen if len(line.split('\t') == 1]
[boen.index(line) for line in boen if len(line.split('\t')) == 1]
boen[0]
printboen[0]
print boen[0]
print boen[1]
boen = [line.strip() for line in open('bo2en.txt').read().decode('utf-8').splitlines()]
set([len(line.split('\t')) for line in boen])
boen = [line.strip() for line in open('bo2en.txt').read().decode('utf-8').splitlines()]
set([len(line.split('\t')) for line in boen])
try:
line.split('\t')
except:
print line
try:
line.split('\t')
except:
x = line
for line in boen:
try:
psas
for line in boen:
if '\t' not in line: print line
boen = [line.strip() for line in open('bo2en.txt').read().decode('utf-8').splitlines()]
for line in boen:
if '\t' not in line: print line
import subprocess
subprocess.call('ls -l')
subprocess.call('ls -l', shell=True)
# Shell command lines to run, in this order.
MESSAGES = 'tail /var/log/messages'
SPACE = 'df -h'
cmds = [MESSAGES, SPACE]

# Initialised up front so `count` is still defined if `cmds` is ever empty.
count = 0
for count, cmd in enumerate(cmds, 1):
    # Single parenthesised argument: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("Running command #%s" % count)
    # shell=True because each command is one string; the strings are the
    # fixed constants above, never external input.
    subprocess.call(cmd, shell=True)
readline.write_history_file('doer.py')
a = set('abc')
a
b = set('bcd')
a & b
a or b
a, b
b or a
b || a
b | a
b - a
a -b
a | b == b | a
b | a
a | b
sorted(a | b )
a & b
import re

# Verbose-mode pattern for [[title]] and [[title|anchor]] links.
# Group 1 is the title; group 2 is the optional anchor text after the bar.
# It is a raw string so the regex backslashes reach the re module unchanged.
link = r"""\[\[ # [[
( # title
[^\]\|\:]+ # several of anything but ]|:
) # end title
\|? # bar?
( # anchor?
[^\]\[\|\:]+
)? # end anchor?
\]\] # ]] """

# Compile exactly once, from the pattern string, with re.VERBOSE so the
# whitespace and "# ..." comments inside the pattern are ignored. Passing
# flags together with an already-compiled pattern raises ValueError.
linkRE = re.compile(link, re.VERBOSE)
readline.write_history_file('link.py')
rennes = u"""
|date-sans=[[2005]]
|dens={{formatnum:4093}}
|aire urbaine= 600 746 hab. (est. 2008)
|nomhab=Rennais, Rennaises
|siteweb=[http://www.ville-rennes.fr/ Site de la ville de Rennes]
}}
[[Rennes]] est une commune [[France|française]], le chef-lieu du département d'[[Ille-et-Vilaine]] et de la région [[Bretagne (région administrative)|Bretagne]], ainsi que l'une des capitales historiques du [[duché de Bretagne]] se situe à l'est de la [[Bretagne]] sur les bords de [[La Vilaine]].
Ses [[gentilé|habitants]] sont appelés les '''Rennais''' et '''Rennaises'''.
Elle est appelée ''Resnn'' en [[gallo]] et ''Roazhon'' en [[breton]], Rennes vient des [[Redones]], nom du [[peuple gaulois]] peuplant cette partie d'Armorique au {{-s|II|e}}
En [[langue des signes française]], la ville se signe comme un [[renne]].
Rennes est la huitième [[ville universitaire]] française, après [[Paris]], [[Lyon]], [[Toulouse]], [[Lille]], [[Marseille]]/[[Aix-en-Provence]], [[Montpellier]] et [[Bordeaux]][[ftp://trf.education.gouv.fr/pub/edutel/dpd/atlas/atlas2006/atlas2006.pdf Site du gouvernement, Atlas 2006 de l'enseignement supérieur]] (et la septième si l'on considère que Aix et Marseille forment une seule ville).
"""
from wikialign_outline import *
extract_links(rennes)
extract_wikitext_links(rennes)
from wikialign_outline import *
extract_wikitext_links(rennes)
rennes = u"""
|date-sans=[[2005]]
|dens={{formatnum:4093}}
|aire urbaine= 600 746 hab. (est. 2008)
|nomhab=Rennais, Rennaises
|siteweb=[http://www.ville-rennes.fr/ Site de la ville de Rennes]
}}
[[Rennes]] est une commune [[France|française]], le chef-lieu du département d'[[Ille-et-Vilaine]] et de la région [[Bretagne (région administrative)|Bretagne]], ainsi que l'une des capitales historiques du [[duché de Bretagne]] se situe à l'est de la [[Bretagne]] sur les bords de [[La Vilaine]].
Ses [[gentilé|habitants]] sont appelés les '''Rennais''' et '''Rennaises'''.
Elle est appelée ''Resnn'' en [[gallo]] et ''Roazhon'' en [[breton]], Rennes vient des [[Redones]], nom du [[peuple gaulois]] peuplant cette partie d'Armorique au {{-s|II|e}}
En [[langue des signes française]], la ville se signe comme un [[renne]].
Rennes est la huitième [[ville universitaire]] française, après [[Paris]], [[Lyon]], [[Toulouse]], [[Lille]], [[Marseille]]/[[Aix-en-Provence]], [[Montpellier]] et [[Bordeaux]][[ftp://trf.education.gouv.fr/pub/edutel/dpd/atlas/atlas2006/atlas2006.pdf Site du gouvernement, Atlas 2006 de l'enseignement supérieur]] (et la septième si l'on considère que Aix et Marseille forment une seule ville).
"""
linkRE
extract_wikitext_links(rennes)
rennes = u"""
|date-sans=[[2005]]
|dens={{formatnum:4093}}
|aire urbaine= 600 746 hab. (est. 2008)
|nomhab=Rennais, Rennaises
|siteweb=[http://www.ville-rennes.fr/ Site de la ville de Rennes]
}}
[[Rennes]] est une commune [[France|française]], le chef-lieu du département d'[[Ille-et-Vilaine]] et de la région [[Bretagne (région administrative)|Bretagne]], ainsi que l'une des capitales historiques du [[duché de Bretagne]] se situe à l'est de la [[Bretagne]] sur les bords de [[La Vilaine]].
Ses [[gentilé|habitants]] sont appelés les '''Rennais''' et '''Rennaises'''.
Elle est appelée ''Resnn'' en [[gallo]] et ''Roazhon'' en [[breton]], Rennes vient des [[Redones]], nom du [[peuple gaulois]] peuplant cette partie d'Armorique au {{-s|II|e}}
En [[langue des signes française]], la ville se signe comme un [[renne]].
"""
from wikialign_outline import *
linkRE
from link import *
linkRE
linkRE.findall(rennes)
for x,y in linkRE.findall(rennes): print x,y
for x,y in linkRE.findall(rennes): print x,' --- ',y
dir()
peuple_galois = spider_article('fr', 'Peuple_gaulois')
peuple_galois
from wikialign import download_article
peuple_galois = download_article('fr', 'Peuple_gaulois')
peuple_galois
linkRE.findall(peuple_galois)
peuple_galois
linkRE.findall(peuple_galois, re.VERBOSE)
peuples_galois = download_article('fr', 'Peuples_gaulois')
peuples_galois
linkRE.findall(peuples_galois)
split_into_sentences(peuples_galois)
rennes_fr = download_article('fr', 'Rennes')
rennes_en = download_article('en', 'Rennes')
len(rennes_fr)
len(rennes_en)
split_into_sentences(rennes_fr)
fr = split_into_sentences(rennes_fr)
en = split_into_sentences(rennes_en)
for f in fr: f
for f in fr:
linkRE.findall(f)
for f in fr:
set(linkRE.findall(f))
set(linkRE.findall(f)), f
for f in fr:
set(linkRE.findall(f)), f
[(f, set(linkRE.findall(f)) for f in fr]
[(f, set(linkRE.findall(f))) for f in fr]
linksent [(f, set(linkRE.findall(f))) for f in fr]
linksent = [(f, set(linkRE.findall(f))) for f in fr]
linksent[0]
linksent[54]
[ls for ls in linksent if len(ls[1]) > 1 ]
linksent = [ls for ls in linksent if len(ls[1]) > 1 ]
len(linksent)
from random import choice
choice(linksent)
readline.write_history_file('whee.py')
import whee
whee.fr
[whee.links(e) for e in whee.en]
enset = [whee.links(e) for e in whee.en]
frset = [whee.links(f) for e in whee.fr]
frset = [whee.links(f) for f in whee.fr]
frset
from random import choice
choice(frset)
f = choice(frset)
for e in enset: if e.intersection(f):
for e in enset:
if e.intersection(f):
print e, f
a = set('abc')
b = set('bcd')
a.intersection(b)
if a.intersection(b): print 'hi'
f\
f
f = choice(frset)
frset
f
[x[0] for x in f]
[x for x in enset if 'Peu' in x]
[x for x in enset]
[x for x in enset if 'Peu' in x[0]]
[x for x in list(enset)]
[list(s) for x in enset]
[list(x) for x in enset]
for a in [list(x) for x in enset]: print a
q = [('a','b'), ('c')]
'b' in q
[y for y in q if 'b' in y]
[a for a in [list(x) for x in enset] if 'Paris' in a]
[a for a in enset]
[list(a) for a in enset]
[b for b in list(a) for a in enset]
[ list(a) for a in enset]
dir()
dir(whee)
whee.en
open('en.txt','w').write(en.encode('utf-8')
)
open('en.txt','w').write(whee.en.encode('utf-8'))
open('en.txt','w').write('\n'.join(whee.en).encode('utf-8'))
type(whee.en)
type(whee.en[0])
en = '.'.join(whee.en)
type(en)
type(en.decode('utf-8'))
uen = en.decode('utf-8')
fr = '.'.join(whee.fr)
ufr = fr.decode('utf-8')
open('en.txt','w').write('\n'.join(uen).encode('utf-8'))
open('fr.txt','w').write('\n'.join(ufr).encode('utf-8'))
dir()
dir(whee)
whee.e
whee.linked
whee.extract_links
whee.extract_links()
whee.extract_links('[[foo|bar]]')
whee.extract_wikitext_links('[[foo|bar]]')
whee.extract_wikitext_links('[[foo|bar]] [[baz]]')
whee.compare_sentences
en
whee.extract_wikitext_links(en)
whee.extract_wikitext_links(frr)
whee.extract_wikitext_links(fr)
[e[0] for e in whee.extract_wikitext_links(en)]
elinks = [e[0] for e in whee.extract_wikitext_links(en)]
flinks = [e[0] for e in whee.extract_wikitext_links(fr)]
set(elinks)
es = set(elinks)
fs = set(flinks)
es.intersection(fs)
shared = es.intersection(fs)
dir(whee)
for s in whee.en: s
for s in whee.en:
for e in es:
if s in e:
print s