[len(r) for r in data] max([len(r) for r in data]) min([len(r) for r in data]) data[0] [max(r) for r in data] max([max(r) for r in data]) [r for r in data if max(r) > 1000] for y in [r for r in data if max(r) > 1000] : print y print from collections import defaultdict def freq(seq): d = defaultdict(int) for e in seq: d[e] += 1 return d freq(data[0]) len(set(data[0])) len((data[0])) data = urllib.urlopen('http://www.cs.cmu.edu/~nasmith/LS2.F06/a1.train.dat').readlines() import urllib data = urllib.urlopen('http://www.cs.cmu.edu/~nasmith/LS2.F06/a1.train.dat').readlines() data = [line.strip() for line in data] from collections import defaultdict def freq(seq): d = defaultdict(int) for e in seq: d[e] += 1 return d for line in data: pass from operator import itemgetter for line in data: print sorted(freq(list(line)), key=itemgetter(2)) for line in data: genes = """> GCGUUGGGC > GAGAAGGUC > GGGGGGGCC > GUGCCGUAC """ genes = genes.replace('> ','') genes genes.splitlines() genes = genes.splitlines() genes = [lines(gene) for gene in genes] genes = [list(gene) for gene in genes] genes for g in genes: print g for g in genes: print ''.join(g) from collections import defaultdict def freq(seq): d = defaultdict(int) for e in seq: d[e] += 1 return d all = [] [all + x for x in genes] [all.extend(x) for x in genes] all set(all) from operator import itemgetter for k,v in sorted(freq(all), key=itemgetter(1)): print k,v for k,v in sorted(freq(all).items(), key=itemgetter(1)): print k,v [(k,v) for k,v in sorted(freq(all).items(), key=itemgetter(1))] model = [(k,v) for k,v in sorted(freq(all).items(), key=itemgetter(1))] model dict(model) model = dict(model) model.items() [g[0] for g in genes] [g[1] for g in genes] [g[i] for g in genes for i in range(len(set(all)))] from math import log log(2) help(log) log(2,2) def ln(n): return log(n,2) ln(2) ln(4) ln(7) lg = ln lg(7) del(ln) lg lg(2) ln freq moby = list(open('/home/pat/repo/identify/samples/mobydick.txt').read()) len(moby) model model = freq(moby) model['a'] def surprisal(event, model): pass len(moby) def surprisal(event, space): model = freq(list(space)) total = len(space) return model[event] / float(total) surprisal('a', moby) surprisal('q', moby) surprisal('x', moby) surprisal('>', moby) import BeautifulSoup Soup = BeautifulSoup.BeautifulSoup Soup(open('3col.html').read()) html = Soup(open('3col.html').read()) print html.prettify() out = open('3col.html','w') out.write( html.prettify()) out.close() from urllib import urlopen from elementtree import ElementTree from urllib import urlopen from elementtree import ElementTree from _elementtree import ElementTree ElementTree elementree = _elementree tree = ElementTree.parse(urllib.open('http://dut.proz.com/profile/633824').read())) tree = ElementTree.parse(urllib.open('http://dut.proz.com/profile/633824').read()) import urllib tree = ElementTree.parse(urllib.open('http://dut.proz.com/profile/633824').read()) tree = ElementTree.parse(urllib.urlopen('http://dut.proz.com/profile/633824').read()) help(ElementTree) tree = ElementTree.parse(urllib.urlopen('http://dut.proz.com/profile/633824')) tree =ElementTree.parse('633824') help(ElementTree.parse) help(ElementTree) import _elementtree help(_elementtree) from xml import etree etree.ElementTree('633824') etree.ElementTree.parse('633824') ot = urllib.urlopen('http://fdnet.com.au/bmhughes/otxml.zip') import sys; sys.path.append('/home/pat/repo/udhr') open('/home/pat/repo/udhr/_index.txt') from glob import glob; glob('/home/pat/repo/udhr/*index*') from glob import glob; glob('/home/pat/repo/udhr/*_*') from glob import glob; glob('/home/pat/repo/udhr/_*') from glob import glob; glob('/home/pat/repo/udhr/*index*') from glob import glob; glob('/home/pat/repo/udhr/*index*')[0] idx = from glob import glob; glob('/home/pat/repo/udhr/*index*')[0] idx = glob('/home/pat/repo/udhr/*index*')[0] open(idx).readlines() import BeautifulSoup BeautifulSoup(idx) BeautifulSoup.BeautifulSoup(idx) BeautifulSoup.BeautifulSoup(open(idx)) idx = BeautifulSoup.BeautifulSoup(open(idx)) print idx.prettify() "this is an english sentence" for index, word in enumerante("this is an english sentence".split()): print index, word for index, word in enumerate("this is an english sentence".split()): print index, word def wordindex(text): indexes = [] pass from urllib import urlopen ind = urlopen('http://www.unicode.org/udhr/d/udhr_ind.txt').read().decode('utf-8').split() len(ind) print ' '.join(ind[:100]) eng = urlopen('http://www.unicode.org/udhr/d/udhr_eng.txt').read().decode('utf-8').split() map(len, [eng, ind]) print ' '.join(eng[:100]) all = open('all').decode('utf-8') all = open('all').read().decode('utf-8') len(all) len(all.split('\n\n')) all = open('all').read().decode('utf-8') len(all.split('\n\n')) fixed = [para for para in all.split('\n\n') if '{' not in para and 'document' not in para] len(fixed) lines = open('all').read().decode('utf-8').splitlines() lines = open('indonesian.txt').read().decode('utf-8').splitlines() from collection import defaultdict from collections import defaultdict def freq(seq): d = defaultdict(int) for e in seq: d[e] += 1 return d model = freq(lines) for line in model: if "Dengan menyebut nama Allah Yang Maha Pemurah lagi Maha Penyayang." not in line and model[line] < 114: print line all[:10] dir() ind = open('all').decode('utf-8').splitlines() ind = open('indonesian.txt').decode('utf-8').splitlines() ind = open('indonesian.txt').read().decode('utf-8').splitlines() for line in model: if "Dengan menyebut nama Allah Yang Maha Pemurah lagi Maha Penyayang." not in line and model[line] < 114: out.write(line) import codecs out = codecs.open('indonesian2.txt',mode='w',encoding='utf-8') for line in model: if "Dengan menyebut nama Allah Yang Maha Pemurah lagi Maha Penyayang." not in line and model[line] < 114: out.write(line) out.close() import string en = open('english2.txt').read().decode('utf-8').lower().split() id = open('../id/indonesian2.txt').read().decode('utf-8').lower().split() map(len, [en,id]) en[:10] id[:10] def spread(text): from collections import defaultdict locs = defaultdict(list) for i, w in enumerate(text): locs[w].append(i) return locs enloc = spread(en) idloc = spread(id) enloc['what'] enloc['up'] for w in ['what', 'up']: print w, str(len(enloc[w])) for w in ['apa', 'kabar']: print w, str(len(idloc[w])) enloc['what'][:10] + enloc['what'][-10:] enloc['what'][:10] + '..' + enloc['what'][-10:] ' '.join(enloc['what'][:10]) + '..' + ' '.join(enloc['what'][-10:]) print ' '.join(enloc['what'][:10]) + '..' + ' '.join(enloc['what'][-10:]) print ' '.join(map(str(enloc['what'][:10])) + '..' + ' '.join(enloc['what'][-10:]) print ' '.join(map(str, enloc['what'][:10])) + '..' + ' '.join(map(str, enloc['what'][-10:])) def show(w, source): print ' '.join(map(str, source[w][:10])) + '..' + ' '.join(map(str, source[w][-10:])) show('what', en') show('what', 'en') show('what', enloc) def show(w, source): print w print ' '.join(map(str, source[w][:10])) + '..' + ' '.join(map(str, source[w][-10:])) show('what', enloc) show('apa', idloc) import random random.uniform random.uniform() random.uniform(0,10,10) random.uniform(10,10) random.normalvariate() random.normalvariate(10,10) random.randrange() random.randrange(10) sorted([random.randrange(10) for i in range(10)]) sorted(set([random.randrange(10) for i in range(10)])) sorted(set([random.randrange(1000) for i in range(10)])) nums = sorted(set([random.randrange(1000) for i in range(10)])) print nums [(n/max(nums)) for n in nums] [(n/float(max(nums))) for n in nums] [(n/float(max(nums))*100) for n in nums] [(int(n/float(max(nums))*100)) for n in nums] nums2 = [(int(n/float(max(nums))*100)) for n in nums] floor import math math.floor from math import floor floor(87) floor(87,10) help(floor) 87 % 10 87 - 87 % 10 round round(87) round(87,10) help(round) round(87,2) round(87,1) round(87,0) freq from collections import defaultdict def freq(seq): d = defaultdict(int) for e in seq: d[e] += 1 returnd def freq(seq): d = defaultdict(int) for e in seq: d[e] += 1 return d nums2 freq(nums2) def flatten(n): return n - n % 10 freq(map(flatten, nums2)) freq(map(flatten, nums2+nums2)) for k,v in sorted(freq(map(flatten, nums2+nums2)): for k,v in sorted(freq(map(flatten, nums2+nums2))): k,v sorted(freq(map(flatten, nums2+nums2))): sorted(freq(map(flatten, nums2+nums2))) sorted(freq(map(flatten, nums2+nums2)).items()) id len(id) from math import sqrt sqrt(len(id)) pow pow(len(id), .5) sqrt(len(id)) == pow(len(id), .5) pow(len(en), .5) en enloc['what'] len(enloc['what'])