"""Runnable distillation of an interactive scratch session.

The original text was a shell history pasted twice, with the dead ends left
in (syntax errors, undefined names, a simulation that shuffled only once).
Each experiment is kept here once, in the form the session was converging
on, as a small function.  Nothing touches the network, the file system or a
third-party package at import time; ``main()`` replays the session.

Third-party modules the session relied on (BeautifulSoup 3, networkx, pylab,
feedfinder, feedparser) are imported inside the functions that need them.
"""

import os
import random
import sys
from sqlite3 import dbapi2 as sqlite

try:
    from urllib import urlopen  # Python 2, which the session was typed in
except ImportError:
    from urllib.request import urlopen

REGISTRY_URL = 'http://www.iana.org/assignments/language-subtag-registry'

# Registry records containing any of these strings are skipped by
# plain_languages(); the list is the chain of "not in" tests the session
# built up one condition at a time.
_SKIPPED_MARKERS = ('Suppress-Script', 'redundant', 'grandfathered',
                    'variant', 'region')

# One 52-card deck with two wild cards, and five such decks stacked together.
deck = ["notwild" for i in range(52 - 2)] + ["wild", "wild"]
decks = deck * 5

# Registry records searched by lookup() when no list is passed; main() fills
# it in place from load_registry().
repo = []


def dashed_lines(lines):
    """Return the lines that contain a '-' character, in their input order."""
    return [line for line in lines if '-' in line]


def sum_leading_digits(lines):
    """Add up the first character, read as an integer, of each line.

    Lines of length 0 or 1 (for example a bare newline) are skipped.  A
    longer line whose first character is not a digit raises ValueError.
    """
    return sum(int(line[0]) for line in lines if len(line) > 1)


def trials(n, cards=None, hand_size=10, rng=None):
    """Count how many of ``n`` shuffles put a "wild" card in the top hand.

    n         -- number of independent shuffles to run.
    cards     -- the pile to shuffle; defaults to the module-level ``decks``.
    hand_size -- how many cards from the top are inspected (default 10).
    rng       -- optional object with a ``shuffle`` method, e.g. a seeded
                 ``random.Random``, for repeatable runs.

    Returns an integer between 0 and ``n``.
    """
    pile = list(decks if cards is None else cards)  # leave the caller's list alone
    shuffle = random.shuffle if rng is None else rng.shuffle
    hits = 0
    for _ in range(n):
        # Reshuffle for every trial.  The session's last version shuffled
        # once before the loop, so every trial inspected the same hand and
        # the result could only be 0 or n.
        shuffle(pile)
        if "wild" in pile[:hand_size]:
            hits += 1
    return hits


def read_text(url):
    """Fetch ``url`` and return its body as text (bytes decoded as UTF-8)."""
    response = urlopen(url)
    try:
        data = response.read()
    finally:
        response.close()
    if not isinstance(data, str):
        data = data.decode('utf-8')
    return data


def split_registry(text):
    """Split registry text on '%%' and drop the chunk before the first '%%'."""
    return text.split('%%')[1:]


def load_registry(url=REGISTRY_URL):
    """Download the language subtag registry and return its records."""
    return split_registry(read_text(url))


def lookup(pat, records=None):
    """Print every record containing ``pat`` and return those records.

    ``records`` defaults to the module-level ``repo`` list.
    """
    matches = [record for record in (repo if records is None else records)
               if pat in record]
    for record in matches:
        print(record)
    return matches


def plain_languages(records):
    """Return records that contain 'Type: language' and no skipped marker."""
    return [record for record in records
            if 'Type: language' in record
            and not any(marker in record for marker in _SKIPPED_MARKERS)]


def create_corpus_db(path='blogcorpus.db'):
    """Open ``path`` and make sure the ``corpus`` table exists.

    Returns the open connection.  "if not exists" makes a second run a
    no-op; a plain "create table" raises once the table is there.
    """
    con = sqlite.connect(path)
    con.execute(
        """create table if not exists corpus(url, feedurl, content, language)""")
    con.commit()
    return con


def make_language_dirs(codes, root=os.curdir):
    """Create one directory per code under ``root``; return the codes created.

    Codes whose directory already exists are left untouched.
    """
    created = []
    for code in codes:
        target = os.path.join(root, code)
        if not os.path.isdir(target):
            os.mkdir(target)
            created.append(code)
    return created


def outbound_links(hrefs, banned=()):
    """Keep hrefs that start with 'http' and contain none of ``banned``."""
    return [href for href in hrefs
            if href.startswith('http')
            and not any(word in href for word in banned)]


def page_hrefs(markup):
    """Return the href of every tag in an HTML page that has one.

    Selecting on the attribute avoids the KeyError that indexing
    ``a['href']`` raises for an anchor without an href.
    """
    from BeautifulSoup import BeautifulSoup
    return [tag['href']
            for tag in BeautifulSoup(markup).findAll(attrs={'href': True})]


def search_result_urls(markup):
    """Return the first href found inside each class="url" element."""
    from BeautifulSoup import BeautifulSoup
    urls = []
    for holder in BeautifulSoup(markup).findAll(attrs={'class': 'url'}):
        # find() gives one tag or None; looping over it, as the session did,
        # walks that tag's children rather than a list of links.
        link = holder.find(attrs={'href': True})
        if link is not None:
            urls.append(link['href'])
    return urls


def feed_urls(opml, category=None):
    """List the feed URLs of the <outline> elements in an OPML document.

    opml     -- markup string or open file.
    category -- when given, only outlines found beneath an element whose
                ``text`` attribute equals it (e.g. '07 Blogger Indonesia').

    Outlines without a feed URL are skipped.
    """
    from BeautifulSoup import BeautifulStoneSoup
    soup = BeautifulStoneSoup(opml)
    if category is None:
        scopes = [soup]
    else:
        # NOTE(review): the session never settled whether the parser nests
        # the feed outlines under their category -- check on a real export.
        scopes = soup.findAll(attrs={'text': category})
    urls = []
    for scope in scopes:
        for entry in scope.findAll('outline'):
            # The session tried both spellings of the attribute name.
            url = entry.get('xmlurl') or entry.get('xmlUrl')
            if url:
                urls.append(url)
    return urls


def recent_weblog_urls(path='/tmp/shortChanges.xml', limit=10):
    """Return the ``url`` attribute of the first ``limit`` <weblog> elements."""
    from BeautifulSoup import BeautifulStoneSoup
    with open(path) as handle:
        soup = BeautifulStoneSoup(handle)
    return [weblog['url'] for weblog in soup.findAll('weblog')[:limit]]


def draw_graphs():
    """Build the three graphs from the session, export one, draw them all."""
    import networkx
    import networkx.drawing
    import pylab

    graph = networkx.Graph()
    graph.add_edge(1, 2)
    graph.add_node("spam")
    print(graph.nodes())
    print(graph.edges())
    # Calls are kept in the form the session ended on.
    networkx.drawing.write_dot(graph, 'foo.dot')
    for shown in (graph, networkx.petersen_graph(),
                  networkx.barabasi_albert_graph(100, 5)):
        networkx.drawing.draw(shown)
        pylab.show()


def main():
    """Replay the session once, in its original order."""
    # Text file: keep the dashed lines, then total the leading digits.
    with open('lasvegas.txt') as handle:
        lines = handle.readlines()
    picked = dashed_lines(lines)
    print(len(picked))
    with open('las', 'w') as out:
        out.write('\n\n'.join(picked))
    print(sum_leading_digits(lines))

    # Card simulation.
    print(trials(100000))

    draw_graphs()

    # Language subtag registry.
    repo[:] = load_registry()
    lookup('Spanish')
    for record in plain_languages(repo):
        print(record)

    # One working directory per language code.
    sys.path.append('/home/pat/repo/wikialign')
    import languages
    # NOTE(review): an earlier attempt read ``languages.language_codes``
    # from /home/pat/lex/lex/ -- confirm which module is the current one.
    make_language_dirs(languages.codes2names.keys())

    # Feed discovery and parsing.
    import feedfinder
    import feedparser
    print(feedfinder.feed('http://www.daniel-yorick.com'))
    feedparser.parse('http://daniel-yorick.com/feed')

    create_corpus_db('blogcorpus.db').close()

    # Goal stated in the session: list every feed URL under the outline
    # whose text is "07 Blogger Indonesia" in this bloglines OPML export.
    opml = read_text("http://www.bloglines.com/export?id=enda001")
    for url in feed_urls(opml, category='07 Blogger Indonesia'):
        print(url)
    with open('indonesianbloggers.xml') as handle:
        print(feed_urls(handle))

    # Links leading away from the front page.
    front_page = read_text('http://science.reddit.com/')
    for href in outbound_links(page_hrefs(front_page), banned=('reddit',)):
        print(href)

    print(recent_weblog_urls('/tmp/shortChanges.xml'))

    read_text('http://marchsya.blogspot.com/feeds/posts/default')
    feedparser.parse('http://perpustakaanonline.freehostia.com/feed/')

    # Blog search results, de-duplicated and numbered.
    cy = read_text('http://www.bloglines.com/search?q=cymraeg&ql=en&s=f&pop=l&news=m&t=f&n=100')
    blogs = sorted(set(outbound_links(page_hrefs(cy), banned=('bloglines',))))
    for i, blog in enumerate(blogs):
        print('%d %s' % (i, blog))
    for url in search_result_urls(cy):
        print(url)


if __name__ == '__main__':
    main()