import re
from string import punctuation
from collections import defaultdict
from wikialign import * 

# Character class matching any single ASCII punctuation character.
# The characters are run through re.escape so that ']', '\', '^' and '-' are
# taken literally inside the brackets.  Without escaping, the '\]' pair in
# string.punctuation is parsed as an escaped ']' and a literal backslash is
# never matched.
puncRE = re.compile('[' + re.escape(punctuation) + ']')
# Download the 'id' and 'en' editions of the Barack_Obama article and decode
# each result from UTF-8.
# NOTE(review): `id` shadows the builtin of the same name; the name is kept
# because later statements in this script read it.
id, en = (
    download_article(lang, 'Barack_Obama').decode('utf-8')
    for lang in ('id', 'en')
)

def chunks(text):
    """Split *text* at every match of the module-level ``puncRE`` pattern.

    Returns the list of pieces produced by ``puncRE.split``.
    """
    pieces = puncRE.split(text)
    return pieces

# Regex alternation of every link prefix to strip: all keys of codes2names
# plus a few prefixes added by hand.  list(...) keeps the concatenation valid
# when keys() returns a view object rather than a list.
code = '(' + '|'.join(
    list(codes2names.keys()) + ['simple', 'zh-yue', 'Image', 'pdc']
) + ')'
# Matches a wiki link of the form [[<prefix>:<anything up to ']'>]].
# Written as a raw string so the regex escapes \[ and \] are not read as
# (invalid) string escape sequences; the resulting pattern text is unchanged.
linkRE = re.compile(r'\[\[%s:([^\]]+)\]\]' % code)
 
def clean(text):
    """Delete every ``linkRE`` match from *text*, then apply ``strip_info_boxes``.

    Returns whatever ``strip_info_boxes`` returns for the link-free text.
    """
    without_links = linkRE.sub('', text)
    return strip_info_boxes(without_links)

# Run both downloaded articles through clean().
# NOTE(review): `id` shadows the builtin of the same name; the name is kept
# because it is the one bound earlier in this script.
id = clean(id)
en = clean(en)

# Print each distinct whitespace-separated token once, shortest first.
# Ties in length are broken by the token itself, so the output order is
# reproducible instead of depending on set iteration order.
# Single-argument print(...) produces the same output as a Python 2 print
# statement and is also valid as the Python 3 print function.
print('INDONESIAN WORDS')
print(' '.join(sorted(set(id.split()), key=lambda word: (len(word), word))))
# The same listing for the English article, currently disabled:
#print('ENGLISH WORDS')
#print(' '.join(sorted(set(en.split()), key=lambda word: (len(word), word))))


