#!/usr/bin/env python # -*- coding:utf-8 -*- import re import sys, os from unicodedata import normalize rules = open('/home/pat/repo/accents/DiacriticFolding.txt').readlines() rules = [rule.strip() for rule in rules if not rule.startswith('#') and len(rule)>1] def process_rule(rule): uncommented = rule.split('#')[0] before, after = [side.strip() for side in uncommented.split(';')] return (before.split(), after) rules = [process_rule(rule) for rule in rules] subs = [] for before, after in rules: before = [unichr(int(codepoint,16)) for codepoint in before] before = ''.join(before) after = unichr(int(after,16)) subs.append((before, after)) def remove_accents_from_text(text): text = normalize('NFC',text) text = normalize('NFD', text) for before, after in subs: text = re.sub(before, after, text) fixed = normalize('NFC',text) return text if __name__ == "__main__": text = sys.stdin.read().decode('utf-8') print u"bḙfṍrḗ:" print text print print u"after:" print remove_accents_from_text(text)