#!/usr/bin/env python # -*- coding: utf-8 -*- import sys, re from collections import defaultdict from operator import attrgetter, itemgetter import codecs sys.path.append('/home/pat/.pylib/') from textual import tokenize sys.path.append('/home/pat/repo/identify') from vecid import id sys.stdout = codecs.getwriter('utf-8')(sys.stdout) from languages import names2codes """ srtalign.py - process .srt formatted subtitle files 2 00:05:23,075 --> 00:05:28,710 sitä enemmän alat huomata, että meille on valehdeltu. """ def ufile(filename): return open(filename, 'U').read().decode('utf-8').strip() class Line: """ A single line of a Movie. In subtitle terms, a line which has its own timestamp. """ def __init__(self, line): self.parts = line.splitlines() self.number = self.parts[0] self.stamp = self.get_time(self.parts[1]) self.text = ' '.join(self.parts[2:]).replace('\n',' ') def to_seconds(self, stamp): stamp = stamp.strip() hours, minutes, seconds, milliseconds = re.split('[,:]', stamp) hours, minutes, seconds, milliseconds = tuple([int(x) for x in (hours, minutes, seconds, milliseconds)]) return (hours * 60 * 60 + minutes * 60 + seconds + (float(milliseconds)/1000)) def get_time(self, stamp): start, stop = map(self.to_seconds, stamp.split('-->')) return start class Movie: """ Container for lines; holds a screenplay Language is identified statistically """ def __init__(self, content): self.movie = content self.language = id(self.movie)[-1][1] # make this api more encapsulated self.code = names2codes[self.language] self.lines = self.get_lines() self.text = ' '.join([line.text for line in self.lines]) self.words = tokenize(self.text) self.vocab = set(self.words) def get_lines(self): return [Line(line) for line in re.split('\n\n+', self.movie)] class Alignment: """ Structure that holds two Movie objects with their Lines aligned by timestamp """ def __init__(self, alpha, beta): self.alpha = Movie(alpha) self.beta = Movie(beta) self.aligned = self.align() def align(self): aligned = [] for movie in self.alpha, self.beta: for line in movie.lines: aligned.append( (movie.code, line.stamp, line.text)) return aligned def dump(self): for code, stamp, text in sorted(self.aligned, key=itemgetter(1)): print "%s\t%.3d\t%s" % (code, stamp, text) """ def dump_table(alpha, beta): start = 0 end = max(map(len, [alpha.lines, beta.lines])) table = ["