diff --git a/lib/Subtitle/Format/SMI.pm b/lib/Subtitle/Format/SMI.pm new file mode 100644 index 0000000..5f84462 --- /dev/null +++ b/lib/Subtitle/Format/SMI.pm @@ -0,0 +1,235 @@ +package Subtitle::Format::SMI; + +use strict; +use warnings; +use utf8; + +use Subtitle::Event; +use Subtitle::Utils qw(:string :timing); + +use base 'Subtitle::Format'; + +sub new { + my ($class, %args) = @_; + my $self = { + debug => $args{debug} || 0, + eol => $args{eol} || "\n", + title => '', + style => undef, + params => {}, + events => [], + log => [], + }; + + return bless($self, $class); +} + +sub new_event { return Subtitle::Event->new; } + +sub parse { + my ($self, $lines) = @_; + my $text = join("\n", @{ $lines }); + + ## the code below taken from HTML::SimpleParse 0.12 + ## and modified for modern perl versions + my @tokens; + while (1) { + my ($content, $type); + + # First we try to pull off any plain text (anything before a "<" char) + if ($text =~ /\G([^<]+)/gcs) { + $content = $1; $type = 'text'; + trim($content); + chomp_all($content); + + # Then, SSI, comments, and markup declarations (usually ) + # ssi: + # comment: + # markup: + } elsif ($text =~ /\G<(!--(\#?).*?--)>/gcs) { + $type = ($2 ? 'ssi' : 'comment'); + $content = $1; + } elsif ($text =~ /\G<(!.*?)>/gcs) { + $type = 'markup'; + $content = $1; + + # Then, look for an end tag + } elsif ($text =~ m|\G|gcs) { + $content = lc($1); $type = 'endtag'; + + # Then, finally we look for a start tag + # We know the first char is <, make sure there's a > + } elsif ($text =~ /\G<(.*?)>/gcs) { + $content = lc($1); $type = 'starttag'; + + } else { + # the string is exhausted, or there's no > in it. + push @tokens, { + 'content' => substr($text, pos $text), + 'type' => 'text', + } unless (pos($text) eq length($text)); + last; + } + ## end of imported code + + push @tokens, { + 'content' => $content, + 'type' => $type, + 'offset' => ($type eq 'text' ? + pos($text) - length($content) : + pos($text) - length($content) - 2), + } unless $type eq 'text' and length($content) == 0; + } + + my @stack; + my $event; + my $path = ''; + foreach my $t (@tokens) { + if ($t->{type} eq 'starttag') { + my ($n) = ($t->{content} =~ m{^(\S+)}); + $t->{name} = $n; + pos($t->{content}) = length($n); + while ($t->{content} =~ m<\G\s+(\S+)=['"]?(\S+)['"]?>gsc) { + $t->{params}->{$1} = $2; + } + if (_is_markup_tag($n)) { + # skip: ignore + } elsif (@stack and $stack[-1]->{name} eq $n) { + # skip: same tag + } else { + $path .= '/' . lc($n); + push @stack, $t; + } + } elsif ($t->{type} eq 'endtag') { + unless (_is_markup_tag($t->{content})) { + my $n = $t->{content}; + $path =~ s<>i; + pop @stack; + } + next; + } + # + if ($path eq '/sami/head/title' and $t->{type} eq 'text') { + $self->{title} = $t->{content}; + } elsif ($path eq '/sami/head/samiparam' and $t->{type} eq 'comment') { + my $c = _comment_content($t->{content}); + $c =~ m{(\S+):\s*(\S+)}; + $self->{params}->{lc($1)} = $2 if $1; + } elsif ($path eq '/sami/head/style' and $t->{type} eq 'comment') { + $self->{style} = _comment_content($t->{content}); + } elsif ($path eq '/sami/body/table/sync') { + if ($t->{type} eq 'starttag' and $t->{name} eq 'sync') { + my $time = $t->{params}->{start}; + if ($event and $time) { + $event->t_end($time); + push @{ $self->{events} }, $event; + undef $event; + } + $event = $self->new_event; + $event->t_start($time) if defined $time; + } elsif ($t->{type} eq 'starttag' and $t->{name} eq 'p') { + $self->log(error => "

tag outside event at offset $t->{offset}") + unless $event; + $t->{content} =~ mi; + $event->{class} = $1 if $1; + } elsif ($t->{type} eq 'starttag' and $t->{name} eq 'br') { + my $c = $event->text; + $event->text($c . "\\n"); + } elsif ($t->{type} eq 'text') { + if ($t->{content} eq ' ') { + undef $event; # empty event used to hide previous one + } else { + my $c = $event->text; + $c .= $t->{content}; + $c =~ tr/\ \t//s; # remove multiple spaces + $event->text($c); + } + } else { + } + } else { + #print "unhandled token: $t->{type} -- $t->{content}\n"; + } + } # foreach @tokens + if ($event and my $t = $self->{params}->{length}) { + $event->t_end($t); + push @{ $self->{events} }, $event; + undef $event; + } + return scalar @{ $self->{events} }; +} + +sub build { + my ($self) = @_; + my @lines; + push @lines, ""; + push @lines, " "; + if ($self->{title}) { + my $line = " " x 4 . _tag_wrap('title', $self->{title}); + push @lines, $line; + } + if (scalar keys %{ $self->{params} }) { + while (my ($k, $v) = each %{ $self->{params} }) { + my $c = sprintf '', ucfirst($k), $v; + my $line = " " x 4 . _tag_wrap('samiparam', $c); + push @lines, $line; + } + } + if ($self->{style}) { + push @lines, ' '; + } + push @lines, " "; + push @lines, " "; + push @lines, " "; + my @events = $self->events; + while (my $e = shift @events) { + my $t = $e->text; + $t =~ s{\\n}{
}smg; + my $l = _tag_wrap('p', $t, {class => $e->{class}}); + push @lines, " " x 6 . _tag_wrap('sync', $l, {start => $e->t_start}); + if (@events and $e->t_end == $events[0]->t_start) { + # next event replaces current without gap, + # no need to hide if explicitly + next; + } else { + my $t = _tag_wrap('p', ' '); + push @lines, " " x 6 . _tag_wrap('sync', $t, {start => $e->t_end}); + } + } + push @lines, "
"; + push @lines, " "; + push @lines, "
", ""; + return join($self->{eol}, @lines); +} + +sub _tag_wrap { + my ($name, $content, $attrs) = @_; + my @attrs; + if ($attrs and ref($attrs) eq 'HASH') { + while (my ($k, $v) = each %{ $attrs }) { + next unless defined $v; + push @attrs, sprintf(' %s=%s', $k, $v); + } + } + return sprintf("<%s%s>%s", $name, join('', @attrs), $content, $name); +} + +sub _is_markup_tag { + my $t = lc(shift); + return 1 if $t eq 'p'; + return 1 if $t eq 'br'; + return; +} + +sub _comment_content { + my $c = shift; + $c =~ s<(^!-*\s*|\s*-*$)><>go; # strip !-- and -- + $c =~ s<^[\r\n]*><>o; + $c =~ s<[\r\n]*$><>o; + $c =~ s<^\s+><>gms; + trim($c); + return $c; +} + +1;