package Subtitle::Format::SMI; use strict; use warnings; use utf8; use Subtitle::Event; use Subtitle::Utils qw(:string :timing); use base 'Subtitle::Format'; sub new { my ($class, %args) = @_; my $self = { debug => $args{debug} || 0, eol => $args{eol} || "\n", title => '', style => undef, params => {}, events => [], log => [], }; return bless($self, $class); } sub new_event { return Subtitle::Event->new; } sub parse { my ($self, $lines) = @_; my $text = join("\n", @{ $lines }); ## the code below taken from HTML::SimpleParse 0.12 ## and modified for modern perl versions my @tokens; while (1) { my ($content, $type); # First we try to pull off any plain text (anything before a "<" char) if ($text =~ /\G([^<]+)/gcs) { $content = $1; $type = 'text'; trim($content); chomp_all($content); # Then, SSI, comments, and markup declarations (usually ) # ssi: # comment: # markup: } elsif ($text =~ /\G<(!--(\#?).*?--)>/gcs) { $type = ($2 ? 'ssi' : 'comment'); $content = $1; } elsif ($text =~ /\G<(!.*?)>/gcs) { $type = 'markup'; $content = $1; # Then, look for an end tag } elsif ($text =~ m|\G|gcs) { $content = lc($1); $type = 'endtag'; # Then, finally we look for a start tag # We know the first char is <, make sure there's a > } elsif ($text =~ /\G<(.*?)>/gcs) { $content = lc($1); $type = 'starttag'; } else { # the string is exhausted, or there's no > in it. push @tokens, { 'content' => substr($text, pos $text), 'type' => 'text', } unless (pos($text) eq length($text)); last; } ## end of imported code push @tokens, { 'content' => $content, 'type' => $type, 'offset' => ($type eq 'text' ? pos($text) - length($content) : pos($text) - length($content) - 2), } unless $type eq 'text' and length($content) == 0; } my @stack; my $event; my $path = ''; foreach my $t (@tokens) { if ($t->{type} eq 'starttag') { my ($n) = ($t->{content} =~ m{^(\S+)}); $t->{name} = $n; pos($t->{content}) = length($n); while ($t->{content} =~ m<\G\s+(\S+)=['"]?(\S+)['"]?>gsc) { $t->{params}->{$1} = $2; } if (_is_markup_tag($n)) { # skip: ignore } elsif (@stack and $stack[-1]->{name} eq $n) { # skip: same tag } else { $path .= '/' . lc($n); push @stack, $t; } } elsif ($t->{type} eq 'endtag') { unless (_is_markup_tag($t->{content})) { my $n = $t->{content}; $path =~ s<>i; pop @stack; } next; } # if ($path eq '/sami/head/title' and $t->{type} eq 'text') { $self->{title} = $t->{content}; } elsif ($path eq '/sami/head/samiparam' and $t->{type} eq 'comment') { my $c = _comment_content($t->{content}); $c =~ m{(\S+):\s*(\S+)}; $self->{params}->{lc($1)} = $2 if $1; } elsif ($path eq '/sami/head/style' and $t->{type} eq 'comment') { $self->{style} = _comment_content($t->{content}); } elsif ($path eq '/sami/body/table/sync') { if ($t->{type} eq 'starttag' and $t->{name} eq 'sync') { my $time = $t->{params}->{start}; if ($event and $time) { $event->t_end($time); push @{ $self->{events} }, $event; undef $event; } $event = $self->new_event; $event->t_start($time) if defined $time; } elsif ($t->{type} eq 'starttag' and $t->{name} eq 'p') { $self->log(error => "

tag outside event at offset $t->{offset}") unless $event; $t->{content} =~ mi; $event->{class} = $1 if $1; } elsif ($t->{type} eq 'starttag' and $t->{name} eq 'br') { my $c = $event->text; $event->text($c . "\\n"); } elsif ($t->{type} eq 'text') { if ($t->{content} eq ' ') { undef $event; # empty event used to hide previous one } else { my $c = $event->text; $c .= $t->{content}; $c =~ tr/\ \t//s; # remove multiple spaces $event->text($c); } } else { } } else { #print "unhandled token: $t->{type} -- $t->{content}\n"; } } # foreach @tokens if ($event and my $t = $self->{params}->{length}) { $event->t_end($t); push @{ $self->{events} }, $event; undef $event; } return scalar @{ $self->{events} }; } sub build { my ($self) = @_; my @lines; push @lines, ""; push @lines, " "; if ($self->{title}) { my $line = " " x 4 . _tag_wrap('title', $self->{title}); push @lines, $line; } if (scalar keys %{ $self->{params} }) { while (my ($k, $v) = each %{ $self->{params} }) { my $c = sprintf '', ucfirst($k), $v; my $line = " " x 4 . _tag_wrap('samiparam', $c); push @lines, $line; } } if ($self->{style}) { push @lines, ' '; } push @lines, " "; push @lines, " "; push @lines, " "; my @events = $self->events; while (my $e = shift @events) { my $t = $e->text; $t =~ s{\\n}{
}smg; my $l = _tag_wrap('p', $t, {class => $e->{class}}); push @lines, " " x 6 . _tag_wrap('sync', $l, {start => $e->t_start}); if (@events and $e->t_end == $events[0]->t_start) { # next event replaces current without gap, # no need to hide if explicitly next; } else { my $t = _tag_wrap('p', ' '); push @lines, " " x 6 . _tag_wrap('sync', $t, {start => $e->t_end}); } } push @lines, "
"; push @lines, " "; push @lines, "
", ""; return join($self->{eol}, @lines); } sub _tag_wrap { my ($name, $content, $attrs) = @_; my @attrs; if ($attrs and ref($attrs) eq 'HASH') { while (my ($k, $v) = each %{ $attrs }) { next unless defined $v; push @attrs, sprintf(' %s=%s', $k, $v); } } return sprintf("<%s%s>%s", $name, join('', @attrs), $content, $name); } sub _is_markup_tag { my $t = lc(shift); return 1 if $t eq 'p'; return 1 if $t eq 'br'; return; } sub _comment_content { my $c = shift; $c =~ s<(^!-*\s*|\s*-*$)><>go; # strip !-- and -- $c =~ s<^[\r\n]*><>o; $c =~ s<[\r\n]*$><>o; $c =~ s<^\s+><>gms; trim($c); return $c; } 1;