package Text::Dokuwiki::Parser; use strict; use warnings; use utf8; use Text::Dokuwiki::Regexps; sub new { my ($class) = @_; my $self = { footnotes => [], }; return bless($self, $class); } sub _parse_list { my ($self, $lines) = @_; my @lists = (); my @stack = ( \@lists ); my $types = {curr => '', last => ''}; my $level = {curr => 0, last => 0}; foreach my $line (@{ $lines }) { $line =~ m/^(\s+)/o; my ($ident, $dot, $rest) = ($line =~ m/^((?:\s{2})+)([\*-])\s*(.+)/); $level->{last} = $level->{curr}; $level->{curr} = $ident =~ tr/ / /; $types->{last} = $types->{curr}; $types->{curr} = ($dot eq '-') ? 'ol' : 'ul'; if ($level->{curr} == $level->{last} and $types->{curr} ne $types->{last}) { pop @stack; my $list = [$types->{curr} => {}]; push @{ $stack[-1] }, (@stack > 1) ? [li => {}, $list] : $list; push @stack, $list; } if ($level->{curr} > $level->{last}) { my $list = [$types->{curr} => {}]; push @{ $stack[-1] }, (@stack > 1) ? [li => {}, $list] : $list; push @stack, $list; } if ($level->{curr} < $level->{last}) { pop @stack; } push @{ $stack[-1] }, [li => {}, $self->_parse_text($rest)]; } pop @stack while @stack; return @lists; } sub _parse_table { my ($self, $lines) = @_; my ($i, $j, $colspan) = (0, 0, 0); my @rows = (); foreach my $line (@{ $lines }) { my @row = (); for ($j = 0; ;$j++) { last if ($line eq '|' or $line eq '^'); next if ($line !~ m/^(([\|\^])([^\|\^]*))/o); my $attrs = {}; my ($all, $key, $value) = ($1, $2, $3); $line = substr($line, length($all)); my $type = ($key eq '^') ? 'th' : 'td'; my ($lpad, $content, $rpad) = ($value =~ m/^(\s*)(.*?)(\s*)$/o); if ($line =~ m/^([\|\^]{2,})/o) { # colspan detected; $attrs->{colspan} = $colspan = length($1); } elsif (index($content, ":::") >= 0) { # rowspan detected for (my $k = $i - 1; $k >= 0; $k--) { # k is idx of prev rows my $cellptr = $rows[$k][$j + 2]; # +2 for (tr => {}) next if (index($cellptr->[2], ":::") >= 0); # also rowspan $cellptr->[1]->{rowspan} //= 1; # init attr, if missing $cellptr->[1]->{rowspan} += 1; # incr value last; } $attrs->{skip} = 1; } elsif ($colspan > 1) { # colspan-eaten column $attrs->{skip} = 1; $colspan -= 1; } if ($lpad eq '') { $attrs->{align} = ($rpad eq '') ? 'center' : 'left'; } else { $attrs->{align} = ($rpad eq '') ? 'right' : 'center'; } push @row, [$type => $attrs, $content]; } push @rows, [tr => {}, @row]; $i++; } return [table => {}, @rows]; } sub _parse_link { my ($self, $content) = @_; my ($href, @text); if ((my $pos = index($content, "|")) >= 0) { $href = substr($content, 0, $pos); @text = $self->_parse_text(substr($content, $pos + 1)); } else { $href = $content; @text = ($content); } unless ($href =~ m!^[a-z]+://!io) { $href =~ s{:}{/}go; } return [a => {href => $href}, @text]; } sub _parse_include { my ($self, $content) = @_; $content =~ s/^{{//o; $content =~ s/}}$//o; if ($content =~ m|^([a-z]+)>(.*)|oi) { warn "ignored: `$content`, unimplemented\n"; return; } my ($lpad, $rpad, $src, %attrs) = ('', '', ''); if ($content =~ m!^(\s*)([^\s\|]+)(\s*)[|](.*)!oi) { ($lpad, $src, $rpad) = ($1, $2, $3); $attrs{title} = $4; } elsif ($content =~ m!^(\s*)(\S+)(\s*)!) { ($lpad, $src, $rpad) = ($1, $2, $3); } else { ($src) = ($content =~ s/^(.+)/$1/or); } $attrs{align} = ($lpad ne '') ? (($rpad ne '') ? 'center' : 'right') : 'left'; $src =~ s<\?(\d+)(?:x(\d+))?> < $attrs{width} = $1 if $1; $attrs{height} = $2 if $2; '' >ioe; $src =~ s|:|/|go; $attrs{src} = $src; return [img => \%attrs]; } my $inlines = { "__" => "__", # underline "//" => "//", # italic "**" => "**", # bold "''" => "''", # monospace "{{" => "}}", # module/image "[[" => "]]", # link "((" => "))", # footnote }; sub _parse_text { my ($self, $line) = @_; my ($endtag, $endpos, $content); my @parts = (); while ($line) { $line =~ m!^(?:(.*?)(__|//|''|\*\*|\[\[|\{\{|\(\(|\\\\ ))?(.*)!o; my ($before, $match, $after) = ($1, $2, $3); if ($before) { push @parts, $before; $line = $match . $after; next; } if ($after and not $match) { push @parts, $after; last; } if ($match eq '\\\\ ') { # force newline push @parts, [br => {}]; $line = $after; next; } elsif ($match) { $endtag = $inlines->{$match}; $endpos = index($after, $endtag, 0); if ($endpos < 0) { # no closing marker? push @parts, $match; $line = $after; next; } $content = substr($after, 0, $endpos); $line = substr($after, $endpos + 2); } my %attrs; if ($match eq "//") { %attrs = ('font-style' => 'italic'); } elsif ($match eq "**") { %attrs = ('font-weight' => 'bold'); } elsif ($match eq "''") { %attrs = ('font-family' => 'monospace'); } elsif ($match eq "__") { %attrs = ('text-decoration' => 'underline'); } elsif ($match eq "{{") { push @parts, $self->_parse_include($content); next; } elsif ($match eq "[[") { push @parts, $self->_parse_link($content); next; } elsif ($match eq "((") { push @{ $self->{footnotes} }, $self->_parse_text($content); my $n = scalar @{ $self->{footnotes} }; push @parts, [sup => {}, [a => {href => "#note_$n"}, $n]]; next; } elsif ($match) { die("unrecognized inline: $match\n"); } if (%attrs) { push @parts, [span => \%attrs, $self->_parse_text($content)]; next; } } return @parts; } sub parse { my ($self, $text) = @_; my $rx = $Text::Dokuwiki::Regexps::regexps; my (@tree); my $len = length($text); pos($text) = 0; while ($len - pos($text) > 0) { my $pos = pos($text); if (0) { # TODO: MACRO } elsif ($text =~ m/\G$rx->{header}/cgi) { my $level = length($+{line}); $level = 7 - $level; # invert push @tree, ["h$level" => {}, $+{header}]; } elsif ($text =~ m/\G$rx->{codeblock}/cgi) { if ($+{tag} eq 'file' and $+{filename}) { my $attrs = $+{syntax} ? {class => $+{syntax}} : {}; my $dt = [dt => {}, $+{filename}]; my $dd = [dt => {}, [pre => $attrs, $+{block}]]; push @tree, [dl => {class => 'file'}, [$dt, $dd]]; } elsif ($+{tag} eq 'file') { push @tree, [pre => {class => 'file'}, $+{block}]; } else { my $attrs = $+{syntax} ? {class => $+{syntax}} : {}; push @tree, [code => $attrs, [pre => {}, $+{block}]]; } } elsif ($text =~ m/\G$rx->{table}/cgi) { my $lines = $+{table}; chomp $lines; push @tree, $self->_parse_table([ split(/\r?\n/, $lines) ]); } elsif ($text =~ m/\G$rx->{list}/cgi) { my $lines = $+{list}; chomp $lines; push @tree, $self->_parse_list([ split(/\r?\n/, $lines) ]); } elsif ($text =~ m/\G$rx->{blockquote}/cgi) { push @tree, [blockquote => {}, $+{block}]; } elsif ($text =~ m/\G$rx->{pre}/cgi) { push @tree, [pre => {}, $+{block}]; } elsif ($text =~ m/\G$rx->{paragraph}/cgi) { my $text = $+{text}; chomp $text; push @tree, [p => {}, $self->_parse_text($text)]; } elsif ($text =~ m/\G$rx->{emptyline}/cgi) { next; } else { my $msg = "Unmatched: '" . substr($text, pos($text)) . "'\n"; die($msg); } if (pos($text) <= $pos) { die("parser failed, abort\n"); } } if (scalar @{ $self->{footnotes} }) { my @fn = @{ $self->{footnotes} }; my @out = (); for (my $i = 0; $i <= $#fn; $i++) { my $name = sprintf "note_%d", $i + 1; push @out, [li => {}, [a => {name => $name}], $fn[$i]]; } push @tree, [ol => {}, @out]; } return [div => {}, @tree ]; } 1; __END__ [div => {class => 'block'}, #
Second paragraph
] #