package Text::Dokuwiki::Parser;

# Converts DokuWiki markup into a nested array tree.  Every node has the
# shape [tag => {attributes}, @children], where a child is either a plain
# string or another node (see the example after __END__).

use strict;
use warnings;
use feature qw/ switch /;
use utf8;

# Constructor.  The parser keeps no state between calls.
sub new {
    my ($class) = @_;
    return bless {}, $class;
}

# Builds list nodes from consecutive list lines.
#
#   $lines - arrayref of raw lines such as "  * item" or "    - item"
#
# Returns the list of top-level [ul => ...] / [ol => ...] nodes.  A nested
# list is wrapped in an <li> of its own inside the parent list.
sub _parse_list {
    my ($self, $lines) = @_;

    my @lists;    # finished top-level lists, in document order
    my @open;     # currently open lists, innermost last

    foreach my $line (@{ $lines }) {
        my ($indent, $bullet, $rest) =
            ($line =~ m/^((?:\s{2})+)([*-])\s*(.+)/);
        next unless defined $indent;    # not a list item

        my $depth = length $indent;
        my $type  = ($bullet eq '-') ? 'ol' : 'ul';

        # Leave every list nested deeper than this item.  Popping in a loop
        # matters: a dedent may skip several levels at once.
        pop @open while @open and $open[-1]{depth} > $depth;

        # A sibling with the other bullet kind ends the current list.
        pop @open
            if @open
            and $open[-1]{depth} == $depth
            and $open[-1]{type} ne $type;

        unless (@open and $open[-1]{depth} == $depth) {
            my $list = [$type => {}];
            if (@open) {
                push @{ $open[-1]{node} }, [li => {}, $list];
            }
            else {
                push @lists, $list;
            }
            push @open, {depth => $depth, type => $type, node => $list};
        }

        push @{ $open[-1]{node} }, [li => {}, $self->_parse_text($rest)];
    }

    return @lists;
}

# Builds a table node from consecutive table lines.
#
#   $lines - arrayref of raw lines; cells are introduced by "|" (td) or
#            "^" (th)
#
# Returns [table => {}, @rows] with rows of the form [tr => {}, @cells].
# Each cell carries an "align" attribute, and optionally "colspan",
# "rowspan" or "skip" (the cell is covered by a neighbouring span).
sub _parse_table {
    my ($self, $lines) = @_;
    my @rows;

    foreach my $raw (@{ $lines }) {
        # Work on a trimmed copy: the caller's buffer stays untouched and
        # surrounding whitespace cannot hide the delimiters.
        (my $line = $raw) =~ s/^\s+|\s+$//g;

        my @row;
        my $eaten = 0;    # cells still covered by the current colspan

        for (my $col = 0; ; $col++) {
            last if $line eq '|' or $line eq '^';    # closing delimiter
            # Stop (instead of spinning forever) once nothing is left.
            last unless $line =~ m/^([|^])([^|^]*)/;

            my ($sep, $value) = ($1, $2);
            $line = substr($line, length($sep) + length($value));

            my $type  = ($sep eq '^') ? 'th' : 'td';
            my $attrs = {};
            my ($lpad, $content, $rpad) = ($value =~ m/^(\s*)(.*?)(\s*)$/);

            if ($eaten > 0) {
                # Empty cell swallowed by a preceding colspan.
                $attrs->{skip} = 1;
                $eaten--;
            }
            elsif ($line =~ m/^([|^]{2,})/) {
                # Directly following delimiters widen this cell.
                $attrs->{colspan} = length($1);
                $eaten = length($1) - 1;
            }
            elsif (index($content, ':::') >= 0) {
                # Rowspan marker: extend the nearest real cell above.
                for (my $k = $#rows; $k >= 0; $k--) {
                    my $above = $rows[$k][$col + 2];    # +2 for (tr => {})
                    last unless ref $above;             # row above is shorter
                    next if index($above->[2], ':::') >= 0;
                    $above->[1]{rowspan} //= 1;
                    $above->[1]{rowspan}++;
                    last;
                }
                $attrs->{skip} = 1;
            }

            if ($lpad eq '') {
                $attrs->{align} = ($rpad eq '') ? 'center' : 'left';
            }
            else {
                $attrs->{align} = ($rpad eq '') ? 'right' : 'center';
            }

            push @row, [$type => $attrs, $content];
        }

        push @rows, [tr => {}, @row];
    }

    return [table => {}, @rows];
}

# Builds a link node from the text between "[[" and "]]".
#
#   $content - "target" or "target|label"
#
# Returns [a => {href => ...}, @label].  For targets without a URL scheme
# every ":" (namespace separator) becomes "/".
sub _parse_link {
    my ($self, $content) = @_;
    my ($href, @text);

    my $pos = index($content, '|');
    if ($pos >= 0) {
        $href = substr($content, 0, $pos);
        @text = $self->_parse_text(substr($content, $pos + 1));
    }
    else {
        $href = $content;
        @text = ($content);
    }

    $href =~ tr{:}{/} unless $href =~ m{^[a-z]+://}i;

    return [a => {href => $href}, @text];
}

# Builds an image node from an include, given either with or without the
# surrounding "{{" / "}}".
#
#   $content - "src", "src?WIDTH", "src?WIDTHxHEIGHT" or "src|title";
#              whitespace around src selects the alignment
#
# Returns [img => {src, align, width?, height?, title?}].
sub _parse_include {
    my ($self, $content) = @_;

    # TODO: module syntax such as {{name>argument}} is not implemented.
    if ($content =~ m/\{\{([a-z]+)>(.*)\}\}/i) {
        ...;
    }

    # Callers inside this file pass the text without the braces; accept
    # both forms.
    (my $inner = $content) =~ s/\A\{\{(.*)\}\}\z/$1/s;

    my ($lpad, $src, $rpad) = ('', $inner, '');
    my %attrs;
    if ($inner =~ m/\A(\s*)([^\s|]+)(\s*)\|(.*)\z/s) {
        ($lpad, $src, $rpad) = ($1, $2, $3);
        $attrs{title} = $4;
    }
    elsif ($inner =~ m/\A(\s*)(\S+)(\s*)\z/) {
        ($lpad, $src, $rpad) = ($1, $2, $3);
    }

    $attrs{align} = ($lpad ne '')
                  ? (($rpad ne '') ? 'center' : 'right')
                  : 'left';

    # Optional size suffix: ?WIDTH or ?WIDTHxHEIGHT
    if ($src =~ s/\?(\d+)(?:x(\d+))?//) {
        $attrs{width}  = $1 if $1;
        $attrs{height} = $2 if $2;
    }

    $src =~ tr{:}{/} unless $src =~ m{^[a-z]+://}i;
    $attrs{src} = $src;

    return [img => \%attrs];
}

# Opening inline marker => closing marker.
my $inlines = {
    "__" => "__",    # underline
    "//" => "//",    # italic
    "**" => "**",    # bold
    "''" => "''",    # monospace
    "{{" => "}}",    # module/image
    "[[" => "]]",    # link
    "((" => "))",    # footnote
};

# Matches any opening marker, or a forced newline: two backslashes followed
# by whitespace or the end of the text.
my $inline_open_re = do {
    my $markers = join '|', map { quotemeta } sort keys %{ $inlines };
    qr/$markers|\\\\(?=\s|\z)/;
};

# Style attribute emitted for the markers that become a <span>.
my %span_style = (
    "//" => ['font-style'      => 'italic'],
    "**" => ['font-weight'     => 'bold'],
    "''" => ['font-family'     => 'monospace'],
    "__" => ['text-decoration' => 'underline'],
);

# Splits one line of text into plain strings and inline nodes.
#
#   $line - text without a trailing newline
#
# Returns a list of strings and nodes (span, a, img, br).  An opening
# marker without a closing partner is kept as literal text.  Dies on
# footnotes, which are not implemented.
sub _parse_text {
    my ($self, $line) = @_;
    my @parts;

    # Test the length, not the truth value: the text "0" is valid content.
    while (defined $line and length $line) {
        my ($before, $match, $after) =
            ($line =~ m/\A(?:(.*?)($inline_open_re))?(.*)\z/s);

        unless (defined $match) {    # no marker left: all plain text
            push @parts, $after;
            last;
        }
        push @parts, $before if length $before;

        if ($match eq '\\\\') {      # forced newline
            push @parts, [br => {}];
            ($line = $after) =~ s/\A\s//;
            next;
        }

        my $endtag = $inlines->{$match};
        my $endpos = index($after, $endtag);
        if ($endpos < 0) {           # no closing marker: literal text
            push @parts, $match;
            $line = $after;
            next;
        }

        my $content = substr($after, 0, $endpos);
        $line = substr($after, $endpos + length($endtag));

        if (my $style = $span_style{$match}) {
            push @parts, [span => { @{ $style } }, $self->_parse_text($content)];
        }
        elsif ($match eq "{{") {
            push @parts, $self->_parse_include($content);
        }
        elsif ($match eq "[[") {
            push @parts, $self->_parse_link($content);
        }
        elsif ($match eq "((") {
            ...;    # TODO: footnotes are not implemented
        }
        else {
            die("unrecognized inline: $match\n");
        }
    }

    return @parts;
}

# Turns the lines collected for a multi-line construct into tree nodes.
#
#   $mode  - 'code', 'list', 'table' or 'block/(file|code|nowiki)'
#   $attrs - hashref with optional "class" and "file" (block modes)
#   $buf   - collected lines: a string for 'code', an arrayref otherwise
#
# Returns the resulting nodes, or nothing for any other mode.
sub _finish_pending {
    my ($self, $mode, $attrs, $buf) = @_;

    return [pre => {}, $buf]          if $mode eq 'code';
    return $self->_parse_list($buf)   if $mode eq 'list';
    return $self->_parse_table($buf)  if $mode eq 'table';

    if ($mode =~ m{\Ablock/(file|code|nowiki)\z}) {
        my $kind = $1;
        my $body = join("\n", @{ $buf || [] });

        if ($kind eq 'file') {
            my $dt = [dt => {}, $attrs->{file}];
            my $dd = [dd => {}, [pre => {class => $attrs->{class}}, $body]];
            return [dl => {class => 'file'}, $dt, $dd];
        }
        return [pre => {}, $body] if $kind eq 'nowiki';
        return [code => {class => $attrs->{class}}, $body];
    }

    return;
}

# Parses a complete DokuWiki document.
#
#   $text - the markup; lines may end in "\n" or "\r\n"
#
# Returns [div => {}, @nodes].  A construct still open at the end of the
# input (list, table, code, unterminated block) is emitted as well.  Lines
# that match no rule are reported with warn().
sub parse {
    my ($self, $text) = @_;
    $text //= '';

    my @tree;
    my ($mode, $attrs, $buf) = ('', {}, undef);
    my $linenum = 0;

    LINE:
    foreach my $line (split /\r?\n/, $text) {
        $linenum++;

        # Inside <code>/<file>/<nowiki>: collect verbatim until the closer.
        if ($mode =~ m{\Ablock/(file|code|nowiki)\z}) {
            my $kind = $1;
            if ($line =~ m{^\s*</\Q$kind\E>}) {
                push @tree, $self->_finish_pending($mode, $attrs, $buf);
                ($mode, $attrs, $buf) = ('', {}, undef);
            }
            else {
                push @{ $buf }, $line;
            }
            next LINE;
        }

        # Multi-line constructs: keep collecting while the line still fits.
        if ($mode eq 'code') {
            if ($line =~ m/^\s{2}.+/) {
                $buf .= $line . "\n";
                next LINE;
            }
        }
        elsif ($mode eq 'list') {
            if ($line =~ m/^(?:\s{2})+[*-]\s+.+/) {
                push @{ $buf }, $line;
                next LINE;
            }
        }
        elsif ($mode eq 'table') {
            if ($line =~ m/^\s?[|^]/) {
                push @{ $buf }, $line;
                next LINE;
            }
        }

        # The construct ended on the previous line; the current line is
        # still examined below.
        if ($mode ne '') {
            push @tree, $self->_finish_pending($mode, $attrs, $buf);
            ($mode, $attrs, $buf) = ('', {}, undef);
        }

        if ($line =~ m/~~NO(?:TOC|CACHE)~~/) {
            # macro: ignored
        }
        elsif ($line =~ m/^\s?(={2,6}) (.+) \g{1}\s*/) {
            # header; more "=" means a more important heading
            my ($marks, $title) = ($1, $2);
            my $level = 7 - length($marks);
            push @tree, ["h$level" => {}, $title];
        }
        elsif ($line =~ m/^\s?<(code|file)(?:\s+([^\s>]+)(?:\s+([^\s>]+))?)?\s*>\s*$/) {
            # code/file block, optionally with language and file name
            $mode  = "block/$1";
            $attrs = defined $2 ? {class => $2, file => $3} : {};
            $buf   = [];
        }
        elsif ($line =~ m/^\s?<nowiki>\s*$/) {
            # nowiki block
            $mode = "block/nowiki";
            $buf  = [];
        }
        elsif ($line =~ m/^(?:\s{2})+[*-]\s+.+/) {
            # lists
            $mode = 'list';
            $buf  = [$line];
        }
        elsif ($line =~ m/^\s?(>+)\s*(.+)/) {
            # quotes; the number of ">" is the nesting level
            push @tree, [blockquote => {level => length($1)}, $2];
        }
        elsif ($line =~ m/^\s?[|^]/) {
            # table
            $mode = 'table';
            $buf  = [$line];
        }
        elsif ($line =~ m/^\s{2}\S.*/) {
            # code indented with two spaces
            $mode = 'code';
            $buf  = $line . "\n";
        }
        elsif ($line =~ m/^\s?(\S.*)/) {
            # nonempty line
            push @tree, [p => {}, $self->_parse_text($1)];
        }
        elsif ($line =~ m/^\s*$/) {
            # empty line
            push @tree, [br => {}];
        }
        else {
            # catchall
            warn sprintf("Unmatched % 3d: %s\n", $linenum, $line);
        }
    }

    # Emit whatever was still being collected when the input ended.
    push @tree, $self->_finish_pending($mode, $attrs, $buf);

    return [div => {}, @tree];
}

1;

__END__

[div => {class => 'block'},             # <div class="block">
    'Hello!',                           #   Hello!
    [strong => {}, 'user'],             #   <strong>user</strong>
    ', this is converted text.',        #   , this is converted text.
    [br => {}],                         #   <br />
    [p => {}, 'Second paragraph']       #   <p>Second paragraph</p>
]                                       # </div>