You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

304 lines
8.4 KiB

package Text::Dokuwiki::Parser;
use strict;
use warnings;
use utf8;
use Text::Dokuwiki::Regexps;
sub new {
my ($class) = @_;
my $self = {
footnotes => [],
};
return bless($self, $class);
}
sub _parse_list {
my ($self, $lines) = @_;
my @lists = ();
my @stack = ( \@lists );
my $types = {curr => '', last => ''};
my $level = {curr => 0, last => 0};
foreach my $line (@{ $lines }) {
$line =~ m/^(\s+)/o;
my ($ident, $dot, $rest) = ($line =~ m/^((?:\s{2})+)([\*-])\s*(.+)/);
$level->{last} = $level->{curr};
$level->{curr} = $ident =~ tr/ / /;
$types->{last} = $types->{curr};
$types->{curr} = ($dot eq '-') ? 'ol' : 'ul';
if ($level->{curr} == $level->{last} and
$types->{curr} ne $types->{last}) {
pop @stack;
my $list = [$types->{curr} => {}];
push @{ $stack[-1] }, (@stack > 1) ? [li => {}, $list] : $list;
push @stack, $list;
}
if ($level->{curr} > $level->{last}) {
my $list = [$types->{curr} => {}];
push @{ $stack[-1] }, (@stack > 1) ? [li => {}, $list] : $list;
push @stack, $list;
}
if ($level->{curr} < $level->{last}) {
pop @stack;
}
push @{ $stack[-1] }, [li => {}, $self->_parse_text($rest)];
}
pop @stack while @stack;
return @lists;
}
sub _parse_table {
my ($self, $lines) = @_;
my ($i, $j, $colspan) = (0, 0, 0);
my @rows = ();
foreach my $line (@{ $lines }) {
my @row = ();
for ($j = 0; ;$j++) {
last if ($line eq '|' or $line eq '^');
next if ($line !~ m/^(([\|\^])([^\|\^]*))/o);
my $attrs = {};
my ($all, $key, $value) = ($1, $2, $3);
$line = substr($line, length($all));
my $type = ($key eq '^') ? 'th' : 'td';
my ($lpad, $content, $rpad) = ($value =~ m/^(\s*)(.*?)(\s*)$/o);
if ($line =~ m/^([\|\^]{2,})/o) {
# colspan detected;
$attrs->{colspan} = $colspan = length($1);
} elsif (index($content, ":::") >= 0) {
# rowspan detected
for (my $k = $i - 1; $k >= 0; $k--) { # k is idx of prev rows
my $cellptr = $rows[$k][$j + 2]; # +2 for (tr => {})
next if (index($cellptr->[2], ":::") >= 0); # also rowspan
$cellptr->[1]->{rowspan} //= 1; # init attr, if missing
$cellptr->[1]->{rowspan} += 1; # incr value
last;
}
$attrs->{skip} = 1;
} elsif ($colspan > 1) {
# colspan-eaten column
$attrs->{skip} = 1;
$colspan -= 1;
}
if ($lpad eq '') {
$attrs->{align} = ($rpad eq '') ? 'center' : 'left';
} else {
$attrs->{align} = ($rpad eq '') ? 'right' : 'center';
}
push @row, [$type => $attrs, $content];
}
push @rows, [tr => {}, @row];
$i++;
}
return [table => {}, @rows];
}
sub _parse_link {
my ($self, $content) = @_;
my ($href, @text);
if ((my $pos = index($content, "|")) >= 0) {
$href = substr($content, 0, $pos);
@text = $self->_parse_text(substr($content, $pos + 1));
} else {
$href = $content;
@text = ($content);
}
unless ($href =~ m!^[a-z]+://!io) {
$href =~ s{:}{/}go;
}
return [a => {href => $href}, @text];
}
sub _parse_include {
my ($self, $content) = @_;
$content =~ s/^{{//o;
$content =~ s/}}$//o;
if ($content =~ m|^([a-z]+)>(.*)|oi) {
warn "ignored: `$content`, unimplemented\n";
return;
}
my ($lpad, $rpad, $src, %attrs) = ('', '', '');
if ($content =~ m!^(\s*)([^\s\|]+)(\s*)[|](.*)!oi) {
($lpad, $src, $rpad) = ($1, $2, $3);
$attrs{title} = $4;
} elsif ($content =~ m!^(\s*)(\S+)(\s*)!) {
($lpad, $src, $rpad) = ($1, $2, $3);
} else {
($src) = ($content =~ s/^(.+)/$1/or);
}
$attrs{align} = ($lpad ne '')
? (($rpad ne '') ? 'center' : 'right')
: 'left';
$src =~ s<\?(\d+)(?:x(\d+))?>
<
$attrs{width} = $1 if $1;
$attrs{height} = $2 if $2;
''
>ioe;
$src =~ s|:|/|go;
$attrs{src} = $src;
return [img => \%attrs];
}
my $inlines = {
"__" => "__", # underline
"//" => "//", # italic
"**" => "**", # bold
"''" => "''", # monospace
"{{" => "}}", # module/image
"[[" => "]]", # link
"((" => "))", # footnote
};
sub _parse_text {
my ($self, $line) = @_;
my ($endtag, $endpos, $content);
my @parts = ();
while ($line) {
$line =~ m!^(?:(.*?)(__|//|''|\*\*|\[\[|\{\{|\(\(|\\\\ ))?(.*)!o;
my ($before, $match, $after) = ($1, $2, $3);
if ($before) {
push @parts, $before;
$line = $match . $after;
next;
}
if ($after and not $match) {
push @parts, $after;
last;
}
if ($match eq '\\\\ ') { # force newline
push @parts, [br => {}];
$line = $after;
next;
} elsif ($match) {
$endtag = $inlines->{$match};
$endpos = index($after, $endtag, 0);
if ($endpos < 0) { # no closing marker?
push @parts, $match;
$line = $after;
next;
}
$content = substr($after, 0, $endpos);
$line = substr($after, $endpos + 2);
}
my %attrs;
if ($match eq "//") { %attrs = ('font-style' => 'italic'); }
elsif ($match eq "**") { %attrs = ('font-weight' => 'bold'); }
elsif ($match eq "''") { %attrs = ('font-family' => 'monospace'); }
elsif ($match eq "__") { %attrs = ('text-decoration' => 'underline'); }
elsif ($match eq "{{") { push @parts, $self->_parse_include($content); next; }
elsif ($match eq "[[") { push @parts, $self->_parse_link($content); next; }
elsif ($match eq "((") {
push @{ $self->{footnotes} }, $self->_parse_text($content);
my $n = scalar @{ $self->{footnotes} };
push @parts, [sup => {}, [a => {href => "#note_$n"}, $n]];
next;
}
elsif ($match) { die("unrecognized inline: $match\n"); }
if (%attrs) {
push @parts, [span => \%attrs, $self->_parse_text($content)];
next;
}
}
return @parts;
}
sub parse {
my ($self, $text) = @_;
my $rx = $Text::Dokuwiki::Regexps::regexps;
my (@tree);
my $len = length($text);
pos($text) = 0;
while ($len - pos($text) > 0) {
my $pos = pos($text);
if (0) { # TODO: MACRO
} elsif ($text =~ m/\G$rx->{header}/cgi) {
my $level = length($+{line});
$level = 7 - $level; # invert
push @tree, ["h$level" => {}, $+{header}];
} elsif ($text =~ m/\G$rx->{codeblock}/cgi) {
if ($+{tag} eq 'file' and $+{filename}) {
my $attrs = $+{syntax} ? {class => $+{syntax}} : {};
my $dt = [dt => {}, $+{filename}];
my $dd = [dt => {}, [pre => $attrs, $+{block}]];
push @tree, [dl => {class => 'file'}, [$dt, $dd]];
} elsif ($+{tag} eq 'file') {
push @tree, [pre => {class => 'file'}, $+{block}];
} else {
my $attrs = $+{syntax} ? {class => $+{syntax}} : {};
push @tree, [code => $attrs, [pre => {}, $+{block}]];
}
} elsif ($text =~ m/\G$rx->{table}/cgi) {
my $lines = $+{table};
chomp $lines;
push @tree, $self->_parse_table([ split(/\r?\n/, $lines) ]);
} elsif ($text =~ m/\G$rx->{list}/cgi) {
my $lines = $+{list};
chomp $lines;
push @tree, $self->_parse_list([ split(/\r?\n/, $lines) ]);
} elsif ($text =~ m/\G$rx->{blockquote}/cgi) {
push @tree, [blockquote => {}, $+{block}];
} elsif ($text =~ m/\G$rx->{pre}/cgi) {
push @tree, [pre => {}, $+{block}];
} elsif ($text =~ m/\G$rx->{paragraph}/cgi) {
my $text = $+{text};
chomp $text;
push @tree, [p => {}, $self->_parse_text($text)];
} elsif ($text =~ m/\G$rx->{emptyline}/cgi) {
next;
} else {
my $msg = "Unmatched: '" . substr($text, pos($text)) . "'\n";
die($msg);
}
if (pos($text) <= $pos) {
die("parser failed, abort\n");
}
}
if (scalar @{ $self->{footnotes} }) {
my @fn = @{ $self->{footnotes} };
my @out = ();
for (my $i = 0; $i <= $#fn; $i++) {
my $name = sprintf "note_%d", $i + 1;
push @out, [li => {}, [a => {name => $name}], $fn[$i]];
}
push @tree, [ol => {}, @out];
}
return [div => {}, @tree ];
}
1;
__END__
[div => {class => 'block'}, # <div class='block'>
'Hello!', # Hello!
[strong => {}, 'user'], # <strong>user</strong>
', this is converted text.', # , this is converted text.
[br => {}], # <br/>
[p => {}, 'Second paragraph'] # <p>Second paragraph</p>
] # </div>