libtext-dokuwiki-perl/lib/Text/Dokuwiki/Parser.pm

package Text::Dokuwiki::Parser;

use strict;
use warnings;
use utf8;

use Text::Dokuwiki::Regexps;

sub new {
  my ($class) = @_;
  my $self = {
    footnotes => [],
  };

  return bless($self, $class);
}

sub _parse_list {
  my ($self, $lines) = @_;
  my @lists = ();
  my @stack = ( \@lists );
  my $types = {curr => '', last => ''};
  my $level = {curr => 0,  last => 0};

  foreach my $line (@{ $lines }) {
    $line =~ m/^(\s+)/o;
    my ($ident, $dot, $rest) = ($line =~ m/^((?:\s{2})+)([\*-])\s*(.+)/);
    $level->{last} = $level->{curr};
    $level->{curr} = $ident =~ tr/ / /;

    $types->{last} = $types->{curr};
    $types->{curr} = ($dot eq '-') ? 'ol' : 'ul';

    if ($level->{curr} == $level->{last} and
        $types->{curr} ne $types->{last}) {
      pop @stack;
      my $list = [$types->{curr} => {}];
      push @{ $stack[-1] }, (@stack > 1) ? [li => {}, $list] : $list;
      push @stack, $list;
    }
    if ($level->{curr} >  $level->{last}) {
      my $list = [$types->{curr} => {}];
      push @{ $stack[-1] }, (@stack > 1) ? [li => {}, $list] : $list;
      push @stack, $list;
    }
    if ($level->{curr} <  $level->{last}) {
      pop @stack;
    }
    push @{ $stack[-1] }, [li => {}, $self->_parse_text($rest)];
  }
  pop @stack while @stack;

  return @lists;
}

sub _parse_table {
  my ($self, $lines) = @_;
  my ($i, $j, $colspan) = (0, 0, 0);
  my @rows = ();

  foreach my $line (@{ $lines }) {
    my @row = ();
    for ($j = 0; ;$j++) {
      last if ($line eq '|' or $line eq '^');
      next if ($line !~ m/^(([\|\^])([^\|\^]*))/o);
      my $attrs = {};
      my ($all, $key, $value) = ($1, $2, $3);
      $line = substr($line, length($all));
      my $type = ($key eq '^') ? 'th' : 'td';
      my ($lpad, $content, $rpad) = ($value =~ m/^(\s*)(.*?)(\s*)$/o);
      if ($line =~ m/^([\|\^]{2,})/o) {
        # colspan detected;
        $attrs->{colspan} = $colspan = length($1);
      } elsif (index($content, ":::") >= 0) {
        # rowspan detected
        for (my $k = $i - 1; $k >= 0; $k--) {         # k is idx of prev rows
          my $cellptr = $rows[$k][$j + 2];            # +2 for (tr => {})
          next if (index($cellptr->[2], ":::") >= 0); # also rowspan
          $cellptr->[1]->{rowspan} //= 1;             # init attr, if missing
          $cellptr->[1]->{rowspan}  += 1;             # incr value
          last;
        }
        $attrs->{skip} = 1;
      } elsif ($colspan > 1) {
        # colspan-eaten column
        $attrs->{skip} = 1;
        $colspan -= 1;
      }
      if ($lpad eq '') {
        $attrs->{align} = ($rpad eq '') ? 'center' : 'left';
      } else {
        $attrs->{align} = ($rpad eq '') ? 'right' : 'center';
      }
      push @row, [$type => $attrs, $content];
    }
    push @rows, [tr => {}, @row];
    $i++;
  }

  return [table => {}, @rows];
}

sub _parse_link {
  my ($self, $content) = @_;
  my ($href, @text);

  if ((my $pos = index($content, "|")) >= 0) {
    $href = substr($content, 0, $pos);
    @text = $self->_parse_text(substr($content, $pos + 1));
  } else {
    $href = $content;
    @text = ($content);
  }

  unless ($href =~ m!^[a-z]+://!io) {
    $href =~ s{:}{/}go;
  }

  return [a => {href => $href}, @text];
}

sub _parse_include {
  my ($self, $content) = @_;

  $content =~ s/^{{//o;
  $content =~ s/}}$//o;

  if ($content =~ m|^([a-z]+)>(.*)|oi) {
    warn "ignored: `$content`, unimplemented\n";
    return;
  }

  my ($lpad, $rpad, $src, %attrs) = ('', '', '');
  if ($content =~ m!^(\s*)([^\s\|]+)(\s*)[|](.*)!oi) {
    ($lpad, $src, $rpad) = ($1, $2, $3);
    $attrs{title} = $4;
  } elsif ($content =~ m!^(\s*)(\S+)(\s*)!) {
    ($lpad, $src, $rpad) = ($1, $2, $3);
  } else {
    ($src) = ($content =~ s/^(.+)/$1/or);
  }

  $attrs{align} = ($lpad ne '')
    ? (($rpad ne '') ? 'center' : 'right')
    : 'left';

  $src =~ s<\?(\d+)(?:x(\d+))?>
  <
    $attrs{width}  = $1 if $1;
    $attrs{height} = $2 if $2;
    ''
  >ioe;

  $src =~ s|:|/|go;
  $attrs{src} = $src;

  return [img => \%attrs];
}

my $inlines = {
  "__" => "__", # underline
  "//" => "//", # italic
  "**" => "**", # bold
  "''" => "''", # monospace
  "{{" => "}}", # module/image
  "[[" => "]]", # link
  "((" => "))", # footnote
};
sub _parse_text {
  my ($self, $line) = @_;
  my ($endtag, $endpos, $content);
  my @parts = ();

  while ($line) {
    $line =~ m!^(?:(.*?)(__|//|''|\*\*|\[\[|\{\{|\(\(|\\\\ ))?(.*)!o;
    my ($before, $match, $after) = ($1, $2, $3);
    if ($before) {
      push @parts, $before;
      $line = $match . $after;
      next;
    }
    if ($after and not $match) {
      push @parts, $after;
      last;
    }

    if ($match eq '\\\\ ') { # force newline
      push @parts, [br => {}];
      $line = $after;
      next;
    } elsif ($match) {
      $endtag  = $inlines->{$match};
      $endpos  = index($after, $endtag, 0);
      if ($endpos < 0) { # no closing marker?
        push @parts, $match;
        $line = $after;
        next;
      }
      $content = substr($after, 0, $endpos);
      $line = substr($after, $endpos + 2);
    }

    my %attrs;
       if ($match eq "//") { %attrs = ('font-style'      => 'italic');    }
    elsif ($match eq "**") { %attrs = ('font-weight'     => 'bold');      }
    elsif ($match eq "''") { %attrs = ('font-family'     => 'monospace'); }
    elsif ($match eq "__") { %attrs = ('text-decoration' => 'underline'); }
    elsif ($match eq "{{") { push @parts, $self->_parse_include($content); next; }
    elsif ($match eq "[[") { push @parts, $self->_parse_link($content); next; }
    elsif ($match eq "((") {
      push @{ $self->{footnotes} }, $self->_parse_text($content);
      my $n = scalar @{ $self->{footnotes} };
      push @parts, [sup => {}, [a => {href => "#note_$n"}, $n]];
      next;
    }
    elsif ($match) { die("unrecognized inline: $match\n"); }

    if (%attrs) {
      push @parts, [span => \%attrs, $self->_parse_text($content)];
      next;
    }
  }

  return @parts;
}

sub parse {
  my ($self, $text) = @_;

  my $rx = $Text::Dokuwiki::Regexps::regexps;
  my (@tree);
  my $len = length($text);
  pos($text) = 0;

  while ($len - pos($text) > 0) {
    my $pos = pos($text);
    if (0) { # TODO: MACRO
    } elsif ($text =~ m/\G$rx->{header}/cgi) {
      my $level = length($+{line});
      $level = 7 - $level; # invert
      push @tree, ["h$level" => {}, $+{header}];
    } elsif ($text =~ m/\G$rx->{codeblock}/cgi) {
      if ($+{tag} eq 'file' and $+{filename}) {
        my $attrs = $+{syntax} ? {class => $+{syntax}} : {};
        my $dt = [dt => {}, $+{filename}];
        my $dd = [dt => {}, [pre => $attrs, $+{block}]];
        push @tree, [dl => {class => 'file'}, [$dt, $dd]];
      } elsif ($+{tag} eq 'file') {
        push @tree, [pre => {class => 'file'}, $+{block}];
      } else {
        my $attrs = $+{syntax} ? {class => $+{syntax}} : {};
        push @tree, [code => $attrs, [pre => {}, $+{block}]];
      }
    } elsif ($text =~ m/\G$rx->{table}/cgi) {
      my $lines = $+{table};
      chomp $lines;
      push @tree, $self->_parse_table([ split(/\r?\n/, $lines) ]);
    } elsif ($text =~ m/\G$rx->{list}/cgi) {
      my $lines = $+{list};
      chomp $lines;
      push @tree, $self->_parse_list([ split(/\r?\n/, $lines) ]);
    } elsif ($text =~ m/\G$rx->{blockquote}/cgi) {
      push @tree, [blockquote => {}, $+{block}];
    } elsif ($text =~ m/\G$rx->{pre}/cgi) {
      push @tree, [pre => {}, $+{block}];
    } elsif ($text =~ m/\G$rx->{paragraph}/cgi) {
      my $text = $+{text};
      chomp $text;
      push @tree, [p => {}, $self->_parse_text($text)];
    } elsif ($text =~ m/\G$rx->{emptyline}/cgi) {
      next;
    } else {
      my $msg = "Unmatched: '" . substr($text, pos($text)) . "'\n";
      die($msg);
    }
    if (pos($text) <= $pos) {
      die("parser failed, abort\n");
    }
  }

  if (scalar @{ $self->{footnotes} }) {
    my @fn  = @{ $self->{footnotes} };
    my @out = ();
    for (my $i = 0; $i <= $#fn; $i++) {
      my $name = sprintf "note_%d", $i + 1;
      push @out, [li => {}, [a => {name => $name}], $fn[$i]];
    }
    push @tree, [ol => {}, @out];
  }

  return [div => {}, @tree ];
}

1;

__END__

[div => {class => 'block'},           # <div class='block'>
  'Hello!',                           #   Hello!
  [strong => {}, 'user'],             #   <strong>user</strong>
  ', this is converted text.',        #   , this is converted text.
  [br => {}],                         #   <br/>
  [p => {}, 'Second paragraph']       #   <p>Second paragraph</p>
]                                     # </div>