kettext/kettext.pm

#!/usr/bin/perl
use strict;
use warnings;
package kettext;
use constant VERSION => 0.2;
# create our parse "enum"
use constant {
  TYPE_HEADER => 1,
  TYPE_PARAGRAPH => 2,
  TYPE_PRE => 3,
  TYPE_RULE => 4,
  TYPE_SPACER => 5,
  TYPE_BREAK => 6,
  TYPE_CODE => 7,
  TYPE_BLOCKQUOTE => 8,
  TYPE_LIST => 9
};
# create our general data
# parseFile("my_file", %cmd_settings)
sub parseFile {
  my $file_name = $_[0];
  my %cmd_settings = ($_[1] ? %{$_[1]} : ());
  my @elements = ();    # our list of ALL distinct elements
  my %block = ();       # some block of data, such as paragraph, pre, etc.
  my %settings = ();
  # get dirname for includes
  my $dir_name = $file_name;
  $dir_name =~ s/\/(.*)$/\//g;
  if ($dir_name eq $file_name) {
    $dir_name = "./";
  }
  # create file handler and read into @lines
  binmode STDOUT, ":utf8";
  use utf8;
  open(my $fh, '<:encoding(UTF-8)', $file_name)
    or die "ERR: could not open file '$file_name' $!";
  chomp(my @lines = <$fh>); close($fh);
  # main logic: line reading
  for (my $i = 0; $i <= $#lines; $i++) {
    # check for imply rules on first line
    if ($i == 0 || $i == $#lines) {
      if ($lines[$i] =~ m/^(\..*\)(\s|))(.*)/) {
        if ($1 =~ m/^.*\((.*)\)/) {
          my @options = split(',', $1);
          foreach my $opt (@options) {
            my @value = split('=', $opt);
            $settings{$value[0]} = ($value[1] ? $value[1] : 1);
          }
        }
        next;
      }
    }
    # 1. check for indent-related lines (pre, code, list, etc.)
    if ($lines[$i] =~ m/^(\t|\s\s|\s\s\s\s)(.)(.*)/) {
      # code
      if ($2 eq '#') {
        if (!%block) {
          #if ($3 =~ m/^([^\s]*)(\..*\)(\s|))(.*)/) {
          if ($3 =~ m/^(\..*?\)(?:\s|\n|))(.*?)/) {
            %block = (type => TYPE_CODE, text => $2, opts => parseOptions($1));
            if (exists ${$block{'opts'}}{'include'}) {
              my $include = $dir_name.${$block{'opts'}}{'include'};
              if (open(my $fh, '<:encoding(UTF-8)', $include)) {
                while (my $row = <$fh>) {
                  chomp $row;
                  $block{'text'} .= "$row\n";
                }
              } else {
                print $file_name;
                $block{'text'} .= "Could not read ".$include."\n";
              }
              delete ${$block{'opts'}}{'include'};
            }
          } else {
            %block = (type => TYPE_CODE, text => $3);
          }
        } else {
          if ($block{"type"} != TYPE_CODE) {
            push @elements, {%block};
            if ($3 =~ m/^(\..*?\)(?:\s|\n|))(.*?)/) {
              %block = (type => TYPE_CODE, text => $3, opts => parseOptions($2));
              if (${$block{'opts'}}{'include'}) {
                if (open(my $fh, '<:encoding(UTF-8)', ${$block{'opts'}}{'include'})) {
                  while (my $row = <$fh>) {
                    chomp $row;
                    $block{'text'} .= "$row\n";
                  }
                } else {
                  $block{'text'} .= "Could not read ".${$block{'opts'}}{'include'}."\n";
                }
                delete ${$block{'opts'}}{'include'};
              }
            } else {
              %block = (type => TYPE_CODE, text => $3);
            }
          } else {
            $block{'text'} .= "\n".$3;
          }
        }
      # lists - this is ugly, expensive, and by far the most shameful of this code
      } elsif ($lines[$i] =~ m/^(\t{1,}|\s\s{1,})(\*\s|\~\s|\-\s|\+\s|.{1,3}\.\s)(.*)/) {
        # this is hackish - I would prefer to get # of matches from above
        my $depth = length($1) / (substr($1, 0, 1) eq ' ' ? 2 : 1);
        if (%block) {
          if ($block{"type"} != TYPE_LIST) {
            push @elements, {%block};
            %block = (type => TYPE_LIST, depth => 0, children => []);
          }
        } else {
          %block = (type => TYPE_LIST, depth => 0, children => []);
        }
        my %this_item = (type => TYPE_LIST, text => $3, depth => $depth, parent => \%block, children => []);
        my $child_count = @{$block{'children'}};
        if ($child_count == 0) {
          push @{$block{'children'}}, {%this_item};
        } else {
          my $root = \%block;
          my $root_children = \@{${$root}{'children'}};
          my $diff = $depth-${$root}{'depth'};
          # travel down the list until we get our proper context.
          while ($diff > 0) {
            $root_children = \@{${$root}{'children'}};
            if (@{${$root}{'children'}} <= 0) {
              push @{${$root}{'children'}}, {%this_item};
              $diff = 0;
            } else {
              # diff 1 = we have proper parent, 0 = we're one step too deep
              $root = $root_children->[-1];
              $diff = $depth-${$root}{'depth'};
              if ($diff == 1) {
                push @{${$root}{'children'}}, {%this_item};
                $diff = 0;
              } elsif ($diff == 0) {
                push @{${${$root}{'parent'}}{'children'}}, {%this_item};
              }
            }
          }
        }
      # preformatted (default)
      } else {
        if (!%block) {
          if ($2 eq '.' && $3 =~ m/^(.*\)(\s|))(.*)/) {
            %block = (type => TYPE_PRE, text => $3, opts => parseOptions('.'.$1));
            if (exists ${$block{'opts'}}{'include'}) {
              my $include = $dir_name.${$block{'opts'}}{'include'};
              if (open(my $fh, '<:encoding(UTF-8)', $include)) {
                while (my $row = <$fh>) {
                  chomp $row;
                  $block{'text'} .= "$row\n";
                }
              } else {
                print $file_name;
                $block{'text'} .= "Could not read ".$include."\n";
              }
              delete ${$block{'opts'}}{'include'};
            }
          } else {
            %block = (type => TYPE_PRE, text => $2.$3);
          }
        } else {
          if ($block{"type"} != TYPE_PRE) {
            push @elements, {%block}; %block=();
            if ($2 eq '.' && $3 =~ m/^(.*\)(\s|))(.*)/) {
              %block = (type => TYPE_PRE, text => $3, opts => parseOptions('.'.$1));
              if (exists ${$block{'opts'}}{'include'}) {
                my $include = $dir_name.${$block{'opts'}}{'include'};
                if (open(my $fh, '<:encoding(UTF-8)', $include)) {
                  while (my $row = <$fh>) {
                    chomp $row;
                    $block{'text'} .= "$row\n";
                  }
                } else {
                  print $file_name;
                  $block{'text'} .= "Could not read ".$include."\n";
                }
                delete ${$block{'opts'}}{'include'};
              }
            } else {
              %block = (type => TYPE_PRE, text => $2.$3);
            }
          } else {
            $block{'text'} .= "\n".$2.$3;
          }
        }
      }
      next;
    }
    # 2. check for big headers, e.g.,
    # ,,,,
    # header
    # ````
    #   conditions: first and third lines must have at least 4 repeating chars, and the second line must not match header conditions.
    if ($i+1 < $#lines) {
      if ($lines[$i] =~ m/(\S\S)\1{2,}/) {
        my $size = $+[0];
        # only check closing header if contents appear to not be a header
        if ($lines[$i+1] !~ m/(\S\S)\1{2,}/g) {
          # check closing line for header appearance
          if ($lines[$i+2] =~ m/(\S\S)\1{2,}/g) {
            if (%block) { push @elements, {%block}; %block=() }
            if ($lines[$i+1] =~ m/^(.*?)(\..*\))/) {
              push @elements, {
                type => TYPE_HEADER,
                size => $size,
                text => $1,
                opts => parseOptions($2)
              };
            } else {
              push @elements, {
                type => TYPE_HEADER,
                size => $size,
                text => $lines[$i+1]
              };
            }
            $i += 2;
            next;
          }
        }
      }
    }
    # 3. check for small headers
    if ($lines[$i] =~ m/^(\S)\1{2,}\s/) {
      my $size = $+[0];
      my $match = substr($lines[$i], $-[0], $+[0]-$-[0]);
      my $post = substr($lines[$i], $+[0]);
      # two conditions:
      #   match repeated $match if something else came between
      #     e.g., ==== header ====
      #   otherwise match if there is some text following
      #     e.g., ==== header
      if ($post =~ m/(\t*|\s*|)(.*)\Q$match\E$/g) {
        if (%block) { push @elements, {%block}; %block=() }
        if ($2 =~ m/^(.*?)(\..*\))/) {
          push @elements, {
            type => TYPE_HEADER,
            size => $size,
            text => $1,
            opts => parseOptions($2)
          };
        } else {
          push @elements, {
            type => TYPE_HEADER,
            size => $size,
            text => $2
          };
        }
        next;
      } elsif ($post =~ m/^(\t*|\s*|)(.*[^(\n|\r|\s|\t)])/g) {
        if (%block) { push @elements, {%block}; %block=() }
        if ($post =~ m/^(.*?)(\..*\))/) {
          push @elements, {
            type => TYPE_HEADER,
            size => $size,
            text => $1,
            opts => parseOptions($2)
          };
        } else {
          push @elements, {
            type => TYPE_HEADER,
            size => $size,
            text => $post
          };
        }
        next;
      }
    }
    # 4. check for spacers
    if ($lines[$i] =~ m/^(\S)\1{2,}/) {
      if (%block) { push @elements, {%block}; %block=() }
      push @elements, {
        type => TYPE_SPACER,
        size => $+[0]-$-[0]
      };
      next;
    }
    # discover blank lines - these usually signify end of some type of block
    if ($lines[$i] =~ m/^\s*$/) {
      if (%block) { push @elements, {%block}; %block=() }
      push @elements, {
        type => TYPE_BREAK
      };
      next;
    }
    # check for blockquotes
    if ($lines[$i] =~ m/^\>(.*)$/) {
      if (!%block) {
        if ($1 =~ m/^(\..*\)(\s|))(.*)/) {
          %block = (type => TYPE_BLOCKQUOTE, text => $2, opts => parseOptions($1));
        } else {
          %block = (type => TYPE_BLOCKQUOTE, text => $1);
        }
      } else {
        if ($block{'type'} != TYPE_BLOCKQUOTE) {
          push @elements, {%block};
          if ($1 =~ m/^(\..*\)(\s|))(.*)/) {
            %block = (type => TYPE_BLOCKQUOTE, text => $2, opts => parseOptions($1));
          } else {
            %block = (type => TYPE_BLOCKQUOTE, text => $1);
          }
        } else {
          $block{'text'} .= "\n".$1;
        }
      }
      next;
    }
    # finally, create a paragraph or append to it.
    if (!%block) {
      if ($lines[$i] =~ m/^(\..*\)(\s|))(.*)/) {
        %block = (type => TYPE_PARAGRAPH, text => $2, opts => parseOptions($1));
        if (${$block{'opts'}}{'include'}) {
          my %sub_file = kettext::parseFile($dir_name.${$block{'opts'}}{'include'}, \%cmd_settings);
          for my $i (0 .. $#{$sub_file{'elements'}}) {
            push @elements, ${$sub_file{'elements'}}[$i];
          }
          delete ${$block{'opts'}}{'include'};
        }
      } else {
        %block = (type => TYPE_PARAGRAPH, text => $lines[$i]);
      }
    } else {
      $block{'text'} .= "\n".$lines[$i];
    }
  }
  # push final block if it exists
  if (%block) { push @elements, {%block}; %block=() }
  # merge file settings with command-line settings
  @settings{ keys %cmd_settings } = values %cmd_settings;
  # get our various header sizes and organize an array with unique values from smallest to largest.
  my @headers;
  foreach (@elements) {
    my $element = $_;
    if ($element->{type} == TYPE_HEADER) {
      if (@headers) {
        my $h_i = 0;
        my $last_size = $headers[0];
        for (my $h_i = 0; $h_i < scalar @headers; $h_i++) {
          if ($element->{size} == $headers[$h_i]) {
            $h_i = scalar @headers;
          } elsif ($element->{size} < $headers[$h_i]) {
            splice @headers, $h_i, 0, $element->{size};
            $h_i = scalar @headers;
          } elsif ($element->{size} > $headers[$h_i]) {
            if ($h_i+1 >= scalar @headers) {
              splice @headers, $h_i+1, 0, $element->{size};
              $h_i = scalar @headers;
            }
          }
        }
      } else {
        $headers[0] = $element->{size};
      }
    }
  }
  # this is really dumb, but convert our array to a hash
  my %header_map;
  for (my $h_i = 0; $h_i < scalar @headers; $h_i++) {
    if ($settings{'header.reverse'}) {
      $header_map{$headers[$h_i]} = $h_i+1+($settings{'header.depth'} ? $settings{'header.depth'} : 0);
    } else {
      $header_map{$headers[$h_i]} = scalar ($settings{'header.depth'} ? $settings{'header.depth'} : 0) + @headers - $h_i;
    }
  }

  # set up our array for header numbering. The first element
  my @header_numbers;
  for (my $h_i = 0; $h_i < scalar @headers; $h_i++) {
    $header_numbers[$h_i] = 0;
  }
  my $header_number = 0; # at which numbering depth are we
  # prepend to headers
  foreach (@elements) {
    if ($_->{type} == TYPE_HEADER) {
      my $hsize = $header_map{$_->{size}};
      $_->{hsize} = $hsize;
      $header_number = $hsize;
      for (my $h_i = $header_number+1; $h_i < scalar @headers; $h_i++) {
        $header_numbers[$h_i] = 0;
      }
      $header_numbers[$header_number]++;
      # prepend an ordered number to headers based on depth
      if ($settings{"header.numbering"}) {
        $_->{text} = $header_numbers[$header_number].'. '.$_->{text};
      }
    }
  }
  my %ret = (
    "elements" => \@elements,
    "settings" => \%settings,
    "filename" => $file_name
  );
  return %ret;
}
sub parseOptions {
  my $text = $_[0];
  my %opts = ();
  while ($text =~ m/(\.|)([a-zA-Z]*[^\(])(\()([^\)]*)/g) {
    $opts{$2} = $4;
  }
  return \%opts;
}

package kettext::convertTo;
sub HTML {
  my %data = @_;
  my @elements = @{$data{elements}};
  my %settings = %{$data{settings}};
  my $file_name = $data{filename};

  our @prefixes = (["img", "image"], ["aud", "audio"], ["vid", "video"], ["yt", "youtube"], ["vimeo"], ["twitter", "tweet"]);
  our @urls = ([], [], [], ["youtube.com", "youtu.be"], ["vimeo.com"], ["twitter.com"]);
  our @url_parse = (
    "(.*)",
    "",
    "",
    "(?:.*)(?:v=)([a-zA-Z0-9]*)",
    "(?:.*)(?:/)([0-9)*)\$",
    "");
  our @embeds = (
    ['"<img src=\"$1\">"',''],
    ['"<audio src=\"$1\" controls>"','</audio>'],
    ['"<video src=\"$1\" controls>"','</video>'],
    ['"<iframe src=\"//www.youtube.com/embed/$1\" frameborder=\"0\" allowfullscreen>"', '</iframe>'],
    ['"<iframe src=\"//player.vimeo.com/video/$1\" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen>"', '</iframe>'],
    ["", ""]);

  # print 'em out
  if ($settings{'toc'}) {
    foreach (@elements) {
      if ($_->{type} == kettext::TYPE_HEADER) {
        # TODO: collect and organize TYPE_HEADER elements into TYPE_LIST element. Use header IDs and surround in anchors appropriately.
      }
    }
  } else {
    my $previous_type = 0;
    my $indent = ("\t" x ($settings{"indent_level"} ? $settings{"indent_level"} : 0));
    foreach (@elements) {
      if ($_->{type} == kettext::TYPE_HEADER) {
        my $hid = $_->{text};
        $hid =~ s/ /_/g;
        print("<h$_->{hsize}".($_->{opts} ? convertOpts($_->{opts}) : '').($settings{'header.ids'} ? " id=\"".$hid."\"" : '').">".convertText($_->{text})."</h$_->{hsize}>\n");
      } elsif ($_->{type} == kettext::TYPE_BREAK) {
        if ($previous_type == kettext::TYPE_BREAK) {
          print("<br />\n");
        }
      } elsif ($_->{type} == kettext::TYPE_PARAGRAPH) {
        my $text = convertText($_->{text});
        $text =~ s/\n/<br>\n/g;
        print("<p".($_->{opts} ? convertOpts($_->{opts}) : '').">$text</p>\n");
      } elsif ($_->{type} == kettext::TYPE_PRE) {
        print("<pre".($_->{opts} ? convertOpts($_->{opts}) : '').">$_->{text}</pre>\n");
      } elsif ($_->{type} == kettext::TYPE_CODE) {
        print("<pre><code".($_->{opts} ? convertOpts($_->{opts}) : '').">$_->{text}</code></pre>\n");
      } elsif ($_->{type} == kettext::TYPE_BLOCKQUOTE) {
        print("<blockquote".($_->{opts} ? convertOpts($_->{opts}) : '').">".($settings{'blockquote.parse'} ? convertText($_->{text}) : $_->{text})."</blockquote>\n");
      } elsif ($_->{type} == kettext::TYPE_SPACER) {
        print("<hr />\n");
      } elsif ($_->{type} == kettext::TYPE_LIST) {
        printf("<p></p>"); # @@ should this be optional?
        printList($_);
      }
      $previous_type = $_->{type};
    }
  }
  if (!$settings{"version.none"}) {
    if ($settings{"version.hide"}) {
      print "<!-- $file_name generated by kettext ".kettext::VERSION." -->\n";
    } else {
      print "<small>$file_name generated by kettext ".kettext::VERSION."</small>\n";
    }
  }

  sub printList {
    my $list = $_[0];
    my $child_count = @{$list->{'children'}};
    #my $child = $children->[$child_i];
    if ($list->{'text'}) {
      print "<li>".convertText($list->{'text'});
    }
    if ($child_count > 0) {
      print "<ul>\n";
      my $children = \@{$list->{'children'}};
      my $child_i = 0;
      for (my $child = $children->[$child_i]; $child_i < $child_count; $child = $children->[++$child_i]) {
        printList($child);
      }
      print "</ul>\n";
    }
    if ($list->{'text'}) {
      print "</li>\n";
    }
  }
  sub convertText {
    my $text = $_[0];
    my @text_patterns = ('\*\*','\*\*', '\/\/','\/\/', '\[\[','\]\]', '\(\(','\)\)', '``', '``', '__','__');
    my @default_opts = ("", "", "href", "", "", "");
    my @text_replace = ('<b>','</b>', '<i>','</i>', '<a>','</a>', '<small>','</small>', '<code>','</code>', '<u>','</u>');
    my @char_patterns = ('&', '<', '>', '"', '\'', '--');
    my @char_replace = ('&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&mdash;');
    my $i = 0;
    my $new_text = $text;
    # 1. replace character(s)
    #for (my $ch = 0; $ch <= $#char_patterns; $ch++) {
    #  if ($new_text =~ m/(([^\\])$char_patterns[$ch])/g) {
    #    my $bzr = $2;
    #    $new_text =~ s/\Q$bzr$char_patterns[$ch]\E/\Q$bzr\E$char_replace[$ch]/g;
    #  }
    #  if ($new_text =~ m/(\\)($char_patterns[$ch])/g) {
    #    $new_text =~ s/\\$char_patterns[$ch]/$char_patterns[$ch]/g;
    #  }
    #}
    # 2. replace character pairs
    for (my $p = 0; $p <= $#text_patterns; $p+=2) {
      my $open = $text_patterns[$p];
      my $close = $text_patterns[$p+1];
      #while ($new_text =~ m/($open)([^$close]*)($close)(\.\S*\(.*?\)|)/g) {
      #while ($new_text =~ m/($open)([^$close]*)($close)((?:\()(.*?)(?:\))|)(\.\S*\(.*?\)|)/g) {
      while ($new_text =~ m/(?:\s|)($open)(.*?)($close)((?:\()(.*?)(?:\))|)(\.\S*\(.*?\)|)/g) {
        my $converted_text;
        if ($p == 4) {
          my @media = convertLink($2);
          my $ts = substr($media[0], 0, -1);
          my $te = substr($media[0], -1);
          my $opts = kettext::parseOptions($6);
          if ($5) {
            $opts->{$default_opts[$i]} = $5;
          }
          # FIXME: dirty hack to allow [[img:image]](url) or [[img:image]].href(url) to expand to <a href="url"><img src="image"></a>
          if ($opts->{'href'} && !($media[0] eq "<a>")) {
            $converted_text = "<a href=\"".$opts->{'href'}."\">";
            delete $opts->{'href'};
            $converted_text .= $ts.convertOpts($opts).$te.$media[1];
            $converted_text .= "</a>";
            $new_text =~ s/\Q$1$2$3$4$6\E/$converted_text/g;
          } else {
            $converted_text = $ts.convertOpts($opts).$te.$media[1];
            $new_text =~ s/\Q$1$2$3$4$6\E/$converted_text/g;
          }
        } else {
          my $ts = substr($text_replace[$p], 0, -1);
          my $te = substr($text_replace[$p], -1);
          my $opts = kettext::parseOptions($6);
          if ($5) {
            $opts->{$default_opts[$i]} = $5;
          }
          $converted_text = $ts.convertOpts($opts).$te.$2.$text_replace[$p+1];
          $new_text =~ s/\Q$1$2$3$4$6\E/$converted_text/g;
        }
      }
      $i++;
    }
    return $new_text;
  }
  sub convertLink {
    my $text = $_[0];
    my $find = $text;
    # 1. check for prefix such as "image:", "audio:", etc.
    if ($find =~ m/^(.*?)(?::)(.*)$/) {
      for (my $prefix_group = 0; $prefix_group <= $#prefixes; $prefix_group++) {
        for (my $prefix = 0; $prefix <= $#{${prefixes[$prefix_group]}}; $prefix++) {
          if ($1 eq ${prefixes[$prefix_group]}[$prefix]) {
            my @media = urlToMedia($2); #"prefix:http://..."
            if ($media[0] eq "") { # "prefix:data"
              $media[0] = $2;
              $media[0] =~ s/(.*)/$embeds[$prefix_group][0]/ee;
              $media[1] = $embeds[$prefix_group][1];
            }
            return @media;
          }
        }
      }
    }
    sub convertOpts {
      my $opts = $_[0];
      my $string = "";
      keys %{$opts};
      while (my ($key, $value) = each %{$opts}) {
        $string .= " $key=\"$value\"";
      }
      return $string;
    }
    # 2. check for (http(s)://)(www.)youtube/youtu.be/vimeo/etc.
    my @media = urlToMedia($text);
    if (!$media[0] eq "") { #"http://..."
      return @media;
    }
    # 3. Finally, check file extension

    # 4. Just an anchor
    return ("<a>","$text</a>");
  }
  sub urlToMedia {
    my $find = $_[0];
    if ($find =~ m/^(?:http(?:s|):\/\/(?:www.|)|)(.*)\/(.*)/) {
      for (my $urls_i = 0; $urls_i <= $#urls; $urls_i++) {
        for (my $url_i = 0; $url_i <= $#{${urls[$urls_i]}}; $url_i++) {
          if ($1 eq ${urls[$urls_i]}[$url_i]) {
            my $match = $2;
            $match =~ s/$url_parse[$urls_i]/$embeds[$urls_i][0]/ee; # the ee is qqing
            return ($match,$embeds[$urls_i][1]);
          }
        }
      }
    }
    return ("","");
  }
}
1;