kettext/kettext.pl

#!/usr/bin/perl
use strict;
use warnings;
#
my %cmd_settings = ();
# arg check / usage
if ($#ARGV+1 == 0) {
    printf "Usage: kettext.pl file_name.ktx\n";
    exit;
}
my $file_name;

for (my $arg_i = 0; $arg_i < $#ARGV+1; $arg_i++) {
  if ($ARGV[$arg_i] =~ m/^-(-|)(.*)/) {
    if ($2 =~ m/^(h(elp|))$/) {
      printf "Usage: kettext.pl file_name.ktx\n";
      exit;
    } else {
      if ($2 =~ m/^(.[^\=]*)(\=|)(.*|)/) {
        $cmd_settings{$1} = (length($3) ? $3 : 1);
      }
    }
  } else {
    $file_name = $ARGV[$arg_i];
  }
}
# create file handler
binmode STDOUT, ":utf8";
use utf8;
open(my $fh, '<:encoding(UTF-8)', $file_name)
  or die "ERR: could not open file '$file_name' $!";
# create our parse "enum"
use constant {
  PARSE_HEADER_BIG => 1 << 1,
  PARSE_HEADER_SMALL => 1 << 2,
  PARSE_HEADER_BOTH => 1 << 1 | 1 << 2,
};
use constant {
  TYPE_HEADER => 1,
  TYPE_PARAGRAPH => 2,
  TYPE_PRE => 3,
  TYPE_RULE => 4,
  TYPE_SPACER => 5,
  TYPE_BREAK => 6,
  TYPE_CODE => 7,
  TYPE_BLOCKQUOTE => 8,
  TYPE_LIST => 9
};
# create our general data
my %sections;
my @elements = ();    # our list of ALL distinct elements
my %block = ();       # some block of data, such as paragraph, pre, etc.
my %settings = ();
# main logic: line reading
chomp(my @lines = <$fh>); close($fh);
for (my $i = 0; $i <= $#lines; $i++) {
  # check for imply rules on first line
  if ($i == 0 || $i == $#lines) {
    if ($lines[$i] =~ m/^(\..*\)(\s|))(.*)/) {
      if ($1 =~ m/^.*\((.*)\)/) {
        my @options = split(',', $1);
        foreach my $opt (@options) {
          my @value = split('=', $opt);
          $settings{$value[0]} = ($value[1] ? $value[1] : 1);
        }
      }
      next;
    }
  }
  # 1. check for indent-related lines (pre, code, list, etc.)
  if ($lines[$i] =~ m/^(\t|\s\s|\s\s\s\s)(.)(.*)/) {
    # code
    if ($2 eq '#') {
      if (!%block) {
        if ($3 =~ m/^([^\s]*)(\..*\)(\s|))(.*)/) {
          %block = (type => TYPE_CODE, text => $3, opts => parseOptions($2));
        } else {
          %block = (type => TYPE_CODE, text => $3);
        }
      } else {
        if ($block{"type"} != TYPE_CODE) {
          push @elements, {%block};
          if ($3 =~ m/^([^\s*])(\..*\)(\s|))(.*)/) {
            %block = (type => TYPE_CODE, text => $3, opts => parseOptions($2));
          } else {
            %block = (type => TYPE_CODE, text => $3);
          }
        } else {
          $block{'text'} .= "\n".$3;
        }
      }
    # lists - this is ugly, expensive, and by far the most shameful of this code
    } elsif ($lines[$i] =~ m/^(\t{1,}|\s\s{1,})(\*\s|\~\s|\-\s|\+\s|.{1,3}\.\s)(.*)/) {
      # this is hackish - I would prefer to get # of matches from above
      my $depth = length($1) / (substr($1, 0, 1) eq ' ' ? 2 : 1);
      if (%block) {
        if ($block{"type"} != TYPE_LIST) {
          push @elements, {%block};
          %block = (type => TYPE_LIST, depth => 0, children => []);
        }
      } else {
        %block = (type => TYPE_LIST, depth => 0, children => []);
      }
      my %this_item = (type => TYPE_LIST, text => parseText($3), depth => $depth, parent => \%block, children => []);
      my $child_count = @{$block{'children'}};
      if ($child_count == 0) {
        push @{$block{'children'}}, {%this_item};
      } else {
        my $root = \%block;
        my $root_children = \@{${$root}{'children'}};
        my $diff = $depth-${$root}{'depth'};
        # travel down the list until we get our proper context.
        while ($diff > 0) {
          $root_children = \@{${$root}{'children'}};
          if (@{${$root}{'children'}} <= 0) {
            push @{${$root}{'children'}}, {%this_item};
            $diff = 0;
          } else {
            # diff 1 = we have proper parent, 0 = we're one step too deep
            $root = $root_children->[-1];
            $diff = $depth-${$root}{'depth'};
            if ($diff == 1) {
              push @{${$root}{'children'}}, {%this_item};
              $diff = 0;
            } elsif ($diff == 0) {
              push @{${${$root}{'parent'}}{'children'}}, {%this_item};
            }
          }
        }
      }
    # preformatted (default)
    } else {
      if (!%block) {
        if ($2 eq '.' && $3 =~ m/^(.*\)(\s|))(.*)/) {
          %block = (type => TYPE_PRE, text => $3, opts => parseOptions('.'.$1));
        } else {
          %block = (type => TYPE_PRE, text => $2.$3);
        }
      } else {
        if ($block{"type"} != TYPE_PRE) {
          push @elements, {%block}; %block=();
          if ($2 eq '.' && $3 =~ m/^(.*\)(\s|))(.*)/) {
            %block = (type => TYPE_PRE, text => $3, opts => parseOptions('.'.$1));
          } else {
            %block = (type => TYPE_PRE, text => $2.$3);
          }
        } else {
          $block{'text'} .= "\n".$2.$3;
        }
      }
    }
    next;
  }
  # 2. check for big headers, e.g.,
  # ,,,,
  # header
  # ````
  #   conditions: first and third lines must have at least 4 repeating chars, and the second line must not match header conditions.
  if ($i+1 < $#lines) {
    if ($lines[$i] =~ m/(\S\S)\1{2,}/) {
      my $size = $+[0];
      # only check closing header if contents appear to not be a header
      if ($lines[$i+1] !~ m/(\S\S)\1{2,}/g) {
        # check closing line for header appearance
        if ($lines[$i+2] =~ m/(\S\S)\1{2,}/g) {
          if (%block) { push @elements, {%block}; %block=() }
          if ($lines[$i+1] =~ m/^(.*?)(\..*\))/) {
            push @elements, {
              type => TYPE_HEADER,
              size => $size,
              text => parseText($1),
              opts => parseOptions($2)
            };
          } else {
            push @elements, {
              type => TYPE_HEADER,
              size => $size,
              text => parseText($lines[$i+1])
            };
          }
          $i += 2;
          next;
        }
      }
    }
  }
  # 3. check for small headers
  if ($lines[$i] =~ m/(\S)\1{2,}/) {
    my $size = $+[0];
    my $match = substr($lines[$i], $-[0], $+[0]-$-[0]);
    my $post = substr($lines[$i], $+[0]);
    # two conditions:
    #   match repeated $match if something else came between
    #     e.g., ==== header ====
    #   otherwise match if there is some text following
    #     e.g., ==== header
    if ($post =~ m/(\t*|\s*|)(.*)$match$/g) {
      if (%block) { push @elements, {%block}; %block=() }
      if ($2 =~ m/^(.*?)(\..*\))/) {
        push @elements, {
          type => TYPE_HEADER,
          size => $size,
          text => parseText($1),
          opts => parseOptions($2)
        };
      } else {
        push @elements, {
          type => TYPE_HEADER,
          size => $size,
          text => parseText($2)
        };
      }
      next;
    } elsif ($post =~ m/^(\t*|\s*|)(.*[^(\n|\r|\s|\t)])/g) {
      if (%block) { push @elements, {%block}; %block=() }
      if ($post =~ m/^(.*?)(\..*\))/) {
        push @elements, {
          type => TYPE_HEADER,
          size => $size,
          text => parseText($1),
          opts => parseOptions($2)
        };
      } else {
        push @elements, {
          type => TYPE_HEADER,
          size => $size,
          text => parseText($post)
        };
      }
      next;
    }
  }
  # 4. check for spacers
  if ($lines[$i] =~ m/^(\S)\1{2,}/) {
    if (%block) { push @elements, {%block}; %block=() }
    push @elements, {
      type => TYPE_SPACER,
      size => $+[0]-$-[0]
    };
    next;
  }
  # discover blank lines - these usually signify end of some type of block
  if ($lines[$i] =~ m/^\s*$/) {
    if (%block) { push @elements, {%block}; %block=() }
    push @elements, {
      type => TYPE_BREAK
    };
    next;
  }
  # check for blockquotes
  if ($lines[$i] =~ m/^\>(.*)$/) {
    if (!%block) {
      if ($1 =~ m/^(\..*\)(\s|))(.*)/) {
        %block = (type => TYPE_BLOCKQUOTE, text => $2, opts => parseOptions($1));
      } else {
        %block = (type => TYPE_BLOCKQUOTE, text => $1);
      }
    } else {
      if ($block{'type'} != TYPE_BLOCKQUOTE) {
        push @elements, {%block};
        if ($1 =~ m/^(\..*\)(\s|))(.*)/) {
          %block = (type => TYPE_BLOCKQUOTE, text => $2, opts => parseOptions($1));
        } else {
          %block = (type => TYPE_BLOCKQUOTE, text => $1);
        }
      } else {
        $block{'text'} .= "\n".$1;
      }
    }
    next;
  }
  # finally, create a paragraph or append to it.
  if (!%block) {
    if ($lines[$i] =~ m/^(\..*\)(\s|))(.*)/) {
      %block = (type => TYPE_PARAGRAPH, text => parseText($2), opts => parseOptions($1));
    } else {
      %block = (type => TYPE_PARAGRAPH, text => parseText($lines[$i]));
    }
  } else {
    $block{'text'} .= "\n".parseText($lines[$i]);
  }
}
# push final block if it exists
if (%block) { push @elements, {%block}; %block=() }
# merge file settings with command-line settings
@settings{ keys %cmd_settings } = values %cmd_settings;
# get our various header sizes and organize an array with unique values from smallest to largest.
my @headers;
foreach (@elements) {
  my $element = $_;
  if ($element->{type} == TYPE_HEADER) {
    if (@headers) {
      my $h_i = 0;
      my $last_size = $headers[0];
      for (my $h_i = 0; $h_i < scalar @headers; $h_i++) {
        if ($element->{size} == $headers[$h_i]) {
          $h_i = scalar @headers;
        } elsif ($element->{size} < $headers[$h_i]) {
          splice @headers, $h_i, 0, $element->{size};
          $h_i = scalar @headers;
        } elsif ($element->{size} > $headers[$h_i]) {
          if ($h_i+1 >= scalar @headers) {
            splice @headers, $h_i+1, 0, $element->{size};
            $h_i = scalar @headers;
          }
        }
      }
    } else {
      $headers[0] = $element->{size};
    }
  }
}
# this is really dumb, but convert our array to a hash
my %header_map;
for (my $h_i = 0; $h_i < scalar @headers; $h_i++) {
  if ($settings{'header.reverse'}) {
    $header_map{$headers[$h_i]} = $h_i+1+($settings{'header.depth'} ? $settings{'header.depth'} : 0);
  } else {
    $header_map{$headers[$h_i]} = scalar ($settings{'header.depth'} ? $settings{'header.depth'} : 0) + @headers - $h_i;
  }
}

# set up our array for header numbering. The first element
my @header_numbers;
for (my $h_i = 0; $h_i < scalar @headers; $h_i++) {
  $header_numbers[$h_i] = 0;
}
my $header_number = 0; # at which numbering depth are we

# print 'em out
if ($settings{'toc'}) {
  foreach (@elements) {
    if ($_->{type} == TYPE_HEADER) {

    }
  }
} else {
  my $previous_type = 0;
  my $indent = ("\t" x ($settings{"indent_level"} ? $settings{"indent_level"} : 0));
  foreach (@elements) {
    if ($_->{type} == TYPE_HEADER) {
      my $hsize = $header_map{$_->{size}};
      my $htext = $_->{text};
      if ($settings{"header.numbering"}) {
        $header_number = $hsize;
        for (my $h_i = $header_number+1; $h_i < scalar @headers; $h_i++) {
          $header_numbers[$h_i] = 0;
        }
        $header_numbers[$header_number]++;
        $htext = ($settings{"header.numbering"} ? $header_numbers[$header_number].'. ' : '') . $htext;
      }
      my $hid = $_->{text};
      $hid =~ s/ /_/g;
      print("<h$hsize".($_->{opts} ? $_->{opts} : '').($settings{'header.ids'} ? " id=\"".$hid."\"" : '').">$htext</h$hsize>\n");
    } elsif ($_->{type} == TYPE_BREAK) {
      if ($previous_type == TYPE_BREAK) {
        print("<br />\n");
      }
    } elsif ($_->{type} == TYPE_PARAGRAPH) {
      print("<p".($_->{opts} ? $_->{opts} : '').">$_->{text}</p>\n");
    } elsif ($_->{type} == TYPE_PRE) {
      print("<pre".($_->{opts} ? $_->{opts} : '').">$_->{text}</pre>\n");
    } elsif ($_->{type} == TYPE_CODE) {
      print("<pre><code".($_->{opts} ? $_->{opts} : '').">$_->{text}</code></pre>\n");
    } elsif ($_->{type} == TYPE_BLOCKQUOTE) {
      print("<blockquote".($_->{opts} ? $_->{opts} : '').">".($settings{'blockquote.parse'} ? parseText($_->{text}) : $_->{text})."</blockquote>\n");
    } elsif ($_->{type} == TYPE_SPACER) {
      print("<hr />\n");
    } elsif ($_->{type} == TYPE_LIST) {
      printList($_);
    }
    $previous_type = $_->{type};
  }
}
if (!$settings{"version.none"}) {
  if ($settings{"version.hide"}) {
    print "<!-- $file_name generated by kettext 0.1 -->\n";
  } else {
    print "<small>$file_name generated by kettext 0.1</small>\n";
  }
}

sub printList {
  my $list = $_[0];
  my $child_count = @{$list->{'children'}};
  #my $child = $children->[$child_i];
  if ($list->{'text'}) {
    print "<li>$list->{'text'}";
  }
  if ($child_count > 0) {
    print "<ul>\n";
    my $children = \@{$list->{'children'}};
    my $child_i = 0;
    for (my $child = $children->[$child_i]; $child_i < $child_count; $child = $children->[++$child_i]) {
      printList($child);
    }
    print "</ul>\n";
  }
  if ($list->{'text'}) {
    print "</li>\n";
  }
}

sub parseText {
  my $text = $_[0];
  my @text_patterns = ('\*\*','\*\*', '\/\/','\/\/', '\[\[','\]\]', '\(\(','\)\)', '``', '``');
  my @text_replace = ('<b>','</b>', '<i>','</i>', '<a>','</a>', '<small>','</small>', '<code>','</code>');
  my $i = 0;
  my $new_text = $text;
  for (my $p = 0; $p <= $#text_patterns; $p+=2) {
    my $open = $text_patterns[$p];
    my $close = $text_patterns[$p+1];
    while ($new_text =~ m/($open)([^$close]*)($close)(\.\S*\(.*?\)|)/g) {
    #while ($new_text =~ m/($open)([^$close]*)($close)(\.\S*\(.*\)|)/g) {
      my $converted_text;
      if ($4) {
        my $ts = substr($text_replace[$i], 0, -1);
        my $te = substr($text_replace[$i], -1);
        $converted_text = $ts.parseOptions($4).$te.$2.$text_replace[$i+1];
      } else {
        $converted_text = $text_replace[$i].$2.$text_replace[$i+1];
      }
      $new_text =~ s/\Q$1$2$3$4\E/$converted_text/g;
    }
    $i +=2;
  }
  return $new_text;
}

sub parseOptions {
  my $text = $_[0];
  my $opts;
  while ($text =~ m/(\.)([a-zA-Z]*[^\(])(\()([^\)]*)/g) {
    $opts.=" $2=\"$4\"";
  }
  return $opts;
}