#!/usr/bin/perl use strict; use warnings; package kettext; use constant VERSION => 0.2; # create our parse "enum" use constant { TYPE_HEADER => 1, TYPE_PARAGRAPH => 2, TYPE_PRE => 3, TYPE_RULE => 4, TYPE_SPACER => 5, TYPE_BREAK => 6, TYPE_CODE => 7, TYPE_BLOCKQUOTE => 8, TYPE_LIST => 9 }; # create our general data # parseFile("my_file", %cmd_settings) sub parseFile { my $file_name = $_[0]; my %cmd_settings = ($_[1] ? %{$_[1]} : ()); my @elements = (); # our list of ALL distinct elements my %block = (); # some block of data, such as paragraph, pre, etc. my %settings = (); # create file handler and read into @lines binmode STDOUT, ":utf8"; use utf8; open(my $fh, '<:encoding(UTF-8)', $file_name) or die "ERR: could not open file '$file_name' $!"; chomp(my @lines = <$fh>); close($fh); # main logic: line reading for (my $i = 0; $i <= $#lines; $i++) { # check for imply rules on first line if ($i == 0 || $i == $#lines) { if ($lines[$i] =~ m/^(\..*\)(\s|))(.*)/) { if ($1 =~ m/^.*\((.*)\)/) { my @options = split(',', $1); foreach my $opt (@options) { my @value = split('=', $opt); $settings{$value[0]} = ($value[1] ? $value[1] : 1); } } next; } } # 1. check for indent-related lines (pre, code, list, etc.) if ($lines[$i] =~ m/^(\t|\s\s|\s\s\s\s)(.)(.*)/) { # code if ($2 eq '#') { if (!%block) { if ($3 =~ m/^([^\s]*)(\..*\)(\s|))(.*)/) { %block = (type => TYPE_CODE, text => $3, opts => parseOptions($2)); } else { %block = (type => TYPE_CODE, text => $3); } } else { if ($block{"type"} != TYPE_CODE) { push @elements, {%block}; if ($3 =~ m/^([^\s*])(\..*\)(\s|))(.*)/) { %block = (type => TYPE_CODE, text => $3, opts => parseOptions($2)); } else { %block = (type => TYPE_CODE, text => $3); } } else { $block{'text'} .= "\n".$3; } } # lists - this is ugly, expensive, and by far the most shameful of this code } elsif ($lines[$i] =~ m/^(\t{1,}|\s\s{1,})(\*\s|\~\s|\-\s|\+\s|.{1,3}\.\s)(.*)/) { # this is hackish - I would prefer to get # of matches from above my $depth = length($1) / (substr($1, 0, 1) eq ' ' ? 2 : 1); if (%block) { if ($block{"type"} != TYPE_LIST) { push @elements, {%block}; %block = (type => TYPE_LIST, depth => 0, children => []); } } else { %block = (type => TYPE_LIST, depth => 0, children => []); } my %this_item = (type => TYPE_LIST, text => $3, depth => $depth, parent => \%block, children => []); my $child_count = @{$block{'children'}}; if ($child_count == 0) { push @{$block{'children'}}, {%this_item}; } else { my $root = \%block; my $root_children = \@{${$root}{'children'}}; my $diff = $depth-${$root}{'depth'}; # travel down the list until we get our proper context. while ($diff > 0) { $root_children = \@{${$root}{'children'}}; if (@{${$root}{'children'}} <= 0) { push @{${$root}{'children'}}, {%this_item}; $diff = 0; } else { # diff 1 = we have proper parent, 0 = we're one step too deep $root = $root_children->[-1]; $diff = $depth-${$root}{'depth'}; if ($diff == 1) { push @{${$root}{'children'}}, {%this_item}; $diff = 0; } elsif ($diff == 0) { push @{${${$root}{'parent'}}{'children'}}, {%this_item}; } } } } # preformatted (default) } else { if (!%block) { if ($2 eq '.' && $3 =~ m/^(.*\)(\s|))(.*)/) { %block = (type => TYPE_PRE, text => $3, opts => parseOptions('.'.$1)); } else { %block = (type => TYPE_PRE, text => $2.$3); } } else { if ($block{"type"} != TYPE_PRE) { push @elements, {%block}; %block=(); if ($2 eq '.' && $3 =~ m/^(.*\)(\s|))(.*)/) { %block = (type => TYPE_PRE, text => $3, opts => parseOptions('.'.$1)); } else { %block = (type => TYPE_PRE, text => $2.$3); } } else { $block{'text'} .= "\n".$2.$3; } } } next; } # 2. check for big headers, e.g., # ,,,, # header # ```` # conditions: first and third lines must have at least 4 repeating chars, and the second line must not match header conditions. if ($i+1 < $#lines) { if ($lines[$i] =~ m/(\S\S)\1{2,}/) { my $size = $+[0]; # only check closing header if contents appear to not be a header if ($lines[$i+1] !~ m/(\S\S)\1{2,}/g) { # check closing line for header appearance if ($lines[$i+2] =~ m/(\S\S)\1{2,}/g) { if (%block) { push @elements, {%block}; %block=() } if ($lines[$i+1] =~ m/^(.*?)(\..*\))/) { push @elements, { type => TYPE_HEADER, size => $size, text => $1, opts => parseOptions($2) }; } else { push @elements, { type => TYPE_HEADER, size => $size, text => $lines[$i+1] }; } $i += 2; next; } } } } # 3. check for small headers if ($lines[$i] =~ m/^(\S)\1{2,}/) { my $size = $+[0]; my $match = substr($lines[$i], $-[0], $+[0]-$-[0]); my $post = substr($lines[$i], $+[0]); # two conditions: # match repeated $match if something else came between # e.g., ==== header ==== # otherwise match if there is some text following # e.g., ==== header if ($post =~ m/(\t*|\s*|)(.*)\Q$match\E$/g) { if (%block) { push @elements, {%block}; %block=() } if ($2 =~ m/^(.*?)(\..*\))/) { push @elements, { type => TYPE_HEADER, size => $size, text => $1, opts => parseOptions($2) }; } else { push @elements, { type => TYPE_HEADER, size => $size, text => $2 }; } next; } elsif ($post =~ m/^(\t*|\s*|)(.*[^(\n|\r|\s|\t)])/g) { if (%block) { push @elements, {%block}; %block=() } if ($post =~ m/^(.*?)(\..*\))/) { push @elements, { type => TYPE_HEADER, size => $size, text => $1, opts => parseOptions($2) }; } else { push @elements, { type => TYPE_HEADER, size => $size, text => $post }; } next; } } # 4. check for spacers if ($lines[$i] =~ m/^(\S)\1{2,}/) { if (%block) { push @elements, {%block}; %block=() } push @elements, { type => TYPE_SPACER, size => $+[0]-$-[0] }; next; } # discover blank lines - these usually signify end of some type of block if ($lines[$i] =~ m/^\s*$/) { if (%block) { push @elements, {%block}; %block=() } push @elements, { type => TYPE_BREAK }; next; } # check for blockquotes if ($lines[$i] =~ m/^\>(.*)$/) { if (!%block) { if ($1 =~ m/^(\..*\)(\s|))(.*)/) { %block = (type => TYPE_BLOCKQUOTE, text => $2, opts => parseOptions($1)); } else { %block = (type => TYPE_BLOCKQUOTE, text => $1); } } else { if ($block{'type'} != TYPE_BLOCKQUOTE) { push @elements, {%block}; if ($1 =~ m/^(\..*\)(\s|))(.*)/) { %block = (type => TYPE_BLOCKQUOTE, text => $2, opts => parseOptions($1)); } else { %block = (type => TYPE_BLOCKQUOTE, text => $1); } } else { $block{'text'} .= "\n".$1; } } next; } # finally, create a paragraph or append to it. if (!%block) { if ($lines[$i] =~ m/^(\..*\)(\s|))(.*)/) { %block = (type => TYPE_PARAGRAPH, text => $2, opts => parseOptions($1)); } else { %block = (type => TYPE_PARAGRAPH, text => $lines[$i]); } } else { $block{'text'} .= "\n".$lines[$i]; } } # push final block if it exists if (%block) { push @elements, {%block}; %block=() } # merge file settings with command-line settings @settings{ keys %cmd_settings } = values %cmd_settings; # get our various header sizes and organize an array with unique values from smallest to largest. my @headers; foreach (@elements) { my $element = $_; if ($element->{type} == TYPE_HEADER) { if (@headers) { my $h_i = 0; my $last_size = $headers[0]; for (my $h_i = 0; $h_i < scalar @headers; $h_i++) { if ($element->{size} == $headers[$h_i]) { $h_i = scalar @headers; } elsif ($element->{size} < $headers[$h_i]) { splice @headers, $h_i, 0, $element->{size}; $h_i = scalar @headers; } elsif ($element->{size} > $headers[$h_i]) { if ($h_i+1 >= scalar @headers) { splice @headers, $h_i+1, 0, $element->{size}; $h_i = scalar @headers; } } } } else { $headers[0] = $element->{size}; } } } # this is really dumb, but convert our array to a hash my %header_map; for (my $h_i = 0; $h_i < scalar @headers; $h_i++) { if ($settings{'header.reverse'}) { $header_map{$headers[$h_i]} = $h_i+1+($settings{'header.depth'} ? $settings{'header.depth'} : 0); } else { $header_map{$headers[$h_i]} = scalar ($settings{'header.depth'} ? $settings{'header.depth'} : 0) + @headers - $h_i; } } # set up our array for header numbering. The first element my @header_numbers; for (my $h_i = 0; $h_i < scalar @headers; $h_i++) { $header_numbers[$h_i] = 0; } my $header_number = 0; # at which numbering depth are we # prepend to headers foreach (@elements) { if ($_->{type} == TYPE_HEADER) { my $hsize = $header_map{$_->{size}}; $_->{hsize} = $hsize; $header_number = $hsize; for (my $h_i = $header_number+1; $h_i < scalar @headers; $h_i++) { $header_numbers[$h_i] = 0; } $header_numbers[$header_number]++; # prepend an ordered number to headers based on depth if ($settings{"header.numbering"}) { $_->{text} = $header_numbers[$header_number].'. '.$_->{text}; } } } my %ret = ( "elements" => \@elements, "settings" => \%settings, "filename" => $file_name ); return %ret; } sub parseOptions { my $text = $_[0]; my $opts; while ($text =~ m/(\.)([a-zA-Z]*[^\(])(\()([^\)]*)/g) { $opts.=" $2=\"$4\""; } return $opts; } package kettext::convertTo; sub HTML { my %data = @_; my @elements = @{$data{elements}}; my %settings = %{$data{settings}}; my $file_name = $data{filename}; # print 'em out if ($settings{'toc'}) { foreach (@elements) { if ($_->{type} == kettext::TYPE_HEADER) { } } } else { my $previous_type = 0; my $indent = ("\t" x ($settings{"indent_level"} ? $settings{"indent_level"} : 0)); foreach (@elements) { if ($_->{type} == kettext::TYPE_HEADER) { my $hid = $_->{text}; $hid =~ s/ /_/g; print("{hsize}".($_->{opts} ? $_->{opts} : '').($settings{'header.ids'} ? " id=\"".$hid."\"" : '').">".convertText($_->{text})."{hsize}>\n"); } elsif ($_->{type} == kettext::TYPE_BREAK) { if ($previous_type == kettext::TYPE_BREAK) { print("
\n"); } } elsif ($_->{type} == kettext::TYPE_PARAGRAPH) { my $text = convertText($_->{text}); $text =~ s/\n/
\n/g; print("{opts} ? $_->{opts} : '').">$text

\n"); } elsif ($_->{type} == kettext::TYPE_PRE) { print("{opts} ? $_->{opts} : '').">$_->{text}\n"); } elsif ($_->{type} == kettext::TYPE_CODE) { print("
{opts} ? $_->{opts} : '').">$_->{text}
\n"); } elsif ($_->{type} == kettext::TYPE_BLOCKQUOTE) { print("{opts} ? $_->{opts} : '').">".($settings{'blockquote.parse'} ? convertText($_->{text}) : $_->{text})."\n"); } elsif ($_->{type} == kettext::TYPE_SPACER) { print("
\n"); } elsif ($_->{type} == kettext::TYPE_LIST) { printf("

"); # @@ should this be optional? printList($_); } $previous_type = $_->{type}; } } if (!$settings{"version.none"}) { if ($settings{"version.hide"}) { print "\n"; } else { print "$file_name generated by kettext ".kettext::VERSION."\n"; } } sub printList { my $list = $_[0]; my $child_count = @{$list->{'children'}}; #my $child = $children->[$child_i]; if ($list->{'text'}) { print "
  • ".convertText($list->{'text'}); } if ($child_count > 0) { print "
      \n"; my $children = \@{$list->{'children'}}; my $child_i = 0; for (my $child = $children->[$child_i]; $child_i < $child_count; $child = $children->[++$child_i]) { printList($child); } print "
    \n"; } if ($list->{'text'}) { print "
  • \n"; } } sub convertText { my $text = $_[0]; my @text_patterns = ('\*\*','\*\*', '\/\/','\/\/', '\[\[','\]\]', '\(\(','\)\)', '``', '``'); my @text_replace = ('','', '','', '','', '','', '',''); my $i = 0; my $new_text = $text; for (my $p = 0; $p <= $#text_patterns; $p+=2) { my $open = $text_patterns[$p]; my $close = $text_patterns[$p+1]; while ($new_text =~ m/($open)([^$close]*)($close)(\.\S*\(.*?\)|)/g) { #while ($new_text =~ m/($open)([^$close]*)($close)(\.\S*\(.*\)|)/g) { my $converted_text; if ($4) { my $ts = substr($text_replace[$i], 0, -1); my $te = substr($text_replace[$i], -1); $converted_text = $ts.kettext::parseOptions($4).$te.$2.$text_replace[$i+1]; } else { $converted_text = $text_replace[$i].$2.$text_replace[$i+1]; } $new_text =~ s/\Q$1$2$3$4\E/$converted_text/g; } $i +=2; } return $new_text; } } 1;