#!/usr/bin/perl use strict; use warnings; package kettext; use constant VERSION => 0.2; # create our parse "enum" use constant { TYPE_HEADER => 1, TYPE_PARAGRAPH => 2, TYPE_PRE => 3, TYPE_RULE => 4, TYPE_SPACER => 5, TYPE_BREAK => 6, TYPE_CODE => 7, TYPE_BLOCKQUOTE => 8, TYPE_LIST => 9 }; # create our general data # parseFile("my_file", %cmd_settings) sub parseFile { my $file_name = $_[0]; my %cmd_settings = ($_[1] ? %{$_[1]} : ()); my @elements = (); # our list of ALL distinct elements my %block = (); # some block of data, such as paragraph, pre, etc. my %settings = (); # get dirname for includes my $dir_name = $file_name; $dir_name =~ s/\/(.*)$/\//g; if ($dir_name eq $file_name) { $dir_name = "./"; } # create file handler and read into @lines binmode STDOUT, ":utf8"; use utf8; open(my $fh, '<:encoding(UTF-8)', $file_name) or die "ERR: could not open file '$file_name' $!"; chomp(my @lines = <$fh>); close($fh); # main logic: line reading for (my $i = 0; $i <= $#lines; $i++) { # check for imply rules on first line if ($i == 0 || $i == $#lines) { if ($lines[$i] =~ m/^(\..*\)(\s|))(.*)/) { if ($1 =~ m/^.*\((.*)\)/) { my @options = split(',', $1); foreach my $opt (@options) { my @value = split('=', $opt); $settings{$value[0]} = ($value[1] ? $value[1] : 1); } } next; } } # 1. check for indent-related lines (pre, code, list, etc.) if ($lines[$i] =~ m/^(\t|\s\s|\s\s\s\s)(.)(.*)/) { # code if ($2 eq '#') { if (!%block) { #if ($3 =~ m/^([^\s]*)(\..*\)(\s|))(.*)/) { if ($3 =~ m/^(\..*?\)(?:\s|\n|))(.*?)/) { %block = (type => TYPE_CODE, text => $2, opts => parseOptions($1)); if (exists ${$block{'opts'}}{'include'}) { my $include = $dir_name.${$block{'opts'}}{'include'}; if (open(my $fh, '<:encoding(UTF-8)', $include)) { while (my $row = <$fh>) { chomp $row; $block{'text'} .= "$row\n"; } } else { print $file_name; $block{'text'} .= "Could not read ".$include."\n"; } delete ${$block{'opts'}}{'include'}; } } else { %block = (type => TYPE_CODE, text => $3); } } else { if ($block{"type"} != TYPE_CODE) { push @elements, {%block}; if ($3 =~ m/^(\..*?\)(?:\s|\n|))(.*?)/) { %block = (type => TYPE_CODE, text => $3, opts => parseOptions($2)); if (${$block{'opts'}}{'include'}) { if (open(my $fh, '<:encoding(UTF-8)', ${$block{'opts'}}{'include'})) { while (my $row = <$fh>) { chomp $row; $block{'text'} .= "$row\n"; } } else { $block{'text'} .= "Could not read ".${$block{'opts'}}{'include'}."\n"; } delete ${$block{'opts'}}{'include'}; } } else { %block = (type => TYPE_CODE, text => $3); } } else { $block{'text'} .= "\n".$3; } } # lists - this is ugly, expensive, and by far the most shameful of this code } elsif ($lines[$i] =~ m/^(\t{1,}|\s\s{1,})(\*\s|\~\s|\-\s|\+\s|.{1,3}\.\s)(.*)/) { # this is hackish - I would prefer to get # of matches from above my $depth = length($1) / (substr($1, 0, 1) eq ' ' ? 2 : 1); if (%block) { if ($block{"type"} != TYPE_LIST) { push @elements, {%block}; %block = (type => TYPE_LIST, depth => 0, children => []); } } else { %block = (type => TYPE_LIST, depth => 0, children => []); } my %this_item = (type => TYPE_LIST, text => $3, depth => $depth, parent => \%block, children => []); my $child_count = @{$block{'children'}}; if ($child_count == 0) { push @{$block{'children'}}, {%this_item}; } else { my $root = \%block; my $root_children = \@{${$root}{'children'}}; my $diff = $depth-${$root}{'depth'}; # travel down the list until we get our proper context. while ($diff > 0) { $root_children = \@{${$root}{'children'}}; if (@{${$root}{'children'}} <= 0) { push @{${$root}{'children'}}, {%this_item}; $diff = 0; } else { # diff 1 = we have proper parent, 0 = we're one step too deep $root = $root_children->[-1]; $diff = $depth-${$root}{'depth'}; if ($diff == 1) { push @{${$root}{'children'}}, {%this_item}; $diff = 0; } elsif ($diff == 0) { push @{${${$root}{'parent'}}{'children'}}, {%this_item}; } } } } # preformatted (default) } else { if (!%block) { if ($2 eq '.' && $3 =~ m/^(.*\)(\s|))(.*)/) { %block = (type => TYPE_PRE, text => $3, opts => parseOptions('.'.$1)); if (exists ${$block{'opts'}}{'include'}) { my $include = $dir_name.${$block{'opts'}}{'include'}; if (open(my $fh, '<:encoding(UTF-8)', $include)) { while (my $row = <$fh>) { chomp $row; $block{'text'} .= "$row\n"; } } else { print $file_name; $block{'text'} .= "Could not read ".$include."\n"; } delete ${$block{'opts'}}{'include'}; } } else { %block = (type => TYPE_PRE, text => $2.$3); } } else { if ($block{"type"} != TYPE_PRE) { push @elements, {%block}; %block=(); if ($2 eq '.' && $3 =~ m/^(.*\)(\s|))(.*)/) { %block = (type => TYPE_PRE, text => $3, opts => parseOptions('.'.$1)); if (exists ${$block{'opts'}}{'include'}) { my $include = $dir_name.${$block{'opts'}}{'include'}; if (open(my $fh, '<:encoding(UTF-8)', $include)) { while (my $row = <$fh>) { chomp $row; $block{'text'} .= "$row\n"; } } else { print $file_name; $block{'text'} .= "Could not read ".$include."\n"; } delete ${$block{'opts'}}{'include'}; } } else { %block = (type => TYPE_PRE, text => $2.$3); } } else { $block{'text'} .= "\n".$2.$3; } } } next; } # 2. check for big headers, e.g., # ,,,, # header # ```` # conditions: first and third lines must have at least 4 repeating chars, and the second line must not match header conditions. if ($i+1 < $#lines) { if ($lines[$i] =~ m/(\S\S)\1{2,}/) { my $size = $+[0]; # only check closing header if contents appear to not be a header if ($lines[$i+1] !~ m/(\S\S)\1{2,}/g) { # check closing line for header appearance if ($lines[$i+2] =~ m/(\S\S)\1{2,}/g) { if (%block) { push @elements, {%block}; %block=() } if ($lines[$i+1] =~ m/^(.*?)(\..*\))/) { push @elements, { type => TYPE_HEADER, size => $size, text => $1, opts => parseOptions($2) }; } else { push @elements, { type => TYPE_HEADER, size => $size, text => $lines[$i+1] }; } $i += 2; next; } } } } # 3. check for small headers if ($lines[$i] =~ m/^(\S)\1{2,}\s/) { my $size = $+[0]; my $match = substr($lines[$i], $-[0], $+[0]-$-[0]); my $post = substr($lines[$i], $+[0]); # two conditions: # match repeated $match if something else came between # e.g., ==== header ==== # otherwise match if there is some text following # e.g., ==== header if ($post =~ m/(\t*|\s*|)(.*)\Q$match\E$/g) { if (%block) { push @elements, {%block}; %block=() } if ($2 =~ m/^(.*?)(\..*\))/) { push @elements, { type => TYPE_HEADER, size => $size, text => $1, opts => parseOptions($2) }; } else { push @elements, { type => TYPE_HEADER, size => $size, text => $2 }; } next; } elsif ($post =~ m/^(\t*|\s*|)(.*[^(\n|\r|\s|\t)])/g) { if (%block) { push @elements, {%block}; %block=() } if ($post =~ m/^(.*?)(\..*\))/) { push @elements, { type => TYPE_HEADER, size => $size, text => $1, opts => parseOptions($2) }; } else { push @elements, { type => TYPE_HEADER, size => $size, text => $post }; } next; } } # 4. check for spacers if ($lines[$i] =~ m/^(\S)\1{2,}/) { if (%block) { push @elements, {%block}; %block=() } push @elements, { type => TYPE_SPACER, size => $+[0]-$-[0] }; next; } # discover blank lines - these usually signify end of some type of block if ($lines[$i] =~ m/^\s*$/) { if (%block) { push @elements, {%block}; %block=() } push @elements, { type => TYPE_BREAK }; next; } # check for blockquotes if ($lines[$i] =~ m/^\>(.*)$/) { if (!%block) { if ($1 =~ m/^(\..*\)(\s|))(.*)/) { %block = (type => TYPE_BLOCKQUOTE, text => $2, opts => parseOptions($1)); } else { %block = (type => TYPE_BLOCKQUOTE, text => $1); } } else { if ($block{'type'} != TYPE_BLOCKQUOTE) { push @elements, {%block}; if ($1 =~ m/^(\..*\)(\s|))(.*)/) { %block = (type => TYPE_BLOCKQUOTE, text => $2, opts => parseOptions($1)); } else { %block = (type => TYPE_BLOCKQUOTE, text => $1); } } else { $block{'text'} .= "\n".$1; } } next; } # finally, create a paragraph or append to it. if (!%block) { if ($lines[$i] =~ m/^(\..*\)(\s|))(.*)/) { %block = (type => TYPE_PARAGRAPH, text => $2, opts => parseOptions($1)); if (${$block{'opts'}}{'include'}) { my %sub_file = kettext::parseFile($dir_name.${$block{'opts'}}{'include'}, \%cmd_settings); for my $i (0 .. $#{$sub_file{'elements'}}) { push @elements, ${$sub_file{'elements'}}[$i]; } delete ${$block{'opts'}}{'include'}; } } else { %block = (type => TYPE_PARAGRAPH, text => $lines[$i]); } } else { $block{'text'} .= "\n".$lines[$i]; } } # push final block if it exists if (%block) { push @elements, {%block}; %block=() } # merge file settings with command-line settings @settings{ keys %cmd_settings } = values %cmd_settings; # get our various header sizes and organize an array with unique values from smallest to largest. my @headers; foreach (@elements) { my $element = $_; if ($element->{type} == TYPE_HEADER) { if (@headers) { my $h_i = 0; my $last_size = $headers[0]; for (my $h_i = 0; $h_i < scalar @headers; $h_i++) { if ($element->{size} == $headers[$h_i]) { $h_i = scalar @headers; } elsif ($element->{size} < $headers[$h_i]) { splice @headers, $h_i, 0, $element->{size}; $h_i = scalar @headers; } elsif ($element->{size} > $headers[$h_i]) { if ($h_i+1 >= scalar @headers) { splice @headers, $h_i+1, 0, $element->{size}; $h_i = scalar @headers; } } } } else { $headers[0] = $element->{size}; } } } # this is really dumb, but convert our array to a hash my %header_map; for (my $h_i = 0; $h_i < scalar @headers; $h_i++) { if ($settings{'header.reverse'}) { $header_map{$headers[$h_i]} = $h_i+1+($settings{'header.depth'} ? $settings{'header.depth'} : 0); } else { $header_map{$headers[$h_i]} = scalar ($settings{'header.depth'} ? $settings{'header.depth'} : 0) + @headers - $h_i; } } # set up our array for header numbering. The first element my @header_numbers; for (my $h_i = 0; $h_i < scalar @headers; $h_i++) { $header_numbers[$h_i] = 0; } my $header_number = 0; # at which numbering depth are we # prepend to headers foreach (@elements) { if ($_->{type} == TYPE_HEADER) { my $hsize = $header_map{$_->{size}}; $_->{hsize} = $hsize; $header_number = $hsize; for (my $h_i = $header_number+1; $h_i < scalar @headers; $h_i++) { $header_numbers[$h_i] = 0; } $header_numbers[$header_number]++; # prepend an ordered number to headers based on depth if ($settings{"header.numbering"}) { $_->{text} = $header_numbers[$header_number].'. '.$_->{text}; } } } my %ret = ( "elements" => \@elements, "settings" => \%settings, "filename" => $file_name ); return %ret; } sub parseOptions { my $text = $_[0]; my %opts = (); while ($text =~ m/(\.|)([a-zA-Z]*[^\(])(\()([^\)]*)/g) { $opts{$2} = $4; } return \%opts; } package kettext::convertTo; sub HTML { my %data = @_; my @elements = @{$data{elements}}; my %settings = %{$data{settings}}; my $file_name = $data{filename}; our @prefixes = (["img", "image"], ["aud", "audio"], ["vid", "video"], ["yt", "youtube"], ["vimeo"], ["twitter", "tweet"]); our @urls = ([], [], [], ["youtube.com", "youtu.be"], ["vimeo.com"], ["twitter.com"]); our @url_parse = ( "(.*)", "", "", "(?:.*)(?:v=)([a-zA-Z0-9]*)", "(?:.*)(?:/)([0-9)*)\$", ""); our @embeds = ( ['""',''], ['"'], ['"'], ['"'], ['"'], ["", ""]); # print 'em out if ($settings{'toc'}) { foreach (@elements) { if ($_->{type} == kettext::TYPE_HEADER) { # TODO: collect and organize TYPE_HEADER elements into TYPE_LIST element. Use header IDs and surround in anchors appropriately. } } } else { my $previous_type = 0; my $indent = ("\t" x ($settings{"indent_level"} ? $settings{"indent_level"} : 0)); foreach (@elements) { if ($_->{type} == kettext::TYPE_HEADER) { my $hid = $_->{text}; $hid =~ s/ /_/g; print("{hsize}".($_->{opts} ? convertOpts($_->{opts}) : '').($settings{'header.ids'} ? " id=\"".$hid."\"" : '').">".convertText($_->{text})."{hsize}>\n"); } elsif ($_->{type} == kettext::TYPE_BREAK) { if ($previous_type == kettext::TYPE_BREAK) { print("
\n"); } } elsif ($_->{type} == kettext::TYPE_PARAGRAPH) { my $text = convertText($_->{text}); $text =~ s/\n/
\n/g; print("{opts} ? convertOpts($_->{opts}) : '').">$text

\n"); } elsif ($_->{type} == kettext::TYPE_PRE) { print("{opts} ? convertOpts($_->{opts}) : '').">$_->{text}\n"); } elsif ($_->{type} == kettext::TYPE_CODE) { print("
{opts} ? convertOpts($_->{opts}) : '').">$_->{text}
\n"); } elsif ($_->{type} == kettext::TYPE_BLOCKQUOTE) { print("{opts} ? convertOpts($_->{opts}) : '').">".($settings{'blockquote.parse'} ? convertText($_->{text}) : $_->{text})."\n"); } elsif ($_->{type} == kettext::TYPE_SPACER) { print("
\n"); } elsif ($_->{type} == kettext::TYPE_LIST) { printf("

"); # @@ should this be optional? printList($_); } $previous_type = $_->{type}; } } if (!$settings{"version.none"}) { if ($settings{"version.hide"}) { print "\n"; } else { print "$file_name generated by kettext ".kettext::VERSION."\n"; } } sub printList { my $list = $_[0]; my $child_count = @{$list->{'children'}}; #my $child = $children->[$child_i]; if ($list->{'text'}) { print "
  • ".convertText($list->{'text'}); } if ($child_count > 0) { print "
      \n"; my $children = \@{$list->{'children'}}; my $child_i = 0; for (my $child = $children->[$child_i]; $child_i < $child_count; $child = $children->[++$child_i]) { printList($child); } print "
    \n"; } if ($list->{'text'}) { print "
  • \n"; } } sub convertText { my $text = $_[0]; my @text_patterns = ('\*\*','\*\*', '\/\/','\/\/', '\[\[','\]\]', '\(\(','\)\)', '``', '``', '__','__'); my @default_opts = ("", "", "href", "", "", ""); my @text_replace = ('','', '','', '','', '','', '','', '',''); my @char_patterns = ('&', '<', '>', '"', '\'', '--'); my @char_replace = ('&', '<', '>', '"', ''', '—'); my $i = 0; my $new_text = $text; # 1. replace character(s) #for (my $ch = 0; $ch <= $#char_patterns; $ch++) { # if ($new_text =~ m/(([^\\])$char_patterns[$ch])/g) { # my $bzr = $2; # $new_text =~ s/\Q$bzr$char_patterns[$ch]\E/\Q$bzr\E$char_replace[$ch]/g; # } # if ($new_text =~ m/(\\)($char_patterns[$ch])/g) { # $new_text =~ s/\\$char_patterns[$ch]/$char_patterns[$ch]/g; # } #} # 2. replace character pairs for (my $p = 0; $p <= $#text_patterns; $p+=2) { my $open = $text_patterns[$p]; my $close = $text_patterns[$p+1]; #while ($new_text =~ m/($open)([^$close]*)($close)(\.\S*\(.*?\)|)/g) { #while ($new_text =~ m/($open)([^$close]*)($close)((?:\()(.*?)(?:\))|)(\.\S*\(.*?\)|)/g) { while ($new_text =~ m/(?:\s|)($open)(.*?)($close)((?:\()(.*?)(?:\))|)(\.\S*\(.*?\)|)/g) { my $converted_text; if ($p == 4) { my @media = convertLink($2); my $ts = substr($media[0], 0, -1); my $te = substr($media[0], -1); my $opts = kettext::parseOptions($6); if ($5) { $opts->{$default_opts[$i]} = $5; } # FIXME: dirty hack to allow [[img:image]](url) or [[img:image]].href(url) to expand to if ($opts->{'href'} && !($media[0] eq "")) { $converted_text = "{'href'}."\">"; delete $opts->{'href'}; $converted_text .= $ts.convertOpts($opts).$te.$media[1]; $converted_text .= ""; $new_text =~ s/\Q$1$2$3$4$6\E/$converted_text/g; } else { $converted_text = $ts.convertOpts($opts).$te.$media[1]; $new_text =~ s/\Q$1$2$3$4$6\E/$converted_text/g; } } else { my $ts = substr($text_replace[$p], 0, -1); my $te = substr($text_replace[$p], -1); my $opts = kettext::parseOptions($6); if ($5) { $opts->{$default_opts[$i]} = $5; } $converted_text = $ts.convertOpts($opts).$te.$2.$text_replace[$p+1]; $new_text =~ s/\Q$1$2$3$4$6\E/$converted_text/g; } } $i++; } return $new_text; } sub convertLink { my $text = $_[0]; my $find = $text; # 1. check for prefix such as "image:", "audio:", etc. if ($find =~ m/^(.*?)(?::)(.*)$/) { for (my $prefix_group = 0; $prefix_group <= $#prefixes; $prefix_group++) { for (my $prefix = 0; $prefix <= $#{${prefixes[$prefix_group]}}; $prefix++) { if ($1 eq ${prefixes[$prefix_group]}[$prefix]) { my @media = urlToMedia($2); #"prefix:http://..." if ($media[0] eq "") { # "prefix:data" $media[0] = $2; $media[0] =~ s/(.*)/$embeds[$prefix_group][0]/ee; $media[1] = $embeds[$prefix_group][1]; } return @media; } } } } sub convertOpts { my $opts = $_[0]; my $string = ""; keys %{$opts}; while (my ($key, $value) = each %{$opts}) { $string .= " $key=\"$value\""; } return $string; } # 2. check for (http(s)://)(www.)youtube/youtu.be/vimeo/etc. my @media = urlToMedia($text); if (!$media[0] eq "") { #"http://..." return @media; } # 3. Finally, check file extension # 4. Just an anchor return ("","$text"); } sub urlToMedia { my $find = $_[0]; if ($find =~ m/^(?:http(?:s|):\/\/(?:www.|)|)(.*)\/(.*)/) { for (my $urls_i = 0; $urls_i <= $#urls; $urls_i++) { for (my $url_i = 0; $url_i <= $#{${urls[$urls_i]}}; $url_i++) { if ($1 eq ${urls[$urls_i]}[$url_i]) { my $match = $2; $match =~ s/$url_parse[$urls_i]/$embeds[$urls_i][0]/ee; # the ee is qqing return ($match,$embeds[$urls_i][1]); } } } } return ("",""); } } 1;