package Razor2::Preproc::deHTML; sub new { my $class = shift; my %html_tags = ( "lt" => '<', "gt" => '>', "amp" => '&', "quot" => '"', "nbsp" => ' ', "iexcl" => chr(161), "cent" => chr(162), "pound" => chr(163), "curren" => chr(164), "yen" => chr(165), "brvbar" => chr(166), "sect" => chr(167), "uml" => chr(168), "copy" => chr(169), "ordf" => chr(170), "laquo" => chr(171), "not" => chr(172), "shy" => chr(173), "reg" => chr(174), "macr" => chr(175), "deg" => chr(176), "plusmn" => chr(177), "sup2" => chr(178), "sup3" => chr(179), "acute" => chr(180), "micro" => chr(181), "para" => chr(182), "middot" => chr(183), "cedil" => chr(184), "sup1" => chr(185), "ordm" => chr(186), "raquo" => chr(187), "frac14" => chr(188), "frac12" => chr(189), "frac34" => chr(190), "iquest" => chr(191), "Agrave" => chr(192), "Aacute" => chr(193), "Acirc" => chr(194), "Atilde" => chr(195), "Auml" => chr(196), "Aring" => chr(197), "AElig" => chr(198), "Ccedil" => chr(199), "Egrave" => chr(200), "Eacute" => chr(201), "Ecirc" => chr(202), "Euml" => chr(203), "Igrave" => chr(204), "Iacute" => chr(205), "Icirc" => chr(206), "Iuml" => chr(207), "ETH" => chr(208), "Ntilde" => chr(209), "Ograve" => chr(210), "Oacute" => chr(211), "Ocirc" => chr(212), "Otilde" => chr(213), "Ouml" => chr(214), "times" => chr(215), "Oslash" => chr(216), "Ugrave" => chr(217), "Uacute" => chr(218), "Ucirc" => chr(219), "Uuml" => chr(220), "Yacute" => chr(221), "THORN" => chr(222), "szlig" => chr(223), "agrave" => chr(224), "aacute" => chr(225), "acirc" => chr(226), "atilde" => chr(227), "auml" => chr(228), "aring" => chr(229), "aelig" => chr(230), "ccedil" => chr(231), "egrave" => chr(232), "eacute" => chr(233), "ecirc" => chr(234), "euml" => chr(235), "igrave" => chr(236), "iacute" => chr(237), "icirc" => chr(238), "iuml" => chr(239), "eth" => chr(240), "ntilde" => chr(241), "ograve" => chr(242), "oacute" => chr(243), "ocirc" => chr(244), "otilde" => chr(245), "ouml" => chr(246), "divide" => chr(247), "oslash" => chr(248), "ugrave" => chr(249), "uacute" => chr(250), "ucirc" => chr(251), "uuml" => chr(252), "yacute" => chr(253), "thorn" => chr(254), "yuml" => chr(255) ); return bless { html_tags => \%html_tags, }, $class; } sub isit { my ($self, $text) = @_; my $isit = 0; my ($hdr, $body) = split /\n\r*\n/, $$text, 2; return 0 unless $body; $isit = $body =~ /(?:|EOS, or $i+10, and use # that to build a compare string for use with substr. Otherwise, # requesting ``lengths'' from char arrays that exceeds the actual # array produce additional 'undef's. 10 is an arbitrary #, but at # least greater than the max length (+ 1) of any html &tag. my($offset) = (scalar @{$chars} - $i > 10 ? 10 : scalar @{$chars} ); my($s) = join ('', @{$chars}[$i .. $i + $offset]); while ( ($tag, $val) = each %{$self->{html_tags}} ) { if (substr($s, 0, length($tag)) eq $tag) { ($r_tag, $r_val) = ($tag => $val); $r_tag .= ';' # so the length($r_tag) consumes the ``;'' if (substr($s, length($tag), 1) eq ';'); } } return (length($r_tag), $r_val); } sub html_xlat { my($self, $chars, $i) = @_; #print "html_xlat($r_tag) start\n"; return 0 if ($$chars[$i] !~ /[a-zA-Z]/); my $r_tag; # we used to walk till we got a ';', but to be compatible # with c, we won't check for ';' while ($$chars[$i] && $$chars[$i] =~ /[a-zA-Z]/) { $r_tag .= $$chars[$i++]; } my $len = length($r_tag); # do not include ; $len++ if ($$chars[$i] eq ';'); my $val = $self->{html_tags}->{$r_tag}; #print "html_xlat($r_tag) = ($len,$val)\n"; return 0 unless $val; # not found return ( $len, $val ); } sub doit { my ($self, $text) = @_; my ($hdr, $body) = split /\n\r*\n/, $$text, 2; my(@chars) = split //, $body; my($len) = scalar @chars; my($last, $quote, $sgml, $tag) = ("", "", "", ""); my(@out); my($i) = 0; while ($i < $len) { my($c) = $chars[$i++]; if ($c eq $quote) { if ($c eq '-' && $last ne '-') { $last = $c; next; } else { $last = 0; } $quote = ""; } elsif (!$quote) { if ($c eq '<') { $tag = 1; if ($chars[$i++] eq '!') { my($s) = join('', @chars[$i .. $i + 10]); $sgml = 1; } } elsif ($c eq '>') { if ($tag) { $sgml = 0; $tag = 0; } } elsif ($c eq '-') { if ($sgml and $last eq '-') { $quote = '-'; } else { push @out, $c if (! $tag); } } elsif ($c eq "\"" or $c eq "\'") { if ($tag) { $quote = $c ; } else { push @out, $c if (! $tag); } } elsif ($c eq "&") { my($len, $char) = $self->html_xlat(\@chars, $i); if ($len) { push @out, $char; $i += $len; } else { push @out, $c; } } else { push @out, $c if (! $tag); } } $last = $c; } $$text = "$hdr\n\n". join('', @out); } 1;