#!/usr/bin/perl -w
# -T can only be given on the command line (taint checking)
use strict; # homework by Eric Auer      see perlre for regexp
use English; # to have $MATCH $PREMATCH $POSTMATCH

# Separator that onthtml() puts between the lines of a file when it joins
# them into one string; onthtml() also compares it with "\n" to choose how
# whitespace is compressed afterwards.  The lines are not chomped, so each
# one still carries its own line ending in addition to this separator.
my $newcr = "\n";              # set this to " " to remove "\n",
                               # set to "\n" to keep "\n" !

# onthtml(FILENAME): read an HTML file and return its text without markup.
#
# Argument: name of the file to read.  It is handed to a three-argument
#           open(), so it is taken literally (no mode characters, pipes
#           or whitespace trimming as with the two-argument form).
# Returns:  the file content with everything up to the last "/head>"
#           dropped, remarks, tags and character entities removed and
#           whitespace compressed, followed by the cleaned-up title and
#           by one "title\007WORD" token for every word of the title.
#           Returns "" after printing a message to STDERR when the file
#           cannot be opened.
# Side effect: prints "File NAME: TITLE" to STDERR for every file read.
# Uses the file-level $newcr as line separator and as newline policy.
sub onthtml {
  my ($file) = @_;

  # Lexical handle + three-argument open: the handle is private to this
  # call and the file name cannot be misread as an open mode.
  my $fh;
  unless (open($fh, '<', $file)) {
    print STDERR "Cannot read file <$file>\n";
    return "";
  }
  my @lines = <$fh>;
  close($fh);

  # Tags may span several lines, so work on the whole file as one string.
  my $text = join($newcr, @lines);

  # Take the capture from the value returned by the match instead of $1:
  # after a failed match $1 is undef or still holds a capture from an
  # earlier match, so a file without <title> would get a bogus title.
  # The match is non-greedy so that it stops at the first </title>.
  my ($title) = $text =~ /<title>(.*?)<\/title>/si;
  $title = "" unless defined $title;

  $title =~ s/[&]nbsp;/ /gsi;            # un-nonbreak spaces
  $title =~ s/[&][a-zA-Z#][^;]*;/_/gs;   # replace character entities
  $title =~ tr/A-Za-z0-9 /_/c;           # everything else becomes _
  $title =~ s/[\s\n][\s\n]+/ /gs;        # compress runs of spaces
  $title =~ s/ [_ ]* / /gs;              # drop groups made only of _
  print STDERR "File $file: $title\n";
  $text .= " $title";                    # the title counts as body text too

  # Tag every title word as "title\007word".  Built with map so that an
  # empty title simply yields "" (the former substr(join(...),1) form
  # went outside the string and produced undef plus a warning then).
  my $title_words = join(" ", map { "title\007$_" } split(/ /, $title));

  $text =~ s/^.*[\/]head>//gsi;          # remove head part + style

  $text =~ s/<!--.*?-->//gs;             # ungreedily remove remarks
                                         # (which may contain tags!)
  $text =~ s/<[^>]*>//gs;                # remove all remaining tags

  $text =~ s/[&]nbsp;/ /gsi;             # un-nonbreak spaces
  $text =~ s/[&][a-zA-Z#][^;]*;/_/gs;    # replace character entities

  if ($newcr eq "\n") {
    # Keep line structure: squeeze only whitespace that is not "\n",
    # then reduce runs of newlines to a single one.
    $text =~ s/[^\S\n][^\S\n]+/ /gs;
    $text =~ s/\n+/\n/gs;
  } else {
    # Newlines are not wanted: squeeze any whitespace run to one space.
    $text =~ s/[\s\n][\s\n]+/ /gs;
  }
  $text =~ s/ [_ ]* / /gs;               # drop groups made only of _

  return "$text $title_words";
}

# A true value as the last statement makes require succeed.  A bare "1;"
# is used instead of "return 1;" because a return in mainline code is a
# fatal error when this file is executed directly rather than required.
1;
