#!/usr/bin/perl -w
# -T can only be given on the command line (taint checking)
use strict; # homework by Eric Auer...

# Indexing over a set of HTML files
my $THERE    = ".";          # root directory handed to the HTML file collector
my $logfile  = "/dev/null";  # where the human-readable dump of the index goes
my $DOWNCASE = 1;            # 1 = fold every indexed word to lower case

require "index-browse.pl";
require "index-demarkup.pl";

# The index: $hsh{word}->{file} holds a comma-separated list of the word
# numbers (1-based, counted per file) at which the word occurs in the file.
my %hsh;

# The collector returns one string; it is split on ":" into file names.
my @htmllist = split(":",&collecthtmlfiles($THERE));

for my $onefile (@htmllist) {

  my $text = &onthtml($onefile);

  # Everything that is neither a word character nor "\007" becomes a
  # blank, then runs of whitespace are squeezed down to a single blank.
  $text =~ s/[^\w\007]/ /gs;
  $text =~ s/[\s\n][\s\n]+/ /gs;

  my $wordno = 0;   # running count of the real words seen in this file

  for my $token (split(" ",$text)) {
    # Empty tokens and tokens made only of underscores are not words
    # and do not advance the word counter.
    next if $token =~ /^_*$/;

    $wordno++;

    my $word = ($DOWNCASE == 1) ? lc($token) : $token;

    # First hit in this file: store the bare position.
    # Later hits: append ",position" to what is already stored.
    my $seen = $hsh{$word}->{$onefile};
    $hsh{$word}->{$onefile} = defined($seen) ? $seen . "," . $wordno
                                             : $wordno;
  }

  # Per-file word total, kept under the key "wordcount\007"
  # (the original author stores this for relevance ranking).
  $hsh{"wordcount\007"}->{$onefile} = $wordno;
}

# Dump the index twice:
#   * index.txt - one line per word: word;file,pos,pos,...;file,pos,...
#   * $logfile  - the same data in a readable form: <"file",pos,...>
# Words are emitted in sorted order, and so are the files of each word.
#
# Three-argument open keeps the file name from being parsed as part of
# the mode, lexical handles avoid clobbering global LOG/OFILE handles,
# and $! tells the user why an open or close failed.
open(my $log_fh, '>', $logfile)
  or die "cannot write log file $logfile: $!\n";
open(my $index_fh, '>', "index.txt")
  or die "cannot write index.txt: $!\n";

foreach my $oneword (sort keys %hsh) {
  my $files = $hsh{$oneword};   # hash ref: file name -> position list

  printf {$log_fh} "%20s: ", $oneword;
  print {$index_fh} "$oneword";

  foreach my $item (sort keys %{$files}) {
    print {$log_fh} "<\"$item\"," . $files->{$item} . "> ";
    print {$index_fh} ";$item," . $files->{$item};
  }

  print {$log_fh} "\n";
  print {$index_fh} "\n";
}

# Buffered write errors (e.g. a full disk) only surface at close, so the
# result must be checked. A broken index is fatal; a broken log is not.
close($index_fh) or die "cannot finish writing index.txt: $!\n";
close($log_fh)   or warn "cannot finish writing log file $logfile: $!\n";
