#!/usr/bin/perl -w
# -T can only be given on the command line (taint checking)
use strict; # homework by Eric Auer...

require "query-read.pl"; # read the index into a %hash
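# Index layout: $hsh{$word} is a hash reference keyed by file, and each
# value $hsh{$word}->{$file} is a string of comma-separated positions
# such as "2,5,11". There are also special keys: "title:word" entries and
# "wordcount:", where $hsh{"wordcount:"}->{$file} is the word count of
# that file.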

# ---- file-level state shared by the sub and the top-level code ----

# Terms that findterm refuses to look up.
my %stopwords = map { $_ => 1 } qw(and und en);

# Hits collected for the query currently being processed.
my %results;

# Constant offset added to every relevance weight.
my $adjust;

# File-level scalar that code further down refers to by name.
my $oneword;


# Copy of the index returned (as a hash reference) by reftoindex(),
# which is provided by the required file.
my %hsh = %{ reftoindex() };


# findterm($word)
#
# Looks up one search term in the index %hsh.
#
# Argument: the term to look for.
# Returns:  a reference to a hash whose keys are the files containing
#           the term and whose values are relevance weights, computed as
#           $adjust + log(occurrences in file / word count of file).
#           The hash is empty for stop words, for unknown terms and for
#           an undefined or empty term.
# Output:   prints whether the term was found, was not found, or is a
#           stop word.
sub findterm {
  my ($word) = @_;
  my %weights = ();

  # Nothing to look up; also keeps undef out of the hash lookups below.
  return \%weights unless defined $word && length $word;

  if (defined $stopwords{$word}) {
    print "You may not search for <$word>,"
     .    " that could lead to a huge list of hits\n";
    return \%weights; # empty list
  }

  if (!defined $hsh{$word}) {
    # Report the miss unconditionally: the decision depends only on
    # this sub's own argument, not on a variable of the caller.
    print "$word was not found\n";
    return \%weights;
  }

  print "$word was found\n";
  foreach my $onefile (keys %{$hsh{$word}}) {
    my $positions = $hsh{$word}->{$onefile};
    next unless defined $positions;

    # Count the positions themselves (the non-empty fields). The last
    # index of the split list is one less than the number of fields, so
    # using it would turn a single occurrence into log(0), which is a
    # fatal error.
    my $hits  = grep { length } split(/,/, $positions);
    my $total = $hsh{"wordcount:"}->{$onefile};

    # Skip files without a usable word count instead of dividing by
    # zero or taking the log of zero.
    next unless $hits > 0 && defined $total && $total > 0;

    # relevance: log of the relative frequency, shifted by $adjust
    $weights{$onefile} = $adjust + log($hits / $total);
  }

  return \%weights;
}


# Determine the word count of the longest indexed file. Its logarithm
# (never less than 0) becomes $adjust, the offset that is added to every
# relevance weight.
{
  my $longest = 0;
  foreach my $count (values %{$hsh{"wordcount:"}}) {
    next unless defined $count;
    $longest = $count if $count > $longest;
  }

  # log() of zero is a fatal error in Perl, so the logarithm is taken
  # once, and only of a count large enough to give a positive result;
  # empty files or an empty index simply leave the offset at 0.
  $adjust = $longest > 1 ? log($longest) : 0.0;

  print "The longest file contains " . $longest . " words\n";
}


# Interactive query loop: prompt for a line of search terms, look every
# term up with findterm, and print the collected files with their
# weights. The loop ends on an empty line or at end of input.
my $words;
while (1) {
  %results = ();              # every query starts with no hits
  print "Search for which terms (space separated) ?\n";
  $words = <STDIN>;
  last unless defined $words; # end of input: stop without touching undef
  chomp $words;
  $words = lc($words);        # terms are looked up in lowercase

  # split ' ' ignores leading whitespace and treats any run of
  # whitespace as one separator, so it never produces empty terms.
  foreach my $term (split(' ', $words)) {
    print "Checking for <$term>\n";
    my $moreres = findterm($term);
    foreach my $file (keys %{$moreres}) {
      # Not a real set operation yet: a later term overwrites the
      # weight an earlier term stored for the same file.
      $results{$file} = $moreres->{$file};
    }
  }

  # Ascending numeric order: the highest weight is printed last.
  print "Results by relevance:\n";
  foreach my $res (sort { $results{$a} <=> $results{$b} } keys %results) {
    printf "%-20s (%3.3f)\n", $res, $results{$res};
  }

  # An empty line ends the session. Testing the length rather than the
  # truth value keeps a query of "0" from being mistaken for "no input".
  last unless length $words;
}

