#!/usr/bin/perl -w
# -T can only be given on the command line (taint checking)
use strict; # homework by Eric Auer...

# this is what we should implement as the final task of week4
# (not counting the optional tasks): queries with +word and
# -word and word, weighting and multiple words.

require "query-read.pl"; # read the index into a %hash
# usage: %{$hsh{$word}} has elements $hsh{$word}->{$file}
# which are position strings like "2,5,11". We also have special
# words: "title:word" and "wordcount:"

require "query-sets.pl"; # setAND setOR setSUB setCOPY
# all operate on hashrefs, all with 2 args modify the first
# hash in place as well, set ops are done on keys, values
# are taken as numeric weights and modified as well

# Debug verbosity for setSHOW and the trace messages:
#   0 = silent, 1 = print set sizes, 2 = print set keys,
#   3 = print keys together with their weights
my $TESTQ = 0;

my $oneword;   # file-scope scratch scalar
my %results;   # hits of the current query: file => weight

# terms that are refused as search words
my %stopwords = map { $_ => 1 } ("and", "und", "en");

# offset added to every weight; it MUST be assigned before the first
# call to findterm
my $adjust;


# private copy of the index delivered by query-read.pl
my %hsh = %{ reftoindex() };


# Debug helper: print the set behind a hash reference in as much
# detail as $TESTQ asks for.
#   $_[0] - hashref, keys are the set members, values their weights
# Returns 0 (and prints nothing) while $TESTQ < 1, otherwise 1.
sub setSHOW {
  my ($setref) = @_;

  return 0 if $TESTQ < 1;

  if ($TESTQ < 2) {
    # level 1: number of members only
    my $size = keys %{$setref};
    print $size . "\n";
  } elsif ($TESTQ < 3) {
    # level 2: the members, blank separated, in parentheses
    print "(" . join(" ", keys %{$setref}) . ")\n";
  } elsif ($TESTQ < 4) {
    # level 3: every member followed by its weight
    my %weights = %{$setref};
    my @cells = map { sprintf("%s ( %3.3f ) ", $_, $weights{$_}) }
                keys %weights;
    print join("", @cells) . "\n";
  } else {
    print "setSHOW what???\n";
  }

  return 1;
}


# Look up one search term in the global index %hsh.
#   $_[0] - the term (the caller lowercases it)
# Returns a reference to a fresh hash: keys are the files containing
# the term, values are weights computed as
#   $adjust + log(occurrences in file / total words in file)
# The hash is empty for stop words, unknown terms and the reserved
# bookkeeping key "wordcount:". $adjust must be set by the caller
# before the first call.
sub findterm { # returns hashref: keys are files, values weights
  my $word = defined $_[0] ? $_[0] : "";
  my %results = ();

  if (exists $stopwords{$word}) {
    print "You may not search for <$word>,"
     .    " that could lead to a huge list of hits\n";
    return \%results; # empty list
  }

  # "wordcount:" holds per-file word totals, not position strings, so
  # it must never be treated as a searchable term.
  if (defined $hsh{$word} && $word ne "wordcount:") {
    unless ($TESTQ < 1) { print "$word was found\n"; }
    my $perfile = $hsh{$word};
    foreach my $onefile (keys %{$perfile}) {
      # position strings look like "2,5,11": one entry per occurrence
      my @posn  = split(/,/, $perfile->{$onefile});
      my $total = $hsh{"wordcount:"}->{$onefile};

      # An inconsistent index (no positions, or a missing / zero word
      # total) would make log() or the division die; skip such files.
      next unless @posn && defined $total && $total > 0;

      # relevance: log of the relative frequency, shifted by $adjust
      $results{$onefile} = $adjust + log(scalar(@posn) / $total);
    }
  } elsif (length $word) {
    # length, not truthiness: the term "0" deserves the message too
    print "$word was not found\n";
  }

  setSHOW(\%results);
  return \%results;
}


# Derive $adjust, the offset findterm adds to every weight: the log of
# the largest per-file word count found under the "wordcount:" key.
# The maximum is tracked as a plain count so that a file with a count
# of 0 cannot make log() die, and so that the size is reported exactly
# instead of via exp(log(n)). With an empty index $adjust stays 0.
$adjust = 0.0;
my $maxcount = 0;
foreach my $file (keys %{$hsh{"wordcount:"}}) {
  my $count = $hsh{"wordcount:"}->{$file};
  next unless defined $count && $count > $maxcount;
  $maxcount = $count;
}
# log(1) is 0 anyway, so only larger counts change the offset
$adjust = log($maxcount) if $maxcount > 1;
print "The longest file contains $maxcount words\n";


# Interactive query loop: read one line of terms, build the hit set
# and print it; repeat until an empty line or end of input.
#   word   optional term, its files are added to the hits
#   +word  required term, files lacking it are dropped again
#   -word  forbidden term, files containing it are dropped
# setOR / setAND / setSUB come from query-sets.pl and are called with
# the current hit set and the hashref returned by findterm.
my $words;
do {
  %results = ();              # 0 pairs (empty list)
  print "You can use +word and -word to force/disallow terms\n";
  print "Search for which terms (space separated) ?\n";
  $words = <STDIN>;
  $words = "" unless defined $words; # EOF: treat like an empty line
  chomp $words;
  $words = lc($words);    # only lowercase terms
  my @andterms = ();
  my  @orterms = ();
  my @notterms = ();

  # split(' ', ...) splits on runs of whitespace and ignores leading
  # whitespace, so doubled blanks no longer yield empty terms.
  foreach my $oneword (split(' ', $words)) {
    # strip a single leading + or - and remember which one it was
    my $sign = ($oneword =~ s/^([-+])//) ? $1 : "";

    # a bare "+" or "-" names no term; a bare "+" would otherwise
    # become an empty required term and wipe out every hit
    next unless length $oneword;

    if ($sign eq "+") {
      unshift(@andterms, $oneword);
      unshift(@orterms, $oneword); # required terms also add hits
    } elsif ($sign eq "-") {
      unshift(@notterms, $oneword);
    } else {
      unshift(@orterms, $oneword);
    }
  }

  foreach my $oneword (@orterms) { # collect all potential hits
    unless ($TESTQ < 1) {
      print "Adding files with <$oneword>\n";
    }
    %results = %{setOR(\%results, findterm($oneword))};
    setSHOW(\%results);
  }

  foreach my $oneword (@andterms) { # enforce +words
    unless ($TESTQ < 1) {
      print "Require files to have <$oneword>\n";
    }
    %results = %{setAND(\%results, findterm($oneword))};
    setSHOW(\%results);
  }

  foreach my $oneword (@notterms) { # enforce -words
    unless ($TESTQ < 1) {
      print "Exclude files with <$oneword>\n";
    }
    %results = %{setSUB(\%results, findterm($oneword))};
    setSHOW(\%results);
  }

  # length, not truthiness: the query "0" is a search, not "quit"
  if (length $words) {
    print "Results by relevance:\n";
    # ascending numeric sort: the highest weight is printed last
    foreach my $res (sort {$results{$a} <=> $results{$b}}
                     (keys(%results)) ) {
      printf "%-20s (%3.3f)\n", $res, $results{$res};
    }
    print scalar keys %results;
    print " file(s) matched the query\n\n";
  }

# print "Results for query <$words> completed.\n";
} while (length $words);

