#!/usr/bin/perl -w
# -T can only be given on the command line (taint checking)
use strict;
use warnings;

# homework by Eric Auer...
# This is what we should implement as the final task of week4
# (not counting the optional tasks): queries with +word and
# -word and word, weighting and multiple words.

require "query-read.pl";    # read the index into a %hash
# usage: %{$hsh{$word}} has elements $hsh{$word}->{$file}
# which are position strings like "2,5,11". We also have special
# words: "title:word" and "wordcount:"

require "query-sets.pl";    # setAND setOR setSUB setCOPY
# all operate on hashrefs, all with 2 args modify the first
# hash in place as well, set ops are done on keys, values
# are taken as numeric weights and modified as well

my $TESTQ = 0;  # set for debugging output: 0 none, 1 set size,
                # 2 for set keys, 3 for keys and values

# Words that may not be searched for (see findterm).
my %stopwords = ("and", 1, "und", 1, "en", 1);

# Offset added to every relevance value: the log of the largest
# per-file word count. Do not forget to set this before using findterm!
my $adjust;

# The index, as provided by query-read.pl.
my %hsh = %{ reftoindex() };

# setSHOW(\%set)
#
# Debug helper: prints the given set according to $TESTQ
# (1: number of keys, 2: the keys, 3: keys with their weights).
# Returns 0 when debugging output is switched off, 1 otherwise.
sub setSHOW {
    my ($set) = @_;

    return 0 if $TESTQ < 1;

    if ($TESTQ < 2) {
        # keys in scalar context give the size of the set
        print scalar keys %{$set};
        print "\n";
    }
    elsif ($TESTQ < 3) {
        print "(";
        print join(" ", keys %{$set});
        print ")\n";
    }
    elsif ($TESTQ < 4) {
        foreach my $key (keys %{$set}) {
            printf "%s ( %3.3f ) ", $key, $set->{$key};
        }
        print "\n";
    }
    else {
        print "setSHOW what???\n";
    }
    return 1;
}

# findterm($word)
#
# Looks up one (already lowercased) word in the index.
# Returns a hashref: keys are files containing the word, values are
# weights computed as $adjust + log(occurrences / word count of file).
# The hash is empty for stop words and for words not in the index;
# in both cases a message is printed.
sub findterm {
    my ($word) = @_;
    my %hits = ();

    if (exists $stopwords{$word}) {
        print "You may not search for <$word>,"
            . " that could lead to a huge list of hits\n";
        return \%hits;    # empty list
    }

    if (defined $hsh{$word}) {
        print "$word was found\n" unless $TESTQ < 1;
        foreach my $onefile (keys %{ $hsh{$word} }) {
            # the position string "2,5,11" has one entry per occurrence
            my @posn = split(/,/, $hsh{$word}->{$onefile});
            # store result along with relevance (log relative frequency)
            $hits{$onefile} = $adjust
                + log(scalar(@posn) / $hsh{"wordcount:"}->{$onefile});
        }
    }
    elsif (defined $word && length $word) {
        # length test rather than truthiness, so that the word "0"
        # is reported like any other missing word
        print "$word was not found\n";
    }

    setSHOW(\%hits);
    return \%hits;
}

# $adjust becomes the largest log(word count) over all files
# (never less than 0.0).
$adjust = 0.0;
foreach my $file (keys %{ $hsh{"wordcount:"} }) {
    my $logcount = log($hsh{"wordcount:"}->{$file});
    $adjust = $logcount if $logcount > $adjust;
}
print "The longest file contains " . exp($adjust) . " words\n";

# Interactive query loop: runs until an empty line or end of input.
while (1) {
    print "You can use +word and -word to force/disallow terms\n";
    print "Search for which terms (space separated) ?\n";

    my $words = <STDIN>;
    last unless defined $words;    # end of input
    chomp $words;
    $words = lc($words);           # only lowercase terms
    last if $words eq '';          # an empty line ends the session

    my %results  = ();
    my @andterms = ();
    my @orterms  = ();
    my @notterms = ();

    # split ' ' ignores leading and repeated whitespace
    foreach my $term (split(' ', $words)) {
        my ($prefix, $bare) = $term =~ /\A([+-]?)(.*)\z/s;
        # A lone "+" or "-" carries no word. Skip it, otherwise the
        # empty +term would intersect the results with an empty set.
        next if $bare eq '';

        if ($prefix eq '+') {
            unshift(@andterms, $bare);
            unshift(@orterms,  $bare);
        }
        elsif ($prefix eq '-') {
            unshift(@notterms, $bare);
        }
        else {
            unshift(@orterms, $bare);
        }
    }

    foreach my $term (@orterms) {     # collect all potential hits
        print "Adding files with <$term>\n" unless $TESTQ < 1;
        %results = %{ setOR(\%results, findterm($term)) };
        setSHOW(\%results);
    }

    foreach my $term (@andterms) {    # enforce +words
        print "Require files to have <$term>\n" unless $TESTQ < 1;
        %results = %{ setAND(\%results, findterm($term)) };
        setSHOW(\%results);
    }

    foreach my $term (@notterms) {    # enforce -words
        print "Exclude files with <$term>\n" unless $TESTQ < 1;
        %results = %{ setSUB(\%results, findterm($term)) };
        setSHOW(\%results);
    }

    print "Results by relevance:\n";
    # ascending numeric sort: the hit with the highest weight is
    # printed last
    foreach my $res (sort { $results{$a} <=> $results{$b} } keys %results) {
        printf "%-20s (%3.3f)\n", $res, $results{$res};
    }
    print scalar keys %results;
    print " file(s) matched the query\n\n";
}