Beefy Boxes and Bandwidth Generously Provided by pair Networks
XP is just a number
 
PerlMonks  

Search term stats from web log

by fuzzysteve (Beadle)
on Nov 16, 2001 at 18:15 UTC ( [id://125818]=CUFP: print w/replies, xml ) Need Help??

Just a quick script to parse through an AltaVista search engine log and generate a file that lists which terms were used, and how often. Outputs in reverse numerical order of frequency, then in alphabetical order of terms when more than one term has the same number of uses. Nothing fancy, just for creating a file to be displayed by another script.


Update:
Edited the sort loop to dump the use of a couple of temporary variables. I left the older code there so it's easier to understand (and because I haven't got round to testing the newer code).
Update:
Take a look at this node; it has some suggestions for tightening up the sorting code at the end of the script. I'll probably get round to rewriting this to use a more efficient algorithm at some point, but for now it works, as long as the files you're using it on aren't huge.
#!/usr/bin/perl
use strict;
use warnings;

## Program to parse out the various search terms that are entered into the
## search engine. It reads the weekly search log, counts how often each term
## occurs in the "q" parameter of /cgi-bin/query requests, and writes one
## "term::::count" line per term to a stats file named after yesterday's
## week number and year.

use Text::ParseWords;
use CGI;
use Unix::Syslog qw(:macros);    # Syslog macros
use Unix::Syslog qw(:subs);      # Syslog functions
use Date::Manip;                 # to get the week of the year

## Generic error function that logs to the syslog.
## Takes the message text as its only argument. It only logs and returns;
## the caller decides whether the failure is fatal.
sub error_with_rotation {
    my $error_mess = shift;
    openlog('Log Rotate', 0, LOG_SYSLOG);

    # Pass the text through '%s' so a literal '%' in the message is not
    # interpreted as a format directive.
    syslog(LOG_CRIT, '%s', "Daily Log rotation failed: $error_mess");
    closelog();
    return;
}
## End of error function

## Logs the message via error_with_rotation and stops the script with a
## non-zero exit status. Used where carrying on would only operate on a
## filehandle that was never opened or produce a truncated report.
sub abort_run {
    my $error_mess = shift;
    error_with_rotation($error_mess);
    exit 1;
}

my $file_to_be_parsed = "/usr/local/apache/logs/oldlog/weeklysearch.log";

## Date information for the previous day ($p_****)
my $p_year = UnixDate("yesterday", "%Y");
my $p_week = UnixDate("yesterday", "%W");

my %queryterms;    # search term => number of times it was seen
my $results = "/usr/local/apache/htdocs/stats/$p_week$p_year.log";

open(my $in_fh, '<', $file_to_be_parsed)
    or abort_run("Search log parse open failed: $!");

LINE:
while (my $line = <$in_fh>) {
    next LINE if $line !~ m{/cgi-bin/query};

    # The sixth space-separated, quote-aware field is taken to be the
    # request string; parse_line returns nothing on unbalanced quotes, so
    # the field may be missing.
    my @words   = parse_line(' ', 0, $line);
    my $request = $words[5];
    next LINE unless defined $request;

    # Second word of the request string is the requested URI.
    my @request_parts = parse_line(' ', 0, $request);
    my $uri           = $request_parts[1];
    next LINE unless defined $uri;

    # Hand CGI only the part after the first '?'. Otherwise the script path
    # is glued onto the first parameter's name and "q" is missed whenever
    # it is the first parameter.
    (my $query_string = $uri) =~ s/\A[^?]*\?//;
    next LINE unless length $query_string;

    my $query       = CGI->new($query_string);
    my $queryphrase = $query->param('q');
    next LINE unless defined $queryphrase && length $queryphrase;

    # Split on single whitespace characters; runs of whitespace produce
    # empty pieces, which are skipped. A literal "0" is a valid term.
    for my $term (parse_line('\s', 0, $queryphrase)) {
        next unless defined $term && length $term;
        $queryterms{$term}++;
    }
}

close($in_fh)
    or error_with_rotation("Search log parse close failed: $!");

## Sort area. Most frequently used terms first; terms with the same count
## are ordered alphabetically ignoring case, with a plain string comparison
## as the last tie-break so that case variants come out in a fixed order.
my @ranked_terms = sort {
           $queryterms{$b} <=> $queryterms{$a}
        || uc($a) cmp uc($b)
        || $a cmp $b
} keys %queryterms;

open(my $out_fh, '>', $results)
    or abort_run("Search log results open failed: $!");

for my $term (@ranked_terms) {
    print {$out_fh} $term . "::::" . $queryterms{$term} . "\n";
}

# Buffered write errors only surface at close, so treat them as fatal.
close($out_fh)
    or abort_run("Search log results close failed: $!");

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: CUFP [id://125818]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others avoiding work at the Monastery: (2)
As of 2024-04-25 19:32 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found