#!/usr/bin/perl
#
# Reduce a web-server access log to "<id> <path>" records.
#
# One record is printed for every log line that is a GET answered with
# status 200 and whose path survives the filters in _is_wanted_path().
# The second whitespace-delimited field of the line (presumably the client
# address -- confirm against the log format in use) is replaced by a
# numeric id.  Ids are handed out in order of first appearance, starting
# at 10001, so the same client always maps to the same id within one run.
#
# Input is read from the files named on the command line, or from STDIN
# when none are given.  Output goes to STDOUT.

use strict;
use warnings;

my %id;                 # client field => id already assigned to it
my $next_id = 10000;    # last id handed out; pre-incremented before use

# Run the filter only when executed as a script, so the subs below can be
# loaded and called directly without touching STDIN.
main() unless caller;

# Read every input line and print the record derived from it, if any.
sub main {
    while ( my $line = <> ) {
        my $record = anonymized_record($line);
        print $record if defined $record;
    }
    return;
}

# Turn one access-log line into an output record.
#
#   $line - a single raw log line
#
# Returns "<id> <path>\n", or undef when the line is not a status-200 GET
# or its path is filtered out.  Ids are only consumed by lines that
# produce a record.
sub anonymized_record {
    my ($line) = @_;

    # The first field is skipped, the second is the client field; the
    # request must be a GET whose status starts with 200.
    my ( $ip, $path ) =
        $line =~ m!^\S+ (\S+) .+ "GET ([^"]+) HTTP/\d\.\d" 200!
        or return;

    return unless _is_wanted_path($path);

    my $id = ( $id{$ip} ||= ++$next_id );
    return "$id $path\n";
}

# Decide whether a requested path should appear in the output.
# Returns 1 to keep the path, 0 to drop it.
sub _is_wanted_path {
    my ($path) = @_;

    # Skip directories
    return 0 if $path =~ m{/\z};    # Directory
    return 0 if $path =~ m{/\?};    # Directory with sort parms

    # Skip certain directories
    return 0 if $path =~ m{\A/(?:icons|misc|ports|src)/};

    # Skip certain file extensions
    return 0 if $path =~ m{\.(?:rss|html|meta|readme)\z};

    # Skip CPAN & distro maintenance stuff
    return 0 if $path =~ m{CHECKSUMS\z};
    return 0 if $path =~ m{MIRRORING};

    # Module list stuff
    return 0
        if $path =~ m{00whois\.|01mailrc\.|02packages\.details|03modlist\.};

    return 1;
}

__END__

Sample records in the output format ("<id> <path>"):

16395 /authors/id/K/KE/KESTER/WWW-Yahoo-DrivingDirections-0.07.tar.gz
10001 /authors/id/K/KW/KWOOLERY/Buzznet-API-0.01.tar.gz
85576 /authors/id/J/JR/JROGERS/Net-Telnet-3.01.tar.gz
85576 /authors/id/J/JR/JROGERS/Net-Telnet-3.02.tar.gz
85576 /authors/id/J/JR/JROGERS/Net-Telnet-3.03.tar.gz