count_urls
Here's a script that count URL usage from an Apache log. By default, any URL usage that is over 1% of total will be listed and the user's IPs. This is an attempt to look for data miners who tend to hit one URL over and over. 1% works out well here, but we have ~20000 hits a day. This is slightly customized for our site (e.g. default URL to ignore) so YMMV.
Usage: /opt/util/count_urls.pl [-p X|-d x|-t X|-N]
where
-d X : debug level
-I : all URLs (def is ignore: Dispatch.pl)
-p X : percent of total URLs to consider
(def is 1 %)
-n : don't use nslookup to resolv IPs
-t X : threshold to consider (overrides percentage)
-r X : retry X times w/ smallter thresholds count (def is 5)
#!/usr/bin/perl -w
#$Header, count_urls.p v0.5.5 05/06afb$
#
# v0.5 whole cloth invention - ugly data structure
# v0.5.1 added hack to at least keep 2nd place in list
# v0.5.2 sorted results, nslookup option (-N)
# v0.5.3 added 'known IPs list'
# v0.5.4 updated 'known IPs list'
# v0.5.5 fixed Wade IP range
#
use strict;
use Getopt::Std;
# default % of total URLs
my $percent_total_def = 1;
my $retry_threshold_count_def = 5;
my $nslookup = '/usr/bin/nslookup';
my $ignore_urls = 'Dispatch.pl';
my $Usage = "Usage: $0 [-p X|-d x|-t X|-N]
where
-d X : debug level
-I : all URLs (def is ignore: $ignore_urls)
-p X : percent of total URLs to consider
(def is $percent_total_def %)
-n : don't use nslookup to resolv IPs
-t X : threshold to consider (overrides percentage)
-r X : retry X times w/ smallter thresholds count (def is $retry_threshold_count_def)
";
my %opts;
my (%ips, %max_urls);
getopts('d:Inp:r:t:', \%opts)
or die "$Usage";
my $threshold = $opts{t} || 0;
my $percent_total = $opts{p} || $percent_total_def;
my $debug = $opts{d} || 0;
my $resolve_ips = $opts{n} ? 0 : 1;
# allow for -r 0
my $retry_threshold_count =
defined($opts{r}) ? $opts{r} : $retry_threshold_count_def;
my $dont_ignore_urls = $opts{I} || 0;
# user/site name and IP Re - for ID in output
my %known_ips = ( Epoch => '106.40.(?:192|2(?:[01][0-9]|2[0123])).',
'Jupiter Esources' => '15.11.156.5',
'Jupiter Esources 65' => '15.11.156.5',
);
my $count_urls = 0;
while (<>) {
#next unless /POST|GET/;
next if /$ignore_urls/ and not $dont_ignore_urls;
$count_urls++;
# get IP and URL part - only considering those w/ data, i.e. '?'
# 68.143.95.26 - - [24/Mar/2006:17:42:07 -0600] "GET /cgi-bin/login.pl HTTP/1.1" 200 516
if ( /^(\d+\.\d+\.\d+\.\d+)\s+[^"]+"(\S+\s+[^?]+)\?/ ) {
my ($ip, $url) = ($1, $2);
warn("$ip - $url\n")
if $debug > 3;
# track count
$ips{$ip}->{$url}++;
# max and an array of top 5 IPs -
# cheesey stack
if ( not defined($max_urls{$url}->{count}) ) {
$max_urls{$url}->{count} = $ips{$ip}->{$url};
# hack to keep the 2nd place in the race
$max_urls{$url}->{prev_count} = 0;
push(@{ $max_urls{$url}->{ips} }, $ip);
}
elsif ( $ips{$ip}->{$url} > $max_urls{$url}->{count} ) {
$max_urls{$url}->{count} = $ips{$ip}->{$url};
# ditch IP if its already in array
@{ $max_urls{$url}->{ips} }
= grep { ! /^$ip$/ } @{ $max_urls{$url}->{ips} };
# 5 only
pop @{ $max_urls{$url}->{ips} }
if @{ $max_urls{$url}->{ips} } > 4;
# add new high score
unshift(@{ $max_urls{$url}->{ips} }, $ip );
}
# hack to keep the 2nd place in the race
elsif ( $ips{$ip}->{$url} > $max_urls{$url}->{prev_count} ) {
$max_urls{$url}->{prev_count} = $ips{$ip}->{$url};
@{ $max_urls{$url}->{ips} }
= grep { ! /^$ip$/ } @{ $max_urls{$url}->{ips} };
pop @{ $max_urls{$url}->{ips} }
if @{ $max_urls{$url}->{ips} } > 4;
push(@{ $max_urls{$url}->{ips} }, $ip );
}
}
} # while <>
if ( $debug > 8 ) {
warn "Threshold of $threshold\n\n";
foreach my $ip ( keys %ips ) {
foreach my $url ( keys %{ $ips{$ip} } ) {
warn "$ip -> $url ($ips{$ip}->{$url})\n"
if $ips{$ip}->{$url} > $threshold;
warn "max: $ip -> $url ($ips{$ip}->{$url})\n"
if $ips{$ip}->{$url} + $threshold > $max_urls{$url}->{count};
}
}
} # if debug > 8
my $shown_header = 0;
# if using percent of total try a couple if nothing prints.
my $retry_smaller = 1;
# either x % of total URLs or specific threshold
if ( ! $threshold ) {
$threshold = int($count_urls * ($percent_total * .01));
}
for ( 0 .. $retry_threshold_count) {
foreach my $url ( sort keys %max_urls ) {
next
unless $threshold <= $max_urls{$url}->{count};
print "Threshold of $threshold ($count_urls)\n\n"
unless $shown_header++;
print "URL: $url\n";
#Schwartzian transform
my @ip_list =
map { $_->[1] }
sort { $b->[0] <=> $a->[0] }
map { [ $ips{$_}->{$url}, $_ ] }
@{ $max_urls{$url}->{ips} };
#foreach my $ip ( @{ $max_urls{$url}->{ips} } ) {
my %ip_looked_up;
foreach my $ip ( @ip_list ) {
my $ip_name_str = '';
if ( $resolve_ips ) {
if ( ! $ip_looked_up{$ip} ) {
my ($res) = map { /Name:\s+(.*)/ }`$nslookup $ip 2> /dev/null`;
$ip_name_str = "($res)" if $res;
foreach my $known_ip ( keys %known_ips ) {
my $ip_re = $known_ips{$known_ip};
warn("trying $ip =~ $ip_re ($known_ip)" )
if $debug > 4;
if ( $ip =~ /$ip_re/ ) {
$ip_name_str .= "($known_ip)";
}
}
$ip_looked_up{$ip} = $ip_name_str;
}
else {
$ip_name_str = $ip_looked_up{$ip};
}
} # if ( $resolve_ips )
printf("\t%s had %02d %s\n", $ip, $ips{$ip}->{$url}, $ip_name_str);
# we've printed so don't retry
$retry_smaller = 0;
}
}
# set threshold or something printed
last unless $retry_smaller;
# notch it down a tad and try again
$threshold -= $threshold > 100 ? 10
: $threshold > 10 ? 5
: 1;
} # for ( 1 .. $retry_threshold_count)
Keywords:



