count_urls

Here's a script that count URL usage from an Apache log.  By default, any URL usage that is over 1% of total will be listed and the user's IPs. This is an attempt to look for data miners who tend to hit one URL over and over. 1% works out well here, but we have ~20000 hits a day. This is slightly customized for our site (e.g. default URL to ignore) so YMMV.

Usage: /opt/util/count_urls.pl [-p X|-d x|-t X|-N]
  where
    -d X   : debug level
    -I     : all URLs (def is ignore: Dispatch.pl)
    -p X   : percent of total URLs to consider
              (def is 1 %)
    -n     : don't use nslookup to resolv IPs
    -t X   : threshold to consider (overrides percentage)
    -r X   : retry X times w/ smallter thresholds count (def is 5)

 

#!/usr/bin/perl -w
#$Header, count_urls.p v0.5.5 05/06afb$
#
# v0.5     whole cloth invention - ugly data structure
# v0.5.1   added hack to at least keep 2nd place in list
# v0.5.2   sorted results, nslookup option (-N)
# v0.5.3   added 'known IPs list'
# v0.5.4   updated 'known IPs list'
# v0.5.5   fixed Wade IP range
#
use strict;
use Getopt::Std;
# default % of total URLs
my $percent_total_def = 1;
my $retry_threshold_count_def = 5;
my $nslookup = '/usr/bin/nslookup';
my $ignore_urls = 'Dispatch.pl';
my $Usage = "Usage: $0 [-p X|-d x|-t X|-N]
  where
    -d X   : debug level
    -I     : all URLs (def is ignore: $ignore_urls)
    -p X   : percent of total URLs to consider
              (def is $percent_total_def %)
    -n     : don't use nslookup to resolv IPs
    -t X   : threshold to consider (overrides percentage)
    -r X   : retry X times w/ smallter thresholds count (def is $retry_threshold_count_def)
";
my %opts;
my (%ips, %max_urls);
getopts('d:Inp:r:t:', \%opts)
  or die "$Usage";
my $threshold = $opts{t} || 0;
my $percent_total = $opts{p} || $percent_total_def;
my $debug = $opts{d} || 0;
my $resolve_ips = $opts{n} ? 0 : 1;
# allow for -r 0
my $retry_threshold_count =
           defined($opts{r}) ? $opts{r} :  $retry_threshold_count_def;
my $dont_ignore_urls = $opts{I} || 0;

# user/site name and IP Re  - for ID in output

my %known_ips = ( Epoch => '106.40.(?:192|2(?:[01][0-9]|2[0123])).',
                  'Jupiter Esources' => '15.11.156.5',
                  'Jupiter Esources 65' => '15.11.156.5',

);
 

my $count_urls = 0;

while (<>) {
  #next unless /POST|GET/;
  next if /$ignore_urls/ and not $dont_ignore_urls;
  $count_urls++;
# get IP and URL part - only considering those w/ data, i.e. '?'
# 68.143.95.26 - - [24/Mar/2006:17:42:07 -0600] "GET /cgi-bin/login.pl HTTP/1.1" 200 516
  if ( /^(\d+\.\d+\.\d+\.\d+)\s+[^"]+"(\S+\s+[^?]+)\?/ ) {
    my ($ip, $url) =  ($1, $2);
    warn("$ip - $url\n")
      if $debug > 3;
# track count
    $ips{$ip}->{$url}++;
# max and an array of top 5 IPs -
# cheesey stack
    if ( not defined($max_urls{$url}->{count}) ) {
      $max_urls{$url}->{count} = $ips{$ip}->{$url};
# hack to keep the 2nd place in the race
      $max_urls{$url}->{prev_count} = 0;
      push(@{ $max_urls{$url}->{ips} }, $ip);
    }
    elsif ( $ips{$ip}->{$url} > $max_urls{$url}->{count} ) {
      $max_urls{$url}->{count} = $ips{$ip}->{$url};
# ditch IP if its already in array
      @{ $max_urls{$url}->{ips} }
        = grep { ! /^$ip$/ } @{ $max_urls{$url}->{ips} };
# 5 only
      pop @{ $max_urls{$url}->{ips} }
        if @{ $max_urls{$url}->{ips} } > 4;
# add new high score
      unshift(@{ $max_urls{$url}->{ips} }, $ip );
    }
# hack to keep the 2nd place in the race
    elsif ( $ips{$ip}->{$url} > $max_urls{$url}->{prev_count} ) {
      $max_urls{$url}->{prev_count} = $ips{$ip}->{$url};
      @{ $max_urls{$url}->{ips} }
        = grep { ! /^$ip$/ } @{ $max_urls{$url}->{ips} };
      pop @{ $max_urls{$url}->{ips} }
        if @{ $max_urls{$url}->{ips} } > 4;
      push(@{ $max_urls{$url}->{ips} }, $ip );
    }

  }
}  # while <>

    if ( $debug > 8 ) {
      warn "Threshold of $threshold\n\n";
      foreach my $ip ( keys %ips ) {
        foreach my $url ( keys %{ $ips{$ip} } ) {
          warn "$ip -> $url ($ips{$ip}->{$url})\n"
              if $ips{$ip}->{$url} > $threshold;
           warn "max: $ip -> $url ($ips{$ip}->{$url})\n"
              if $ips{$ip}->{$url} + $threshold > $max_urls{$url}->{count};
        }
      }
    }    # if debug > 8
my $shown_header = 0;
# if using percent of total try a couple if nothing prints.
my $retry_smaller = 1;
# either x % of total URLs or specific threshold
if ( ! $threshold ) {
  $threshold = int($count_urls * ($percent_total * .01));
}

for ( 0 .. $retry_threshold_count) {
  foreach my $url ( sort keys %max_urls ) {
     next
        unless $threshold <= $max_urls{$url}->{count};
    print "Threshold of $threshold ($count_urls)\n\n"
       unless $shown_header++;
    print "URL: $url\n";
#Schwartzian transform
    my @ip_list =
      map { $_->[1] }
      sort { $b->[0] <=> $a->[0] }
      map { [ $ips{$_}->{$url}, $_ ] }
      @{ $max_urls{$url}->{ips} };
    #foreach my $ip ( @{ $max_urls{$url}->{ips} } ) {
    my %ip_looked_up;
    foreach my $ip ( @ip_list ) {
      my $ip_name_str = '';
      if ( $resolve_ips ) {
        if ( ! $ip_looked_up{$ip} ) {
          my ($res) = map { /Name:\s+(.*)/ }`$nslookup $ip 2> /dev/null`;
          $ip_name_str = "($res)" if $res;
          foreach my $known_ip ( keys %known_ips ) {
            my $ip_re = $known_ips{$known_ip};
            warn("trying $ip =~ $ip_re ($known_ip)" )
              if $debug > 4;
            if ( $ip =~ /$ip_re/ ) {
              $ip_name_str .= "($known_ip)";
            }
          }
          $ip_looked_up{$ip} = $ip_name_str;
        }
        else {
          $ip_name_str = $ip_looked_up{$ip};
        }
      }  #  if ( $resolve_ips )
      printf("\t%s had %02d %s\n", $ip, $ips{$ip}->{$url}, $ip_name_str);
# we've printed so don't retry
      $retry_smaller = 0;
    }
  }

# set threshold or something printed
  last unless $retry_smaller;
# notch it down a tad and try again
  $threshold -= $threshold > 100 ? 10
              : $threshold > 10  ? 5
              :                    1;

}  # for ( 1 .. $retry_threshold_count)
 

 

Keywords:

Search | Most Popular | Recent Changes | Wiki Home
Madison Area Perl Mongers