1: #!/usr/bin/perl
2:
3: #############################
4: #
5: # Version 2.0
6: #
7: # A simple multi-format log parser which is intended to
8: # be used as a filter. Could be faster, but it does
9: # allow you to define a pretty output format.
10: #
11: # Author: Chris Jensen
12: #
13: # Update:
14: #
15: # - If log format is unspecified, an attempt is
16: # made to determine the closest matching format
17: # by analyzing a log entry.
18: #
19: # - Reduced amount of code; Sub-formats defined
20: # similar to log formats; Minor changes.
21: #
22:
23: use Getopt::Long;
24:
# Command-line options, keyed by their long name:
#   --type|-t <name>       log format to assume (optional)
#   --pattern|-p <string>  output pattern (required)
my %optctl;

# GetOptions returns false after complaining on STDERR about an unknown or
# malformed option; show the usage text rather than carrying on with
# whatever happened to parse.
GetOptions(\%optctl, "type|t=s", "pattern|p=s") or usage();
27:
28:
# Known log formats, keyed by type name.  Each entry is a pair:
#   [ compiled regex, [ control letters ] ]
# where the Nth control letter names the value captured by the Nth
# group of the regex.
my $log_formats = do {

    # Fields shared by every access-log flavour:
    #   host logname user [time] "request" status bytes
    # The bytes field is logged as "-" when no body was sent (e.g. 304
    # responses), so it must accept a lone dash as well as digits.
    my $core = qr{(\S+) (\S+) (\S+) \[([^\]]*)\] "([^"]*)" (\d+) (\d+|-)};

    # A double-quoted field such as the referrer or user agent.
    my $quoted = qr{"([^"]*)"};

    # NOTE(review): 'custom' lists A before R, the reverse of 'combined'
    # and 'extended' -- presumably a site-specific field order; confirm.
    +{
        'common'   => [ $core,                                       [qw(h l u t r c b)] ],
        'virtual'  => [ qr{(\S+) $core},                             [qw(v h l u t r c b)] ],
        'combined' => [ qr{$core $quoted $quoted},                   [qw(h l u t r c b R A)] ],
        'referer'  => [ qr{(\S+) \-\> (\S+)},                        [qw(R r)] ],
        'agent'    => [ qr{(\S+)},                                   [qw(A)] ],
        'extended' => [ qr{$core $quoted $quoted (\d+) (\d+)},       [qw(h l u t r c b R A P T)] ],
        'custom'   => [ qr{$core $quoted $quoted (\d+)},             [qw(h l u t r c b A R T)] ],
    };
};
38:
39:
# Log type requested with -t.  'unknown' defers the choice to the
# auto-detection performed on the first input line.
my $type = $optctl{type} || 'unknown';

# The output pattern is mandatory; without it there is nothing to print.
my $pattern = $optctl{pattern};
usage() unless defined $pattern && length $pattern;

# An explicit type must name one of the defined formats.  Without this
# check an unrecognised name leaves the format undefined and every input
# line is printed as blanks.
if ($type ne 'unknown' && !exists $log_formats->{$type}) {
    die "Unknown log type '$type'; valid types are: "
        . join(', ', sort keys %{$log_formats}) . "\n";
}

# Regex and control-letter list of the selected format (both undefined
# until auto-detection has run when the type is 'unknown').
my ($format, $control) = @{ $log_formats->{$type} || [] };

# Translate the user's pattern into a printf format plus the ordered list
# of control letters that supply its arguments, in a single pass so the
# two can never disagree:
#   %<flags><letter>  ->  %<flags>s   and <letter> is appended to @pats
#   %%                ->  %%          (literal percent sign)
#   any other %       ->  %%          (kept literal instead of confusing printf)
# Text before the first '%' is left alone; it is never mistaken for a
# control letter.
my @pats;

my $outpat = $pattern . "\n";
$outpat =~ s{%(?:(%)|([^a-zA-Z_%]*)([a-zA-Z_]))?}{
    defined $3 ? do { push @pats, $3; "%${2}s" } : '%%'
}ge;
52:
53:
54: # Formats and sub-formats are now defined similarly
55:
# Sub-formats break a captured field down further.  They are keyed by the
# control letter of the parent field and use the same layout as the log
# formats: [ compiled regex, [ control letters, one per capture group ] ].
my $sub_formats = {
    # Time of request: day/month/year:hour:minute:second
    't' => [ qr{(\d+)\/(\w+)\/(\d+)\:(\d+)\:(\d+)\:(\d+)\s}, [qw(d m y H M S)] ],

    # Request line: method, path, optional query string, protocol
    'r' => [ qr{(\w+)\s([^\?]*)\??([^\s]*)?\s(.*)}, [qw(a f q p)] ],

    # Remote user/session, written as <session>-<user>
    'u' => [ qr{(\w*)\-(\w*)}, [qw(s i)] ],

    # Referrer: host and path.  The scheme is anchored at the start of the
    # string so that a URL embedded later (e.g. in a query string) cannot
    # be mistaken for the referrer's own host, and the path is optional so
    # that a bare "scheme://host" still yields the host.
    'R' => [ qr{^[^:/]*://([^/]+)(/.*)?}, [qw(o F)] ]
};
62:
63:
# Main filter loop: read log lines from the files named on the command
# line (or STDIN), and print one formatted line for each entry that
# matches the selected log format.  Non-matching lines are skipped.
while (defined(my $line = <>)) {

    # Attempt to automatically determine log type/format from the first
    # line read: pick the matching format with the most control entities.
    if ($type eq 'unknown') {
        my $most = 0;
        foreach my $name (keys %{$log_formats}) {
            my ($fmt, $ctl) = @{ $log_formats->{$name} };
            next unless @{$ctl} > $most && $line =~ $fmt;
            ($most, $format, $control, $type) = (scalar @{$ctl}, $fmt, $ctl, $name);
        }
        die "Can't auto-determine log type\n" if ($type eq 'unknown');
    }

    # A type that names no defined format would otherwise be matched
    # against an empty pattern and print a blank line per input line.
    die "No format defined for log type '$type'\n" unless defined $format;

    # In list context the match returns the captured groups in order, so
    # no numbered capture variables (or symbolic references to them) are
    # needed.
    my @fields = ($line =~ $format) or next;

    # Map each control letter to the field it captured.
    my %info;
    @info{ @{$control} } = @fields;

    # Break down the fields that have a sub-format (time, request, ...);
    # a field whose sub-format does not match contributes nothing extra.
    foreach my $ctl (@{$control}) {
        my $sub = $sub_formats->{$ctl} or next;
        my ($sfmt, $sctl) = @{$sub};
        my @parts = ($info{$ctl} =~ $sfmt) or next;
        @info{ @{$sctl} } = @parts;
    }

    # Letters the log format does not provide print as empty strings.
    printf($outpat, @info{@pats});
}
105:
106:
# Print the usage summary and terminate the program.
#
# This is reached when the command line is unusable, so the text goes to
# STDERR (STDOUT is the filter's data stream and may be piped elsewhere)
# and the exit status defaults to failure.
#
# Parameters:
#   $status - optional exit status; defaults to 1.
# Does not return.
sub usage {
    my ($status) = @_;
    $status = 1 unless defined $status;

    print STDERR <<'END_USAGE';

usage: logparse [-t=<type>] -p=<pattern>

example: tail -50 access_log | logparse -t=extended -p="%H:%M %-15o %f"

Formatting characters:

v - The virtual host name/address
h - The host IP name/address
l - The remote logname
u - Remote User/Session
t - The time of the request
r - The full request
c - The HTTP code (302, 200, etc)
b - Bytes
R - Referrer string
A - User Agent string
P - Process ID
T - Time taken in seconds

Request string breakdown:

a - Action/Method (GET, POST, etc)
f - File path
q - Query string
p - HTTP protocol version

Time of request breakdown:

d - Day of the month
m - Month (Apr, May, etc)
y - Year
H - Hour
M - Minute
S - Second

User Session breakdown:

s - Session ID
i - User ID

Referrer string breakdown:

o - Host of referrer
F - File path of referrer

END_USAGE

    exit($status);
}
Re: Multi-Format Log Parser - Version 2.0
by grinder (Bishop) on Jan 16, 2002 at 17:07 UTC
|
Neat stuff. ++ for using \"([^\"]*)\" instead of \"(.*?)\" that is all too often seen.
Bear in mind though, that strange User-Agent strings can break your regexp. Specifically, I once encountered "Slurp 1.0" (literally, with the quotes) as a user agent in my log file.
This was a real bugger to work around. I suppose a sufficiently well crafted regexp could extract foo from "foo" as well as bar from ""bar"". I solved the problem in a two-step process, by matching the prior fields, and then matching the latter fields, and then what was left was the user agent field. Keep in mind that ""user "foo" bar" could appear as a user agent. It gets icky.
--g r i n d e r
print@_{sort keys %_},$/if%_=split//,'= & *a?b:e\f/h^h!j+n,o@o;r$s-t%t#u';
| [reply] [d/l] [select] |
|
You're right about the improper use of quotes within a user agent string. That could cause pattern matches to fail, and those would be skipped. I'm thinking about adding an option to print log lines that don't match the currently selected format to STDERR, or a count of lines that didn't match.
From using this on a fairly large web site, I know the patterns match our traffic fairly well, but it will be interesting to see how many lines don't match and why. I did a dump of counts per unique user agent string using this log parser a few days ago for our QA department and in one day's worth of logs there were 82,279 unique user agent strings. Our QA guys are after percentages of traffic per browser and platform, and I don't relish their job of parsing all the user agent strings to get that information since they don't follow any standardized format.
| [reply] |
|
I implemented a quick debug option that spits non-matches out to STDERR. In testing I found a pattern bug with byte counts of 304 log entries. Both are fixed in the following diff:
26c26
< GetOptions (\%optctl, "type|t=s", "pattern|p=s");
---
> GetOptions (\%optctl, "type|t=s", "pattern|p=s", "debug|d=i");
30,32c30,32
< 'common' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d
++) (\d+)}, [qw(h l u t r c b)] ],
< 'virtual' => [ qr{(\S+) (\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)
+\" (\d+) (\d+)}, [qw(v h l u t r c b)] ],
< 'combined' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d
++) (\d+) \"([^\"]*)\" \"([^\"]*)\"}, [qw(h l u t r c b R A)] ],
---
> 'common' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d
++) ([\d\-]+)}, [qw(h l u t r c b)] ],
> 'virtual' => [ qr{(\S+) (\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)
+\" (\d+) ([\d\-]+)}, [qw(v h l u t r c b)] ],
> 'combined' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d
++) ([\d\-]+) \"([^\"]*)\" \"([^\"]*)\"}, [qw(h l u t r c b R A)] ],
35,36c35,36
< 'extended' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d
++) (\d+) \"([^\"]*)\" \"([^\"]*)\" (\d+) (\d+)}, [qw(h l u t r c b R
+A P T)] ],
< 'custom' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d
++) (\d+) \"([^\"]*)\" \"([^\"]*)\" (\d+)}, [qw(h l u t r c b A R T)]
+],
---
> 'extended' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d
++) ([\d\-]+) \"([^\"]*)\" \"([^\"]*)\" (\d+) (\d+)}, [qw(h l u t r c
+b R A P T)] ],
> 'custom' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d
++) ([\d\-]+) \"([^\"]*)\" \"([^\"]*)\" (\d+)}, [qw(h l u t r c b A R
+T)] ],
102a103,104
> } elsif ($optctl{debug} == 1) {
> print STDERR $_;
With the new patterns, a quick match against 79154 lines from an access log of 'extended' format had 8 lines which didn't match. All of them were because of quotes in the request or the user agent strings.
Here's a user agent that didn't match...
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Q312461; <HTML><A%
+20HREF="http://www.pghconnect.com/">www.pghconnect.com</a></HTML>)"
| [reply] [d/l] [select] |
|
|