1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
# comment
# Function definitions can be used to write re-usable expressions. Recursion is
# not allowed. All functions are pure. In fact, all expressions are pure.
# The constant pi, written to double-precision accuracy (15 decimal places).
# Takes no arguments; like every function in this language it is a pure
# expression.
func pi() 3.141592653589793;
# isbot(ua): true when the user-agent string 'ua' matches the regex
# /googlebot|bing/ case-insensitively (the 'i' flag).
# NOTE(review): the pattern is unanchored, so the 'bing' alternative matches
# any user agent that merely contains that substring -- confirm this is the
# intended breadth.
func isbot(ua) ua ~ /googlebot|bing/i;
# This defines a procedure, which is a list of statements that can be run or
# re-used in other procedures.
proc apache {
# Apache log parser. Reads the log entry from the 'msg' variable, and
# stores the parsed results in separate variables, one per named group:
#   ip, vhost, user, timestamp, method, path, version, code, size,
#   referer, useragent
#
# Notes on the pattern below:
# - It is written with the 'x' flag: layout whitespace is used freely and
#   every literal space in the log line is spelled '[ ]' (presumably
#   Perl/PCRE-style extended mode -- TODO confirm against the regex engine).
# - It is anchored at the start ('^') but not at the end of the line.
# - 'version' only accepts a major digit of 0, 1 or 2 followed by '.' and
#   one digit.
# - The size column may be a number or a literal '-'; in the '-' case the
#   'size' group does not participate in the match.
# - 'referer' and 'useragent' must each contain at least one character that
#   is not a double quote.
regex msg /^ (?<ip> [^ ]+ )
[ ] (?<vhost> [^ ]+ )
[ ] (?<user> [^ ]+ )
[ ] \[ (?<timestamp> [^\]]+ ) \]
[ ] " (?<method> [^ ]+ )
[ ] (?<path> [^ ]+ )
[ ] HTTP\/ (?<version> [012]\.[0-9] ) "
[ ] (?<code> [0-9]{3} )
[ ] ((?<size> [0-9]+ )|-)
[ ] " (?<referer> [^"]+ ) "
[ ] " (?<useragent> [^"]+ ) "/x;
}
proc foo {
# Example procedure: sets two fields, parses each event with the 'apache'
# proc, filters events, groups them by (ip, code), sorts by ip descending
# with a limit of 10, and shows the ip and code columns.

# Setting and using variables/fields (I still need to settle on consistent
# terminology).
set somefield 2e100;
# NOTE(review): '.' here is presumably string concatenation of 'somefield'
# and 1 -- confirm against the expression grammar.
set somefield2 somefield.1;
# 'include' the apache proc defined above. This behaves as if the
# statements from the referenced procedure are copy-pasted into this
# location.
use apache;
# Remove events that do not match an expression.
# XXX: 'filter' is a poor name for this; it is hard to tell whether it is
# supposed to throw away or keep the events that match the expression.
# 'ignore', 'skip' or 'exclude' would be a better name.
# NOTE(review): 'size' is not captured by the apache regex when the log
# line has '-' in the size column; how 'size != 0' evaluates in that case
# is not defined here -- TODO confirm.
filter size != 0;
filter !isbot(useragent);
## Aggregation syntax:
group ip, code;
## After a 'group', all grouped-on fields are directly accessible; the
## fields from above the 'group' statement must be accessed through
## aggregation functions (like in SQL). e.g.
# set avgsize = avg(size)
## Syntax unclear, but the following SQL-like things would be nice:
# set avgsize = sum(size) / count() where isint(size)
# set avg200 = avg(size) where code = 200
# filter count() < 50 # Less than 50 hits from a single IP -> not interesting
## Double aggregation (rather contrived example)
# group avgsize
# filter count() < 5 # Less than 5 IP addresses for a single average size
## ORDER BY .. LIMIT .. (we don't need the generality of SQL, so stick to getting a top n sort)
# sort <number|string> <expr> <asc|desc> limit <num>
#sort string ip asc limit 10;
sort string ip desc limit 10;
# The expression(s) to output. Only one 'show' is allowed in a program, and it
# must be the last statement.
show ip, code;
## Multiple expressions should generate a suitably-sized table.
## ("suitably-sized" is hard to tell when there's no top-n-sort or
## aggregation going on. Probably stick to CSV or so in that case?)
## If no 'show' statement is provided, then by default it should behave as
## if a 'show msg' command was used for non-aggregated procedures. For
## aggregated procedures it could be 'show <expr-grouped-on>'.
}
# Type system notes:
# I'd prefer actually typed variables, with automatic type inference. This
# could be kept simple with just a few basic types, e.g.:
# String, Int, Float, Bool, Timestamp, IP
# Each type implementing from-string and to-string, to allow for displaying
# and automatic parsing from a regex. Operators and functions can be made
# strict, e.g. (Int > Float) is a type error.
#
# Unfortunately, that doesn't seem terribly easy to implement, so I'll
# stick with string typing instead. I.e. everything is a string, and it is
# up to each function or operator to define how it interprets its
# arguments. This, of course, requires separate operators for string
# and integer comparison.
#
# Ints vs. Floats? Use the (old) Lua approach and force everything to be a
# Float? Or, better yet, Haskell 'Rational'? I suspect that might get
# slow, though.
#
# Boolean context: "" and "0" are false, otherwise true.
|