summaryrefslogtreecommitdiff
path: root/src/Logstat/Parse.hs
blob: 725f2cf3b9bdd5c4e8231d090a2b89a2c1f6889e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
{-# LANGUAGE GADTs #-}
module Logstat.Parse
  ( parseStmts
  , parseStmtsIO
  , parseFileIO
  )
where

import Control.Applicative (empty)
import Control.Monad (void,when)
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Identity
import Control.Monad.State
import Data.List (sortOn)
import System.Directory
import System.Exit (exitFailure)
import System.FilePath (takeDirectory,(</>))
import System.IO (hPutStr,stderr)
import Text.Megaparsec
import Text.Megaparsec.Expr
import qualified Data.ByteString as B
import qualified Data.Map.Strict as M
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Text.Megaparsec.Lexer as L

import Logstat.Types
import Logstat.Regex
import Logstat.Value


emapM :: Monad m => (Expr -> m Expr) -> Expr -> m Expr
emapM f e = f e >>= \(Loc pos e') -> Loc pos <$> case e' of
  EOp o a b  -> EOp o    <$> emapM f a <*> emapM f b
  EIf a b c  -> EIf      <$> emapM f a <*> emapM f b <*> emapM f c
  EMatch a b -> EMatch a <$> emapM f b
  ENeg a     -> ENeg     <$> emapM f a
  ENot a     -> ENot     <$> emapM f a
  r          -> return r

emap :: (Expr -> Expr) -> Expr -> Expr
emap f e = runIdentity (emapM (return . f) e)


type Parser m = StateT Config (ParsecT Dec String m)

-- Extracts the named subpatterns from a Regex and sorts them into a list that
-- can be zipped with the match results for quick capture extraction.
namedPatterns :: Regex -> [Maybe Field]
namedPatterns re = fill 0 $ sortOn snd $ captureNames re
  where
  fill _ [] = []
  fill i m@((name,n):xs)
    | i == n    = Just (T.unpack $ T.decodeUtf8 name) : fill (i+1) xs
    | otherwise = Nothing : fill (i+1) m


-- Whitespace and comments
ws :: Monad m => Parser m ()
ws = L.space (void spaceChar) (L.skipLineComment "#") empty

lexeme :: Monad m => Parser m a -> Parser m a
lexeme = L.lexeme ws

symbol :: Monad m => String -> Parser m ()
symbol s = void $ L.symbol ws s

identifier :: Monad m => Parser m String
identifier = lexeme $ do
  f <- letterChar
  t <- many alphaNumChar
  return (f:t)


-- Quoted literals, str: "string..", reg: /regex../
quotedlit :: Monad m => Char -> Parser m String
quotedlit q = lexeme $ between (char q) (char q) $
  many $ esc q
     <|> esc '\\'
     <|> satisfy (/= q)
  where
  esc c = string ('\\':c:[]) >> return c

str :: Monad m => Parser m B.ByteString
str = T.encodeUtf8 . T.pack <$> quotedlit '"'

reg :: Monad m => [PCREOption] -> Parser m Regex
reg defopt = do
  _ <- optional (char 'm') -- m/whatever/; If we're pretending to be perl, might as well go at least this far
  s <- quotedlit '/'
  opts <- many $ -- /s and /m seem a bit useless on single-line log entries
        extended <$ char 'x'
    <|> caseless <$ char 'i'
  case compileM (T.encodeUtf8 $ T.pack s) (defopt ++ opts) of
    Left e -> fail $ "Error in regular expression: " ++ e
    -- Studying the regex at this point is somewhat premature - we may not
    -- actually be using this regex in our program. However, we also should not
    -- defer the study to after the main program has been assembled, because at
    -- that point the func/proc expansion may have duplicated the Regex object.
    -- Studying should ideally happen after assembling the main program but
    -- before func/proc expansion. These actions are currently performed in the
    -- wrong order.
    Right r -> return (study r)


loc :: Monad m => Parser m a -> Parser m (Loc a)
loc f = getPosition >>= \pos -> Loc (Just pos) <$> f


expr :: Monad m => Parser m Expr
expr = label "expression" $ makeExprParser term
     [ [ InfixR $ bin (EOp OPow)    $ symbol "**" ]
     , [ Prefix enot
       , Prefix $ un  ENeg          $ symbol "-"  ]
     , [ InfixL $ bin (EOp OMul   ) $ (lexeme $ try $ char '*' <* notFollowedBy (char '*'))
       , InfixL $ bin (EOp ODiv   ) $ symbol "/"
       , InfixL $ bin (EOp OMod   ) $ symbol "%"  ]
     , [ InfixL $ bin (EOp OPlus  ) $ symbol "+"
       , InfixL $ bin (EOp OMinus ) $ symbol "-"
       , InfixL $ bin (EOp OConcat) $ symbol "."  ]
     , [ InfixN $ bin (EOp OLt    ) $ symbol "lt"
       , InfixN $ bin (EOp OGt    ) $ symbol "gt"
       , InfixN $ bin (EOp OLe    ) $ symbol "le"
       , InfixN $ bin (EOp OGe    ) $ symbol "ge"
       , InfixN $ bin (EOp OILt   ) $ (lexeme $ try $ char '<' <* notFollowedBy (char '='))
       , InfixN $ bin (EOp OIGt   ) $ (lexeme $ try $ char '>' <* notFollowedBy (char '='))
       , InfixN $ bin (EOp OILe   ) $ symbol "<="
       , InfixN $ bin (EOp OIGe   ) $ symbol ">=" ]
     , [ InfixN $ bin (EOp OEq    ) $ symbol "eq"
       , InfixN $ bin (EOp ONeq   ) $ symbol "ne"
       , InfixN $ bin (EOp OIEq   ) $ symbol "=="
       , InfixN $ bin (EOp OINeq  ) $ symbol "!="
       , Postfix ematch
       , Postfix enmatch ]
     , [ InfixL $ bin (EOp OAnd   ) $ symbol "&&" ]
     , [ InfixL $ bin (EOp OOr    ) $ symbol "||" ]
     , [ InfixR eif ]
     ]
  where
  bin f p = loc p >>= \(Loc pos _) -> return (\a b -> Loc pos (f a b))
  un  f p = loc p >>= \(Loc pos _) -> return (\a   -> Loc pos (f a  ))

  enot = (some $ loc $ symbol "!") >>= \l -> return $ foldr1 (.) $ map (\(Loc pos _) -> \a -> Loc pos (ENot a)) l
  ematch  = loc (symbol "~"  >> reg [no_auto_capture]) >>= \(Loc pos r) -> return (\a -> Loc pos                  $ EMatch r a)
  enmatch = loc (symbol "!~" >> reg [no_auto_capture]) >>= \(Loc pos r) -> return (\a -> Loc pos $ ENot $ Loc pos $ EMatch r a)
  eif = (try $ loc $ between (symbol "?") (symbol ":") expr) >>= \(Loc pos m) -> return (\l r -> Loc pos $ EIf l m r)

  term = loc (try (do { n <- identifier; symbol "("; return n}) >>= func)
     <|> loc (EField <$> identifier)
     <|> loc (ELit . bs <$> str)
     <|> loc (ELit . num <$> number)
     <|> between (symbol "(") (symbol ")") expr

  number = lexeme $ try L.float
       <|> fromIntegral <$> try (char '0' >> char' 'x' >> L.hexadecimal)
       <|> fromIntegral <$> try (char '0' >> char' 'o' >> L.octal)
       <|> fromIntegral <$> L.decimal

  func n
    | n == "extract" = fextract
    | n == "replace" = freplace
    | otherwise      = fdefined n

  fextract = do
    e <- expr
    symbol ","
    r <- reg []
    when (captureCount r < 1) $ fail "Regular expression does not have any subpatterns to extract"
    symbol ")"
    return $ EExtract e r

  freplace = do
    e <- expr
    symbol ","
    r <- reg []
    symbol ","
    n <- str
    symbol ")"
    return $ EReplace e r n

  fdefined n = do
    f' <- M.lookup n . cfgFuncs <$> get
    (args, e) <- maybe (fail $ "Unknown function " ++ n) return f'
    a <- sepBy expr (symbol ",")
    symbol ")"
    when (length a /= length args) $ fail
      $ "Function " ++ n ++ " expects " ++ show (length args) ++ " arguments, but " ++ show (length a) ++ " were given"
    -- XXX: Throwing away the original location here is ugly.
    let Loc _ r = fillargs e $ zip args a
    return r

  fillargs e a =
    let f (Loc _ (EField x)) = maybe undefined id $ lookup x a
        f x = x
    in emap f e


-- List of statements, final semicolon is optional
stmts :: Monad m => Parser m a -> Parser m [Stmt]
stmts end = concat <$> sepEndBy1 stmt (symbol ";") <* end
  where
  stmt = stmtUse <|> (:[]) <$> loc (stmtFilter <|> stmtShow <|> stmtGroup <|> stmtRegex <|> stmtSet <|> stmtSort)

  stmtUse = do
    symbol "use"
    n <- identifier
    proc <- M.lookup n . cfgProcs <$> get
    case proc of
      Nothing -> fail $ "No such proc: " ++ n
      Just s -> return s

  stmtFilter = symbol "filter" >> SFilter <$> expr
  stmtShow   = symbol "show"   >> SShow   <$> sepBy1 expr (symbol ",")

  stmtGroup = do
    symbol "group"
    n <- sepBy1 identifier (symbol ",")
    return (SGroup n)

  stmtRegex = do
    symbol "regex"
    f <- identifier
    r <- reg [no_auto_capture]
    return (SRegex f r (namedPatterns r))

  stmtSet = do
    symbol "set"
    f <- identifier
    e <- expr
    return (SSet f e)

  stmtSort = do
    symbol "sort"
    f <- True  <$ symbol "string"
     <|> False <$ symbol "number"
    e <- expr
    o <- True  <$ symbol "asc"
     <|> False <$ symbol "desc"
    symbol "limit"
    n <- fromIntegral <$> L.decimal
    let st = case (f,o) of
               (True,  True ) -> SortAscBS
               (False, True ) -> SortAscNum
               (True,  False) -> SortDescBS
               (False, False) -> SortDescNum
    return (SSort n e st)


runPure :: Parser Identity a -> Config -> String -> String -> Either String a
runPure p c fn code =
  case runIdentity $ runParserT (evalStateT p c) fn code of
    Left e  -> Left $ parseErrorPretty e
    Right x -> Right x


runIO :: Parser IO a -> Config -> String -> String -> IO a
runIO p c fn code = runParserT (evalStateT p c) fn code >>= ret
  where
  ret (Left e)  = hPutStr stderr (parseErrorPretty e) >> exitFailure
  ret (Right x) = return x


-- Parse a list of statements given on the command line
parseStmts :: Config -> String -> String -> Either String [Stmt]
parseStmts = runPure (ws >> stmts eof)

parseStmtsIO :: Config -> String -> String -> IO [Stmt]
parseStmtsIO = runIO (ws >> stmts eof)


-- Parse a config file
parseFileIO :: Config -> String -> IO Config
parseFileIO cfg' fn' = do
  fn <- canonicalizePath fn'
  if elem fn (cfgFiles cfg')
    then return cfg'
    else do
      let cfg = cfg' { cfgFiles = fn : cfgFiles cfg' }
      readFile fn >>= runIO parser cfg fn

  where
  parser = between ws eof body >> get
  body = many $ proc <|> func <|> include

  proc = do
    symbol "proc"
    n <- identifier

    -- Fail as early as possible, so that we have proper error context. Also
    -- has the fancy side effect that duplicate procs are detected before any
    -- syntax errors further down the road.
    st <- cfgProcs <$> get
    when (M.member n st) $ fail $ "Duplicate definition of proc " ++ n

    symbol "{"
    c <- stmts (symbol "}")

    modify' $ \conf -> conf {cfgProcs = M.insert n c st}

  func = do
    symbol "func"
    n <- identifier

    st <- cfgFuncs <$> get
    when (M.member n st) $ fail $ "Duplicate definition of func " ++ n

    symbol "("
    a <- sepBy identifier (symbol ",")
    symbol ")"
    e <- expr
    symbol ";"

    -- Make sure that the expression can only refer to variables declared in the function definition
    _ <- emapM (\fe -> do
      case fe of
        Loc _ (EField fn) -> when (notElem fn a) $ fail $ "Unknown variable " ++ fn
        _ -> return ()
      return fe) e

    modify' $ \conf -> conf {cfgFuncs = M.insert n (a,e) st}

  include = do
    symbol "include"
    n <- quotedlit '"'
    symbol ";"
    conf <- get
    conf' <- liftIO $ parseFileIO conf $ takeDirectory fn' </> n
    put conf'