diff options
author | Yorhel <git@yorhel.nl> | 2017-10-15 11:13:45 +0200 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2017-10-15 11:13:45 +0200 |
commit | 947c61f3665e8058df785c3923332b7c5f7f35e6 (patch) | |
tree | 4892f614fc802b6a5fc77bb84d84c656b2ca7ff1 | |
parent | 63b8fae12372a7183cf9da89ad5ebb04241c974d (diff) |
Add replace() and extract(); study regexes; Add own PCRE wrapper
-rw-r--r-- | Logstat/Eval.hs | 17 | ||||
-rw-r--r-- | Logstat/Parse.hs | 35 | ||||
-rw-r--r-- | Logstat/Regex.hs | 167 | ||||
-rw-r--r-- | Logstat/Types.hs | 7 | ||||
-rw-r--r-- | Tests/Main.hs | 6 | ||||
-rw-r--r-- | logstat.cabal | 2 | ||||
-rw-r--r-- | stack.yaml | 4 |
7 files changed, 225 insertions, 13 deletions
diff --git a/Logstat/Eval.hs b/Logstat/Eval.hs index 603a5a6..6a92c60 100644 --- a/Logstat/Eval.hs +++ b/Logstat/Eval.hs @@ -5,14 +5,14 @@ import Control.Monad.Except import Control.Monad.State.Strict import Data.Fixed (mod') import Data.Foldable (foldl',toList) -import Data.Maybe (catMaybes,isJust) -import Text.Regex.PCRE.Light +import Data.Maybe (catMaybes) import qualified Data.ByteString as B import qualified Data.Heap as Heap import qualified Data.Map.Strict as Map import Logstat.Types import Logstat.Value +import Logstat.Regex except :: MonadError e m => Either e a -> m a @@ -43,7 +43,16 @@ evalExpr st expr = case expr of EMatch r e -> do v <- evalExpr st e - return $ bool $ isJust $ match r (asBS v) [] + return $ bool $ reMatch r (asBS v) + + EExtract e r -> do + v <- asBS <$> evalExpr st e + ma <- maybe (throwError (NoExtract v)) return $ match r v + return $ bs $ ma !! 1 + + EReplace e r n -> + -- TODO: Support subpattern substitution + bs . gsub r (const n) . asBS <$> evalExpr st e EOp op a' b' -> case op of @@ -113,7 +122,7 @@ step ev = get >>= \stmt -> case stmt of SRegex f r p -> do val <- asBS <$> getField ev f - ma <- maybe (throwError (NoMatch f val)) return $ match r val [] + ma <- maybe (throwError (NoMatch f val)) return $ match r val return $ foldl' ins ev $ zip ma p where ins s (_, Nothing) = s diff --git a/Logstat/Parse.hs b/Logstat/Parse.hs index 29415da..7e59258 100644 --- a/Logstat/Parse.hs +++ b/Logstat/Parse.hs @@ -18,7 +18,6 @@ import System.FilePath (takeDirectory,(</>)) import System.IO (hPutStr,stderr) import Text.Megaparsec import Text.Megaparsec.Expr -import Text.Regex.PCRE.Light import qualified Data.ByteString as B import qualified Data.Map.Strict as M import qualified Data.Text as T @@ -26,6 +25,7 @@ import qualified Data.Text.Encoding as T import qualified Text.Megaparsec.Lexer as L import Logstat.Types +import Logstat.Regex import Logstat.Value @@ -93,7 +93,14 @@ reg defopt = do <|> caseless <$ char 'i' case compileM (T.encodeUtf8 $ T.pack s) (defopt ++ opts) of Left e -> fail $ "Error in regular expression: " ++ e - Right r -> return r + -- Studying the regex at this point is somewhat premature - we may not + -- actually be using this regex in our program. However, we also should not + -- defer the study to after the main program has been assembled, because at + -- that point the func/proc expansion may have duplicated the Regex object. + -- Studying should ideally happen after assembling the main program but + -- before func/proc expansion. These actions are currently performed in the + -- wrong order. + Right r -> return (study r) expr :: Monad m => Parser m Expr expr = label "expression" $ makeExprParser term @@ -137,7 +144,29 @@ expr = label "expression" $ makeExprParser term <|> fromIntegral <$> try (char '0' >> char' 'o' >> L.octal) <|> fromIntegral <$> L.decimal - func n = do + func n + | n == "extract" = fextract + | n == "replace" = freplace + | otherwise = fdefined n + + fextract = do + e <- expr + symbol "," + r <- reg [] + when (captureCount r < 1) $ fail "Regular expression does not have any subpatterns to extract" + symbol ")" + return $ EExtract e r + + freplace = do + e <- expr + symbol "," + r <- reg [] + symbol "," + n <- str + symbol ")" + return $ EReplace e r n + + fdefined n = do f' <- M.lookup n . cfgFuncs <$> get (args, e) <- maybe (fail $ "Unknown function " ++ n) return f' a <- sepBy expr (symbol ",") diff --git a/Logstat/Regex.hs b/Logstat/Regex.hs new file mode 100644 index 0000000..b537342 --- /dev/null +++ b/Logstat/Regex.hs @@ -0,0 +1,167 @@ +-- This is (yet another) wrapper around PCRE. The pcre-light wrapper is missing +-- some functionality (named captures, study, replacement support) and +-- pcre-heavy also lacks two of those. I'd like this to be integrated with +-- pcre-light, but its maintainer is unresponsive. :( + +module Logstat.Regex + ( Regex(..) + + , PCREOption + , caseless + , extended + , no_auto_capture + + , compileM + , study + , reMatch + , match + , gsub + , captureCount + , captureNames + ) where + +import Data.Maybe (isJust) +import Foreign +import Foreign.C.Types +import Foreign.C.String +import System.IO.Unsafe (unsafePerformIO) +import Text.Regex.PCRE.Light.Base hiding (Regex(..)) +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as B + +type PCREExtra = () -- Not exported by pcre-light. + +data Regex = Regex !(ForeignPtr PCRE) !(ForeignPtr PCREExtra) !B.ByteString + +foreign import ccall unsafe "pcre.h pcre_study" + c_pcre_study :: Ptr PCRE -> CInt -> Ptr CString -> IO (Ptr PCREExtra) + +foreign import ccall unsafe "pcre.h &pcre_free_study" + c_pcre_free_study :: FunPtr (Ptr PCREExtra -> IO ()) + + +instance Eq Regex where (Regex _ _ a) == (Regex _ _ b) = a == b +instance Ord Regex where (Regex _ _ a) <= (Regex _ _ b) = a <= b +instance Show Regex where show (Regex _ _ a) = "Regex " ++ show a + + +-- Based on pcre-light's compileM +compileM :: B.ByteString -> [PCREOption] -> Either String Regex +compileM str os = unsafePerformIO $ + B.useAsCString str $ \pattern -> + alloca $ \errptr -> + alloca $ \erroffset -> do + re <- c_pcre_compile pattern (combineOptions os) errptr erroffset nullPtr + if re == nullPtr + then do + err <- peekCString =<< peek errptr + return (Left err) + else do + st' <- newForeignPtr finalizerFree nullPtr + re' <- newForeignPtr finalizerFree re + return (Right (Regex re' st' str)) + + +study :: Regex -> Regex +study regex@(Regex re _ str) = unsafePerformIO $ + withRegex regex $ \re' st -> + if st /= nullPtr + then return regex + else do + alloca $ \errptr -> do + -- We can safely ignore errors here, a nullPtr is also a valid 'PCREExtra'. + -- XXX: The flag should be replaced with a proper PCRE_STUDY_JIT_COMPILE when pcre-light exports it. + st' <- c_pcre_study re' 1 errptr + st'' <- newForeignPtr c_pcre_free_study st' + return (Regex re st'' str) + + +withRegex :: Regex -> (Ptr PCRE -> Ptr PCREExtra -> IO a) -> IO a +withRegex (Regex re st _) f = withForeignPtr re $ \re' -> withForeignPtr st $ \st' -> f re' st' + + +-- Based on pcre-heavy's rawMatch +rawMatch :: Regex -> B.ByteString -> Int -> [PCREExecOption] -> Maybe [(Int, Int)] +rawMatch regex s offset opts = unsafePerformIO $ do + withRegex regex $ \re st -> do + nCapt <- fullInfoInt re st info_capturecount + let ovecSize = (nCapt + 1) * 3 + ovecBytes = ovecSize * size_of_cint + allocaBytes ovecBytes $ \ovec -> do + let (strFp, off, len) = B.toForeignPtr s + withForeignPtr strFp $ \strPtr -> do + results <- c_pcre_exec re st (strPtr `plusPtr` off) (fromIntegral len) (fromIntegral offset) + (combineExecOptions opts) ovec (fromIntegral ovecSize) + if results < 0 then return Nothing + else + let loop n o acc = + if n == results then return $ Just $ reverse acc + else do + i <- peekElemOff ovec $! o + j <- peekElemOff ovec (o + 1) + loop (n + 1) (o + 2) ((fromIntegral i, fromIntegral j) : acc) + in loop 0 0 [] + + +reMatch :: Regex -> B.ByteString -> Bool +reMatch r s = isJust $ rawMatch r s 0 [] + + +substr :: B.ByteString -> (Int,Int) -> B.ByteString +substr s (f, t) = B.take (t - f) . B.drop f $ s + + +match :: Regex -> B.ByteString -> Maybe [B.ByteString] +match r s = fmap (map (substr s)) $ rawMatch r s 0 [] + + +-- Rather simplified implementation of regex-heavy's gsub +gsub :: Regex -> ([B.ByteString] -> B.ByteString) -> B.ByteString -> B.ByteString +gsub r f = B.concat . rep + where + rep s = case rawMatch r s 0 [] of + Just m@((begin,end):_) -> substr s (0, begin) : f (map (substr s) m) : rep (substr s (end, B.length s)) + _ -> [s] + + +-- Wrapper around c_pcre_fullinfo for integer values +fullInfoInt :: Num a => Ptr PCRE -> Ptr PCREExtra -> PCREInfo -> IO a +fullInfoInt re st what = + alloca $ \n_ptr -> do + _ <- c_pcre_fullinfo re st what n_ptr + return . fromIntegral =<< peek (n_ptr :: Ptr CInt) + + +captureCount :: Regex -> Int +captureCount regex = unsafePerformIO $ + withRegex regex $ \re st -> fullInfoInt re st info_capturecount + + +captureNames :: Regex -> [(B.ByteString, Int)] +captureNames regex = unsafePerformIO $ + withRegex regex $ \re st -> do + count <- fullInfoInt re st info_namecount + entrysize <- fullInfoInt re st info_nameentrysize + + buf <- alloca $ \n_ptr -> do + _ <- c_pcre_fullinfo re st info_nametable n_ptr + buf <- peek n_ptr + B.packCStringLen (buf, count*entrysize) + + return $ split entrysize buf + + where + -- Split the nametable buffer into entries. Each entry has a fixed size in + -- bytes. The first two bytes in each entry store the pattern number in + -- big-endian format, the bytes following that contain the nul-terminated + -- name of the subpattern. + split :: Int -> B.ByteString -> [(B.ByteString, Int)] + split entrysize buf + | B.null buf = [] + | otherwise = + let + (entry, rest) = B.splitAt entrysize buf + idx = fromIntegral . B.index entry + num = idx 0 * 256 + idx 1 + name = B.takeWhile (/= 0) $ B.drop 2 entry + in (name, num) : split entrysize rest diff --git a/Logstat/Types.hs b/Logstat/Types.hs index 22f9656..11d2588 100644 --- a/Logstat/Types.hs +++ b/Logstat/Types.hs @@ -1,10 +1,9 @@ module Logstat.Types where -import Text.Regex.PCRE.Light (Regex) import Data.Map.Strict (Map) import Data.Heap (Heap) import Data.ByteString (ByteString) - +import Logstat.Regex (Regex) -- This type may seem inefficient; Surely we're not generating a string AND -- numeric representation for each and every intermediate value? But rest @@ -23,6 +22,7 @@ instance Ord Val where (Val a _) <= (Val b _) = a <= b data EvalError = UnknownField Field -- This error case can actually be prevented with static analysis | NoMatch Field ByteString + | NoExtract ByteString | InvalidNumber ByteString | DivByZero | Filtered -- Not strictly an error, just a way to signal that this event should be dropped @@ -30,6 +30,7 @@ data EvalError instance Show EvalError where show (UnknownField n) = "Unknown variable '" ++ n ++ "'" show (NoMatch f v) = "Regex on field " ++ f ++ " failed to match " ++ show v + show (NoExtract v) = "Regex in extract() failed to match " ++ show v show (InvalidNumber v) = "Invalid number " ++ show v show DivByZero = "Division by zero" show Filtered = "Filtered" @@ -81,6 +82,8 @@ data Expr | EOp !Op !Expr !Expr | EIf !Expr !Expr !Expr | EMatch !Regex !Expr + | EExtract !Expr !Regex + | EReplace !Expr !Regex !ByteString | ENeg !Expr | ENot !Expr deriving(Show) diff --git a/Tests/Main.hs b/Tests/Main.hs index fdb0717..9386aeb 100644 --- a/Tests/Main.hs +++ b/Tests/Main.hs @@ -57,4 +57,10 @@ main = runTestTT $ test , expr "1234 !~ /5/" "1" , expr "\"abC\" ~ /^abc$/i" "1" , expr "\"abC\" !~ /^abc$/i" "" + , expr "extract(123153, /1(.)3/)" "2" + , expr "extract(123153, /1(.)3$/)" "5" + , expr "replace(1, /1/, \"2\")" "2" + , expr "replace(111, /1/, \"2\")" "222" + , expr "replace(9181716, /1/, \"2\")" "9282726" + , expr "replace(1, /3/, \"2\")" "1" ] diff --git a/logstat.cabal b/logstat.cabal index c38c42b..6ecd71d 100644 --- a/logstat.cabal +++ b/logstat.cabal @@ -27,6 +27,7 @@ executable logstat other-modules: Logstat.Eval Logstat.Parse + Logstat.Regex Logstat.Types Logstat.Value @@ -51,5 +52,6 @@ test-suite tests other-modules: Logstat.Eval Logstat.Parse + Logstat.Regex Logstat.Types Logstat.Value @@ -1,7 +1,3 @@ resolver: lts-9.2 packages: - '.' -- location: - git: https://github.com/17dec/pcre-light - commit: a70d5842783743fb104a608e902a8e82b3b8b584 - extra-dep: true |