summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYorhel <git@yorhel.nl>2017-10-15 11:13:45 +0200
committerYorhel <git@yorhel.nl>2017-10-15 11:13:45 +0200
commit947c61f3665e8058df785c3923332b7c5f7f35e6 (patch)
tree4892f614fc802b6a5fc77bb84d84c656b2ca7ff1
parent63b8fae12372a7183cf9da89ad5ebb04241c974d (diff)
Add replace() and extract(); study regexes; Add own PCRE wrapper
-rw-r--r--Logstat/Eval.hs17
-rw-r--r--Logstat/Parse.hs35
-rw-r--r--Logstat/Regex.hs167
-rw-r--r--Logstat/Types.hs7
-rw-r--r--Tests/Main.hs6
-rw-r--r--logstat.cabal2
-rw-r--r--stack.yaml4
7 files changed, 225 insertions, 13 deletions
diff --git a/Logstat/Eval.hs b/Logstat/Eval.hs
index 603a5a6..6a92c60 100644
--- a/Logstat/Eval.hs
+++ b/Logstat/Eval.hs
@@ -5,14 +5,14 @@ import Control.Monad.Except
import Control.Monad.State.Strict
import Data.Fixed (mod')
import Data.Foldable (foldl',toList)
-import Data.Maybe (catMaybes,isJust)
-import Text.Regex.PCRE.Light
+import Data.Maybe (catMaybes)
import qualified Data.ByteString as B
import qualified Data.Heap as Heap
import qualified Data.Map.Strict as Map
import Logstat.Types
import Logstat.Value
+import Logstat.Regex
except :: MonadError e m => Either e a -> m a
@@ -43,7 +43,16 @@ evalExpr st expr = case expr of
EMatch r e -> do
v <- evalExpr st e
- return $ bool $ isJust $ match r (asBS v) []
+ return $ bool $ reMatch r (asBS v)
+
+ EExtract e r -> do
+ v <- asBS <$> evalExpr st e
+ ma <- maybe (throwError (NoExtract v)) return $ match r v
+ return $ bs $ ma !! 1
+
+ EReplace e r n ->
+ -- TODO: Support subpattern substitution
+ bs . gsub r (const n) . asBS <$> evalExpr st e
EOp op a' b' ->
case op of
@@ -113,7 +122,7 @@ step ev = get >>= \stmt -> case stmt of
SRegex f r p -> do
val <- asBS <$> getField ev f
- ma <- maybe (throwError (NoMatch f val)) return $ match r val []
+ ma <- maybe (throwError (NoMatch f val)) return $ match r val
return $ foldl' ins ev $ zip ma p
where
ins s (_, Nothing) = s
diff --git a/Logstat/Parse.hs b/Logstat/Parse.hs
index 29415da..7e59258 100644
--- a/Logstat/Parse.hs
+++ b/Logstat/Parse.hs
@@ -18,7 +18,6 @@ import System.FilePath (takeDirectory,(</>))
import System.IO (hPutStr,stderr)
import Text.Megaparsec
import Text.Megaparsec.Expr
-import Text.Regex.PCRE.Light
import qualified Data.ByteString as B
import qualified Data.Map.Strict as M
import qualified Data.Text as T
@@ -26,6 +25,7 @@ import qualified Data.Text.Encoding as T
import qualified Text.Megaparsec.Lexer as L
import Logstat.Types
+import Logstat.Regex
import Logstat.Value
@@ -93,7 +93,14 @@ reg defopt = do
<|> caseless <$ char 'i'
case compileM (T.encodeUtf8 $ T.pack s) (defopt ++ opts) of
Left e -> fail $ "Error in regular expression: " ++ e
- Right r -> return r
+ -- Studying the regex at this point is somewhat premature - we may not
+ -- actually be using this regex in our program. However, we also should not
+ -- defer the study to after the main program has been assembled, because at
+ -- that point the func/proc expansion may have duplicated the Regex object.
+ -- Studying should ideally happen after assembling the main program but
+ -- before func/proc expansion. These actions are currently performed in the
+ -- wrong order.
+ Right r -> return (study r)
expr :: Monad m => Parser m Expr
expr = label "expression" $ makeExprParser term
@@ -137,7 +144,29 @@ expr = label "expression" $ makeExprParser term
<|> fromIntegral <$> try (char '0' >> char' 'o' >> L.octal)
<|> fromIntegral <$> L.decimal
- func n = do
+ func n
+ | n == "extract" = fextract
+ | n == "replace" = freplace
+ | otherwise = fdefined n
+
+ fextract = do
+ e <- expr
+ symbol ","
+ r <- reg []
+ when (captureCount r < 1) $ fail "Regular expression does not have any subpatterns to extract"
+ symbol ")"
+ return $ EExtract e r
+
+ freplace = do
+ e <- expr
+ symbol ","
+ r <- reg []
+ symbol ","
+ n <- str
+ symbol ")"
+ return $ EReplace e r n
+
+ fdefined n = do
f' <- M.lookup n . cfgFuncs <$> get
(args, e) <- maybe (fail $ "Unknown function " ++ n) return f'
a <- sepBy expr (symbol ",")
diff --git a/Logstat/Regex.hs b/Logstat/Regex.hs
new file mode 100644
index 0000000..b537342
--- /dev/null
+++ b/Logstat/Regex.hs
@@ -0,0 +1,167 @@
+-- This is (yet another) wrapper around PCRE. The pcre-light wrapper is missing
+-- some functionality (named captures, study, replacement support) and
+-- pcre-heavy also lacks two of those. I'd like this to be integrated with
+-- pcre-light, but its maintainer is unresponsive. :(
+
+module Logstat.Regex
+ ( Regex(..)
+
+ , PCREOption
+ , caseless
+ , extended
+ , no_auto_capture
+
+ , compileM
+ , study
+ , reMatch
+ , match
+ , gsub
+ , captureCount
+ , captureNames
+ ) where
+
+import Data.Maybe (isJust)
+import Foreign
+import Foreign.C.Types
+import Foreign.C.String
+import System.IO.Unsafe (unsafePerformIO)
+import Text.Regex.PCRE.Light.Base hiding (Regex(..))
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Internal as B
+
+type PCREExtra = () -- Not exported by pcre-light.
+
+data Regex = Regex !(ForeignPtr PCRE) !(ForeignPtr PCREExtra) !B.ByteString
+
+foreign import ccall unsafe "pcre.h pcre_study"
+ c_pcre_study :: Ptr PCRE -> CInt -> Ptr CString -> IO (Ptr PCREExtra)
+
+foreign import ccall unsafe "pcre.h &pcre_free_study"
+ c_pcre_free_study :: FunPtr (Ptr PCREExtra -> IO ())
+
+
+instance Eq Regex where (Regex _ _ a) == (Regex _ _ b) = a == b
+instance Ord Regex where (Regex _ _ a) <= (Regex _ _ b) = a <= b
+instance Show Regex where show (Regex _ _ a) = "Regex " ++ show a
+
+
+-- Based on pcre-light's compileM
+compileM :: B.ByteString -> [PCREOption] -> Either String Regex
+compileM str os = unsafePerformIO $
+ B.useAsCString str $ \pattern ->
+ alloca $ \errptr ->
+ alloca $ \erroffset -> do
+ re <- c_pcre_compile pattern (combineOptions os) errptr erroffset nullPtr
+ if re == nullPtr
+ then do
+ err <- peekCString =<< peek errptr
+ return (Left err)
+ else do
+ st' <- newForeignPtr finalizerFree nullPtr
+ re' <- newForeignPtr finalizerFree re
+ return (Right (Regex re' st' str))
+
+
+study :: Regex -> Regex
+study regex@(Regex re _ str) = unsafePerformIO $
+ withRegex regex $ \re' st ->
+ if st /= nullPtr
+ then return regex
+ else do
+ alloca $ \errptr -> do
+ -- We can safely ignore errors here, a nullPtr is also a valid 'PCREExtra'.
+ -- XXX: The flag should be replaced with a proper PCRE_STUDY_JIT_COMPILE when pcre-light exports it.
+ st' <- c_pcre_study re' 1 errptr
+ st'' <- newForeignPtr c_pcre_free_study st'
+ return (Regex re st'' str)
+
+
+withRegex :: Regex -> (Ptr PCRE -> Ptr PCREExtra -> IO a) -> IO a
+withRegex (Regex re st _) f = withForeignPtr re $ \re' -> withForeignPtr st $ \st' -> f re' st'
+
+
+-- Based on pcre-heavy's rawMatch
+rawMatch :: Regex -> B.ByteString -> Int -> [PCREExecOption] -> Maybe [(Int, Int)]
+rawMatch regex s offset opts = unsafePerformIO $ do
+ withRegex regex $ \re st -> do
+ nCapt <- fullInfoInt re st info_capturecount
+ let ovecSize = (nCapt + 1) * 3
+ ovecBytes = ovecSize * size_of_cint
+ allocaBytes ovecBytes $ \ovec -> do
+ let (strFp, off, len) = B.toForeignPtr s
+ withForeignPtr strFp $ \strPtr -> do
+ results <- c_pcre_exec re st (strPtr `plusPtr` off) (fromIntegral len) (fromIntegral offset)
+ (combineExecOptions opts) ovec (fromIntegral ovecSize)
+ if results < 0 then return Nothing
+ else
+ let loop n o acc =
+ if n == results then return $ Just $ reverse acc
+ else do
+ i <- peekElemOff ovec $! o
+ j <- peekElemOff ovec (o + 1)
+ loop (n + 1) (o + 2) ((fromIntegral i, fromIntegral j) : acc)
+ in loop 0 0 []
+
+
+reMatch :: Regex -> B.ByteString -> Bool
+reMatch r s = isJust $ rawMatch r s 0 []
+
+
+substr :: B.ByteString -> (Int,Int) -> B.ByteString
+substr s (f, t) = B.take (t - f) . B.drop f $ s
+
+
+match :: Regex -> B.ByteString -> Maybe [B.ByteString]
+match r s = fmap (map (substr s)) $ rawMatch r s 0 []
+
+
+-- Rather simplified implementation of regex-heavy's gsub
+gsub :: Regex -> ([B.ByteString] -> B.ByteString) -> B.ByteString -> B.ByteString
+gsub r f = B.concat . rep
+ where
+ rep s = case rawMatch r s 0 [] of
+ Just m@((begin,end):_) -> substr s (0, begin) : f (map (substr s) m) : rep (substr s (end, B.length s))
+ _ -> [s]
+
+
+-- Wrapper around c_pcre_fullinfo for integer values
+fullInfoInt :: Num a => Ptr PCRE -> Ptr PCREExtra -> PCREInfo -> IO a
+fullInfoInt re st what =
+ alloca $ \n_ptr -> do
+ _ <- c_pcre_fullinfo re st what n_ptr
+ return . fromIntegral =<< peek (n_ptr :: Ptr CInt)
+
+
+captureCount :: Regex -> Int
+captureCount regex = unsafePerformIO $
+ withRegex regex $ \re st -> fullInfoInt re st info_capturecount
+
+
+captureNames :: Regex -> [(B.ByteString, Int)]
+captureNames regex = unsafePerformIO $
+ withRegex regex $ \re st -> do
+ count <- fullInfoInt re st info_namecount
+ entrysize <- fullInfoInt re st info_nameentrysize
+
+ buf <- alloca $ \n_ptr -> do
+ _ <- c_pcre_fullinfo re st info_nametable n_ptr
+ buf <- peek n_ptr
+ B.packCStringLen (buf, count*entrysize)
+
+ return $ split entrysize buf
+
+ where
+ -- Split the nametable buffer into entries. Each entry has a fixed size in
+ -- bytes. The first two bytes in each entry store the pattern number in
+ -- big-endian format, the bytes following that contain the nul-terminated
+ -- name of the subpattern.
+ split :: Int -> B.ByteString -> [(B.ByteString, Int)]
+ split entrysize buf
+ | B.null buf = []
+ | otherwise =
+ let
+ (entry, rest) = B.splitAt entrysize buf
+ idx = fromIntegral . B.index entry
+ num = idx 0 * 256 + idx 1
+ name = B.takeWhile (/= 0) $ B.drop 2 entry
+ in (name, num) : split entrysize rest
diff --git a/Logstat/Types.hs b/Logstat/Types.hs
index 22f9656..11d2588 100644
--- a/Logstat/Types.hs
+++ b/Logstat/Types.hs
@@ -1,10 +1,9 @@
module Logstat.Types where
-import Text.Regex.PCRE.Light (Regex)
import Data.Map.Strict (Map)
import Data.Heap (Heap)
import Data.ByteString (ByteString)
-
+import Logstat.Regex (Regex)
-- This type may seem inefficient; Surely we're not generating a string AND
-- numeric representation for each and every intermediate value? But rest
@@ -23,6 +22,7 @@ instance Ord Val where (Val a _) <= (Val b _) = a <= b
data EvalError
= UnknownField Field -- This error case can actually be prevented with static analysis
| NoMatch Field ByteString
+ | NoExtract ByteString
| InvalidNumber ByteString
| DivByZero
| Filtered -- Not strictly an error, just a way to signal that this event should be dropped
@@ -30,6 +30,7 @@ data EvalError
instance Show EvalError where
show (UnknownField n) = "Unknown variable '" ++ n ++ "'"
show (NoMatch f v) = "Regex on field " ++ f ++ " failed to match " ++ show v
+ show (NoExtract v) = "Regex in extract() failed to match " ++ show v
show (InvalidNumber v) = "Invalid number " ++ show v
show DivByZero = "Division by zero"
show Filtered = "Filtered"
@@ -81,6 +82,8 @@ data Expr
| EOp !Op !Expr !Expr
| EIf !Expr !Expr !Expr
| EMatch !Regex !Expr
+ | EExtract !Expr !Regex
+ | EReplace !Expr !Regex !ByteString
| ENeg !Expr
| ENot !Expr
deriving(Show)
diff --git a/Tests/Main.hs b/Tests/Main.hs
index fdb0717..9386aeb 100644
--- a/Tests/Main.hs
+++ b/Tests/Main.hs
@@ -57,4 +57,10 @@ main = runTestTT $ test
, expr "1234 !~ /5/" "1"
, expr "\"abC\" ~ /^abc$/i" "1"
, expr "\"abC\" !~ /^abc$/i" ""
+ , expr "extract(123153, /1(.)3/)" "2"
+ , expr "extract(123153, /1(.)3$/)" "5"
+ , expr "replace(1, /1/, \"2\")" "2"
+ , expr "replace(111, /1/, \"2\")" "222"
+ , expr "replace(9181716, /1/, \"2\")" "9282726"
+ , expr "replace(1, /3/, \"2\")" "1"
]
diff --git a/logstat.cabal b/logstat.cabal
index c38c42b..6ecd71d 100644
--- a/logstat.cabal
+++ b/logstat.cabal
@@ -27,6 +27,7 @@ executable logstat
other-modules:
Logstat.Eval
Logstat.Parse
+ Logstat.Regex
Logstat.Types
Logstat.Value
@@ -51,5 +52,6 @@ test-suite tests
other-modules:
Logstat.Eval
Logstat.Parse
+ Logstat.Regex
Logstat.Types
Logstat.Value
diff --git a/stack.yaml b/stack.yaml
index 34adc48..8be8203 100644
--- a/stack.yaml
+++ b/stack.yaml
@@ -1,7 +1,3 @@
resolver: lts-9.2
packages:
- '.'
-- location:
- git: https://github.com/17dec/pcre-light
- commit: a70d5842783743fb104a608e902a8e82b3b8b584
- extra-dep: true