Add replace() and extract(); study regexes; Add own PCRE wrapper

author: Yorhel <git@yorhel.nl> 2017-10-15 11:13:45 +0200
committer: Yorhel <git@yorhel.nl> 2017-10-15 11:13:45 +0200
commit: 947c61f3665e8058df785c3923332b7c5f7f35e6 (patch)
tree: 4892f614fc802b6a5fc77bb84d84c656b2ca7ff1
parent: 63b8fae12372a7183cf9da89ad5ebb04241c974d (diff)
7 files changed, 225 insertions, 13 deletions
diff --git a/Logstat/Eval.hs b/Logstat/Eval.hs
index 603a5a6..6a92c60 100644
--- a/Logstat/Eval.hs
+++ b/Logstat/Eval.hs
@@ -5,14 +5,14 @@ import Control.Monad.Except
 import Control.Monad.State.Strict
 import Data.Fixed (mod')
 import Data.Foldable (foldl',toList)
-import Data.Maybe (catMaybes,isJust)
-import Text.Regex.PCRE.Light
+import Data.Maybe (catMaybes)
 import qualified Data.ByteString as B
 import qualified Data.Heap       as Heap
 import qualified Data.Map.Strict as Map
 
 import Logstat.Types
 import Logstat.Value
+import Logstat.Regex
 
 
 except :: MonadError e m => Either e a -> m a
@@ -43,7 +43,16 @@ evalExpr st expr = case expr of
 
   EMatch r e -> do
     v <- evalExpr st e
-    return $ bool $ isJust $ match r (asBS v) []
+    return $ bool $ reMatch r (asBS v)
+
+  EExtract e r -> do
+    v <- asBS <$> evalExpr st e
+    ma <- maybe (throwError (NoExtract v)) return $ match r v
+    return $ bs $ ma !! 1
+
+  EReplace e r n ->
+    -- TODO: Support subpattern substitution
+    bs . gsub r (const n) . asBS <$> evalExpr st e
 
   EOp op a' b' ->
     case op of
@@ -113,7 +122,7 @@ step ev = get >>= \stmt -> case stmt of
 
   SRegex f r p -> do
     val <- asBS <$> getField ev f
-    ma <- maybe (throwError (NoMatch f val)) return $ match r val []
+    ma <- maybe (throwError (NoMatch f val)) return $ match r val
     return $ foldl' ins ev $ zip ma p
     where
       ins s (_, Nothing) = s
diff --git a/Logstat/Parse.hs b/Logstat/Parse.hs
index 29415da..7e59258 100644
--- a/Logstat/Parse.hs
+++ b/Logstat/Parse.hs
@@ -18,7 +18,6 @@ import System.FilePath (takeDirectory,(</>))
 import System.IO (hPutStr,stderr)
 import Text.Megaparsec
 import Text.Megaparsec.Expr
-import Text.Regex.PCRE.Light
 import qualified Data.ByteString as B
 import qualified Data.Map.Strict as M
 import qualified Data.Text as T
@@ -26,6 +25,7 @@ import qualified Data.Text.Encoding as T
 import qualified Text.Megaparsec.Lexer as L
 
 import Logstat.Types
+import Logstat.Regex
 import Logstat.Value
 
 
@@ -93,7 +93,14 @@ reg defopt = do
     <|> caseless <$ char 'i'
   case compileM (T.encodeUtf8 $ T.pack s) (defopt ++ opts) of
     Left e -> fail $ "Error in regular expression: " ++ e
-    Right r -> return r
+    -- Studying the regex at this point is somewhat premature - we may not
+    -- actually be using this regex in our program. However, we also should not
+    -- defer the study to after the main program has been assembled, because at
+    -- that point the func/proc expansion may have duplicated the Regex object.
+    -- Studying should ideally happen after assembling the main program but
+    -- before func/proc expansion. These actions are currently performed in the
+    -- wrong order.
+    Right r -> return (study r)
 
 expr :: Monad m => Parser m Expr
 expr = label "expression" $ makeExprParser term
@@ -137,7 +144,29 @@ expr = label "expression" $ makeExprParser term
        <|> fromIntegral <$> try (char '0' >> char' 'o' >> L.octal)
        <|> fromIntegral <$> L.decimal
 
-  func n = do
+  func n
+    | n == "extract" = fextract
+    | n == "replace" = freplace
+    | otherwise      = fdefined n
+
+  fextract = do
+    e <- expr
+    symbol ","
+    r <- reg []
+    when (captureCount r < 1) $ fail "Regular expression does not have any subpatterns to extract"
+    symbol ")"
+    return $ EExtract e r
+
+  freplace = do
+    e <- expr
+    symbol ","
+    r <- reg []
+    symbol ","
+    n <- str
+    symbol ")"
+    return $ EReplace e r n
+
+  fdefined n = do
     f' <- M.lookup n . cfgFuncs <$> get
     (args, e) <- maybe (fail $ "Unknown function " ++ n) return f'
     a <- sepBy expr (symbol ",")
diff --git a/Logstat/Regex.hs b/Logstat/Regex.hs
new file mode 100644
index 0000000..b537342
--- /dev/null
+++ b/Logstat/Regex.hs
@@ -0,0 +1,167 @@
+-- This is (yet another) wrapper around PCRE. The pcre-light wrapper is missing
+-- some functionality (named captures, study, replacement support) and
+-- pcre-heavy also lacks two of those. I'd like this to be integrated with
+-- pcre-light, but its maintainer is unresponsive. :(
+
+module Logstat.Regex
+  ( Regex(..)
+
+  , PCREOption
+  , caseless
+  , extended
+  , no_auto_capture
+
+  , compileM
+  , study
+  , reMatch
+  , match
+  , gsub
+  , captureCount
+  , captureNames
+  ) where
+
+import Data.Maybe (isJust)
+import Foreign
+import Foreign.C.Types
+import Foreign.C.String
+import System.IO.Unsafe (unsafePerformIO)
+import Text.Regex.PCRE.Light.Base hiding (Regex(..))
+import qualified Data.ByteString          as B
+import qualified Data.ByteString.Internal as B
+
+type PCREExtra = () -- Not exported by pcre-light.
+
+data Regex = Regex !(ForeignPtr PCRE) !(ForeignPtr PCREExtra) !B.ByteString
+
+foreign import ccall unsafe "pcre.h pcre_study"
+  c_pcre_study :: Ptr PCRE -> CInt -> Ptr CString -> IO (Ptr PCREExtra)
+
+foreign import ccall unsafe "pcre.h &pcre_free_study"
+  c_pcre_free_study :: FunPtr (Ptr PCREExtra -> IO ())
+
+
+instance Eq  Regex where (Regex _ _ a) == (Regex _ _ b) = a == b
+instance Ord Regex where (Regex _ _ a) <= (Regex _ _ b) = a <= b
+instance Show Regex where show (Regex _ _ a) = "Regex " ++ show a
+
+
+-- Based on pcre-light's compileM
+compileM :: B.ByteString -> [PCREOption] -> Either String Regex
+compileM str os = unsafePerformIO $
+  B.useAsCString str $ \pattern ->
+    alloca $ \errptr       ->
+    alloca $ \erroffset    -> do
+      re <- c_pcre_compile pattern (combineOptions os) errptr erroffset nullPtr
+      if re == nullPtr
+        then do
+          err <- peekCString =<< peek errptr
+          return (Left err)
+        else do
+          st' <- newForeignPtr finalizerFree nullPtr
+          re' <- newForeignPtr finalizerFree re
+          return (Right (Regex re' st' str))
+
+
+study :: Regex -> Regex
+study regex@(Regex re _ str) = unsafePerformIO $
+  withRegex regex $ \re' st ->
+    if st /= nullPtr
+      then return regex
+      else do
+        alloca $ \errptr -> do
+          -- We can safely ignore errors here, a nullPtr is also a valid 'PCREExtra'.
+          -- XXX: The flag should be replaced with a proper PCRE_STUDY_JIT_COMPILE when pcre-light exports it.
+          st' <- c_pcre_study re' 1 errptr
+          st'' <- newForeignPtr c_pcre_free_study st'
+          return (Regex re st'' str)
+
+
+withRegex :: Regex -> (Ptr PCRE -> Ptr PCREExtra -> IO a) -> IO a
+withRegex (Regex re st _) f = withForeignPtr re $ \re' -> withForeignPtr st $ \st' -> f re' st'
+
+
+-- Based on pcre-heavy's rawMatch
+rawMatch :: Regex -> B.ByteString -> Int -> [PCREExecOption] -> Maybe [(Int, Int)]
+rawMatch regex s offset opts = unsafePerformIO $ do
+  withRegex regex $ \re st -> do
+    nCapt <- fullInfoInt re st info_capturecount
+    let ovecSize = (nCapt + 1) * 3
+        ovecBytes = ovecSize * size_of_cint
+    allocaBytes ovecBytes $ \ovec -> do
+      let (strFp, off, len) = B.toForeignPtr s
+      withForeignPtr strFp $ \strPtr -> do
+        results <- c_pcre_exec re st (strPtr `plusPtr` off) (fromIntegral len) (fromIntegral offset)
+                               (combineExecOptions opts) ovec (fromIntegral ovecSize)
+        if results < 0 then return Nothing
+        else
+          let loop n o acc =
+                if n == results then return $ Just $ reverse acc
+                else do
+                  i <- peekElemOff ovec $! o
+                  j <- peekElemOff ovec (o + 1)
+                  loop (n + 1) (o + 2) ((fromIntegral i, fromIntegral j) : acc)
+          in loop 0 0 []
+
+
+reMatch :: Regex -> B.ByteString -> Bool
+reMatch r s = isJust $ rawMatch r s 0 []
+
+
+substr :: B.ByteString -> (Int,Int) -> B.ByteString
+substr s (f, t) = B.take (t - f) . B.drop f $ s
+
+
+match :: Regex -> B.ByteString -> Maybe [B.ByteString]
+match r s = fmap (map (substr s)) $ rawMatch r s 0 []
+
+
+-- Rather simplified implementation of regex-heavy's gsub
+gsub :: Regex -> ([B.ByteString] -> B.ByteString) -> B.ByteString -> B.ByteString
+gsub r f = B.concat . rep
+  where
+  rep s = case rawMatch r s 0 [] of
+    Just m@((begin,end):_) -> substr s (0, begin) : f (map (substr s) m) : rep (substr s (end, B.length s))
+    _ -> [s]
+
+
+-- Wrapper around c_pcre_fullinfo for integer values
+fullInfoInt :: Num a => Ptr PCRE -> Ptr PCREExtra -> PCREInfo -> IO a
+fullInfoInt re st what =
+  alloca $ \n_ptr -> do
+    _ <- c_pcre_fullinfo re st what n_ptr
+    return . fromIntegral =<< peek (n_ptr :: Ptr CInt)
+
+
+captureCount :: Regex -> Int
+captureCount regex = unsafePerformIO $
+  withRegex regex $ \re st -> fullInfoInt re st info_capturecount
+
+
+captureNames :: Regex -> [(B.ByteString, Int)]
+captureNames regex = unsafePerformIO $
+  withRegex regex $ \re st -> do
+    count     <- fullInfoInt re st info_namecount
+    entrysize <- fullInfoInt re st info_nameentrysize
+
+    buf <- alloca $ \n_ptr -> do
+      _ <- c_pcre_fullinfo re st info_nametable n_ptr
+      buf <- peek n_ptr
+      B.packCStringLen (buf, count*entrysize)
+
+    return $ split entrysize buf
+
+  where
+    -- Split the nametable buffer into entries. Each entry has a fixed size in
+    -- bytes. The first two bytes in each entry store the pattern number in
+    -- big-endian format, the bytes following that contain the nul-terminated
+    -- name of the subpattern.
+    split :: Int -> B.ByteString -> [(B.ByteString, Int)]
+    split entrysize buf
+      | B.null buf = []
+      | otherwise =
+        let
+          (entry, rest) = B.splitAt entrysize buf
+          idx = fromIntegral . B.index entry
+          num = idx 0 * 256 + idx 1
+          name = B.takeWhile (/= 0) $ B.drop 2 entry
+        in (name, num) : split entrysize rest
diff --git a/Logstat/Types.hs b/Logstat/Types.hs
index 22f9656..11d2588 100644
--- a/Logstat/Types.hs
+++ b/Logstat/Types.hs
@@ -1,10 +1,9 @@
 module Logstat.Types where
 
-import Text.Regex.PCRE.Light (Regex)
 import Data.Map.Strict (Map)
 import Data.Heap       (Heap)
 import Data.ByteString (ByteString)
-
+import Logstat.Regex   (Regex)
 
 -- This type may seem inefficient; Surely we're not generating a string AND
 -- numeric representation for each and every intermediate value? But rest
@@ -23,6 +22,7 @@ instance Ord Val  where (Val a _) <= (Val b _) = a <= b
 data EvalError
   = UnknownField Field -- This error case can actually be prevented with static analysis
   | NoMatch Field ByteString
+  | NoExtract ByteString
   | InvalidNumber ByteString
   | DivByZero
   | Filtered -- Not strictly an error, just a way to signal that this event should be dropped
@@ -30,6 +30,7 @@ data EvalError
 instance Show EvalError where
   show (UnknownField n) = "Unknown variable '" ++ n ++ "'"
   show (NoMatch f v) = "Regex on field " ++ f ++ " failed to match " ++ show v
+  show (NoExtract v) = "Regex in extract() failed to match " ++ show v
   show (InvalidNumber v) = "Invalid number " ++ show v
   show DivByZero = "Division by zero"
   show Filtered = "Filtered"
@@ -81,6 +82,8 @@ data Expr
   | EOp !Op !Expr !Expr
   | EIf !Expr !Expr !Expr
   | EMatch !Regex !Expr
+  | EExtract !Expr !Regex
+  | EReplace !Expr !Regex !ByteString
   | ENeg !Expr
   | ENot !Expr
   deriving(Show)
diff --git a/Tests/Main.hs b/Tests/Main.hs
index fdb0717..9386aeb 100644
--- a/Tests/Main.hs
+++ b/Tests/Main.hs
@@ -57,4 +57,10 @@ main = runTestTT $ test
   , expr "1234 !~ /5/" "1"
   , expr "\"abC\" ~ /^abc$/i" "1"
   , expr "\"abC\" !~ /^abc$/i" ""
+  , expr "extract(123153, /1(.)3/)" "2"
+  , expr "extract(123153, /1(.)3$/)" "5"
+  , expr "replace(1, /1/, \"2\")" "2"
+  , expr "replace(111, /1/, \"2\")" "222"
+  , expr "replace(9181716, /1/, \"2\")" "9282726"
+  , expr "replace(1, /3/, \"2\")" "1"
   ]
diff --git a/logstat.cabal b/logstat.cabal
index c38c42b..6ecd71d 100644
--- a/logstat.cabal
+++ b/logstat.cabal
@@ -27,6 +27,7 @@ executable logstat
   other-modules:
     Logstat.Eval
     Logstat.Parse
+    Logstat.Regex
     Logstat.Types
     Logstat.Value
 
@@ -51,5 +52,6 @@ test-suite tests
   other-modules:
     Logstat.Eval
     Logstat.Parse
+    Logstat.Regex
     Logstat.Types
     Logstat.Value
diff --git a/stack.yaml b/stack.yaml
index 34adc48..8be8203 100644
--- a/stack.yaml
+++ b/stack.yaml
@@ -1,7 +1,3 @@
 resolver: lts-9.2
 packages:
 - '.'
-- location:
-    git: https://github.com/17dec/pcre-light
-    commit: a70d5842783743fb104a608e902a8e82b3b8b584
-  extra-dep: true
author	Yorhel <git@yorhel.nl>	2017-10-15 11:13:45 +0200
committer	Yorhel <git@yorhel.nl>	2017-10-15 11:13:45 +0200
commit	947c61f3665e8058df785c3923332b7c5f7f35e6 (patch)
tree	4892f614fc802b6a5fc77bb84d84c656b2ca7ff1
parent	63b8fae12372a7183cf9da89ad5ebb04241c974d (diff)