1 files changed, 167 insertions, 0 deletions
diff --git a/Logstat/Regex.hs b/Logstat/Regex.hs
new file mode 100644
index 0000000..b537342
--- /dev/null
+++ b/Logstat/Regex.hs
@@ -0,0 +1,167 @@
+-- This is (yet another) wrapper around PCRE. The pcre-light wrapper is missing
+-- some functionality (named captures, study, replacement support) and
+-- pcre-heavy also lacks two of those. I'd like this to be integrated with
+-- pcre-light, but its maintainer is unresponsive. :(
+
+module Logstat.Regex
+  ( Regex(..)
+
+  , PCREOption
+  , caseless
+  , extended
+  , no_auto_capture
+
+  , compileM
+  , study
+  , reMatch
+  , match
+  , gsub
+  , captureCount
+  , captureNames
+  ) where
+
+import Data.Maybe (isJust)
+import Foreign
+import Foreign.C.Types
+import Foreign.C.String
+import System.IO.Unsafe (unsafePerformIO)
+import Text.Regex.PCRE.Light.Base hiding (Regex(..))
+import qualified Data.ByteString          as B
+import qualified Data.ByteString.Internal as B
+
+type PCREExtra = () -- Not exported by pcre-light.
+
+data Regex = Regex !(ForeignPtr PCRE) !(ForeignPtr PCREExtra) !B.ByteString
+
+foreign import ccall unsafe "pcre.h pcre_study"
+  c_pcre_study :: Ptr PCRE -> CInt -> Ptr CString -> IO (Ptr PCREExtra)
+
+foreign import ccall unsafe "pcre.h &pcre_free_study"
+  c_pcre_free_study :: FunPtr (Ptr PCREExtra -> IO ())
+
+
+instance Eq  Regex where (Regex _ _ a) == (Regex _ _ b) = a == b
+instance Ord Regex where (Regex _ _ a) <= (Regex _ _ b) = a <= b
+instance Show Regex where show (Regex _ _ a) = "Regex " ++ show a
+
+
+-- Based on pcre-light's compileM
+compileM :: B.ByteString -> [PCREOption] -> Either String Regex
+compileM str os = unsafePerformIO $
+  B.useAsCString str $ \pattern ->
+    alloca $ \errptr       ->
+    alloca $ \erroffset    -> do
+      re <- c_pcre_compile pattern (combineOptions os) errptr erroffset nullPtr
+      if re == nullPtr
+        then do
+          err <- peekCString =<< peek errptr
+          return (Left err)
+        else do
+          st' <- newForeignPtr finalizerFree nullPtr
+          re' <- newForeignPtr finalizerFree re
+          return (Right (Regex re' st' str))
+
+
+study :: Regex -> Regex
+study regex@(Regex re _ str) = unsafePerformIO $
+  withRegex regex $ \re' st ->
+    if st /= nullPtr
+      then return regex
+      else do
+        alloca $ \errptr -> do
+          -- We can safely ignore errors here, a nullPtr is also a valid 'PCREExtra'.
+          -- XXX: The flag should be replaced with a proper PCRE_STUDY_JIT_COMPILE when pcre-light exports it.
+          st' <- c_pcre_study re' 1 errptr
+          st'' <- newForeignPtr c_pcre_free_study st'
+          return (Regex re st'' str)
+
+
+withRegex :: Regex -> (Ptr PCRE -> Ptr PCREExtra -> IO a) -> IO a
+withRegex (Regex re st _) f = withForeignPtr re $ \re' -> withForeignPtr st $ \st' -> f re' st'
+
+
+-- Based on pcre-heavy's rawMatch
+rawMatch :: Regex -> B.ByteString -> Int -> [PCREExecOption] -> Maybe [(Int, Int)]
+rawMatch regex s offset opts = unsafePerformIO $ do
+  withRegex regex $ \re st -> do
+    nCapt <- fullInfoInt re st info_capturecount
+    let ovecSize = (nCapt + 1) * 3
+        ovecBytes = ovecSize * size_of_cint
+    allocaBytes ovecBytes $ \ovec -> do
+      let (strFp, off, len) = B.toForeignPtr s
+      withForeignPtr strFp $ \strPtr -> do
+        results <- c_pcre_exec re st (strPtr `plusPtr` off) (fromIntegral len) (fromIntegral offset)
+                               (combineExecOptions opts) ovec (fromIntegral ovecSize)
+        if results < 0 then return Nothing
+        else
+          let loop n o acc =
+                if n == results then return $ Just $ reverse acc
+                else do
+                  i <- peekElemOff ovec $! o
+                  j <- peekElemOff ovec (o + 1)
+                  loop (n + 1) (o + 2) ((fromIntegral i, fromIntegral j) : acc)
+          in loop 0 0 []
+
+
+reMatch :: Regex -> B.ByteString -> Bool
+reMatch r s = isJust $ rawMatch r s 0 []
+
+
+substr :: B.ByteString -> (Int,Int) -> B.ByteString
+substr s (f, t) = B.take (t - f) . B.drop f $ s
+
+
+match :: Regex -> B.ByteString -> Maybe [B.ByteString]
+match r s = fmap (map (substr s)) $ rawMatch r s 0 []
+
+
+-- Rather simplified implementation of regex-heavy's gsub
+gsub :: Regex -> ([B.ByteString] -> B.ByteString) -> B.ByteString -> B.ByteString
+gsub r f = B.concat . rep
+  where
+  rep s = case rawMatch r s 0 [] of
+    Just m@((begin,end):_) -> substr s (0, begin) : f (map (substr s) m) : rep (substr s (end, B.length s))
+    _ -> [s]
+
+
+-- Wrapper around c_pcre_fullinfo for integer values
+fullInfoInt :: Num a => Ptr PCRE -> Ptr PCREExtra -> PCREInfo -> IO a
+fullInfoInt re st what =
+  alloca $ \n_ptr -> do
+    _ <- c_pcre_fullinfo re st what n_ptr
+    return . fromIntegral =<< peek (n_ptr :: Ptr CInt)
+
+
+captureCount :: Regex -> Int
+captureCount regex = unsafePerformIO $
+  withRegex regex $ \re st -> fullInfoInt re st info_capturecount
+
+
+captureNames :: Regex -> [(B.ByteString, Int)]
+captureNames regex = unsafePerformIO $
+  withRegex regex $ \re st -> do
+    count     <- fullInfoInt re st info_namecount
+    entrysize <- fullInfoInt re st info_nameentrysize
+
+    buf <- alloca $ \n_ptr -> do
+      _ <- c_pcre_fullinfo re st info_nametable n_ptr
+      buf <- peek n_ptr
+      B.packCStringLen (buf, count*entrysize)
+
+    return $ split entrysize buf
+
+  where
+    -- Split the nametable buffer into entries. Each entry has a fixed size in
+    -- bytes. The first two bytes in each entry store the pattern number in
+    -- big-endian format, the bytes following that contain the nul-terminated
+    -- name of the subpattern.
+    split :: Int -> B.ByteString -> [(B.ByteString, Int)]
+    split entrysize buf
+      | B.null buf = []
+      | otherwise =
+        let
+          (entry, rest) = B.splitAt entrysize buf
+          idx = fromIntegral . B.index entry
+          num = idx 0 * 256 + idx 1
+          name = B.takeWhile (/= 0) $ B.drop 2 entry
+        in (name, num) : split entrysize rest