summaryrefslogtreecommitdiff
path: root/Logstat/Regex.hs
diff options
context:
space:
mode:
Diffstat (limited to 'Logstat/Regex.hs')
-rw-r--r--Logstat/Regex.hs167
1 files changed, 167 insertions, 0 deletions
diff --git a/Logstat/Regex.hs b/Logstat/Regex.hs
new file mode 100644
index 0000000..b537342
--- /dev/null
+++ b/Logstat/Regex.hs
@@ -0,0 +1,167 @@
+-- This is (yet another) wrapper around PCRE. The pcre-light wrapper is missing
+-- some functionality (named captures, study, replacement support) and
+-- pcre-heavy also lacks two of those. I'd like this to be integrated with
+-- pcre-light, but its maintainer is unresponsive. :(
+
+module Logstat.Regex
+ ( Regex(..)
+
+ , PCREOption
+ , caseless
+ , extended
+ , no_auto_capture
+
+ , compileM
+ , study
+ , reMatch
+ , match
+ , gsub
+ , captureCount
+ , captureNames
+ ) where
+
+import Data.Maybe (isJust)
+import Foreign
+import Foreign.C.Types
+import Foreign.C.String
+import System.IO.Unsafe (unsafePerformIO)
+import Text.Regex.PCRE.Light.Base hiding (Regex(..))
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Internal as B
+
+type PCREExtra = () -- Not exported by pcre-light.
+
+data Regex = Regex !(ForeignPtr PCRE) !(ForeignPtr PCREExtra) !B.ByteString
+
+foreign import ccall unsafe "pcre.h pcre_study"
+ c_pcre_study :: Ptr PCRE -> CInt -> Ptr CString -> IO (Ptr PCREExtra)
+
+foreign import ccall unsafe "pcre.h &pcre_free_study"
+ c_pcre_free_study :: FunPtr (Ptr PCREExtra -> IO ())
+
+
+instance Eq Regex where (Regex _ _ a) == (Regex _ _ b) = a == b
+instance Ord Regex where (Regex _ _ a) <= (Regex _ _ b) = a <= b
+instance Show Regex where show (Regex _ _ a) = "Regex " ++ show a
+
+
+-- Based on pcre-light's compileM
+compileM :: B.ByteString -> [PCREOption] -> Either String Regex
+compileM str os = unsafePerformIO $
+ B.useAsCString str $ \pattern ->
+ alloca $ \errptr ->
+ alloca $ \erroffset -> do
+ re <- c_pcre_compile pattern (combineOptions os) errptr erroffset nullPtr
+ if re == nullPtr
+ then do
+ err <- peekCString =<< peek errptr
+ return (Left err)
+ else do
+ st' <- newForeignPtr finalizerFree nullPtr
+ re' <- newForeignPtr finalizerFree re
+ return (Right (Regex re' st' str))
+
+
+study :: Regex -> Regex
+study regex@(Regex re _ str) = unsafePerformIO $
+ withRegex regex $ \re' st ->
+ if st /= nullPtr
+ then return regex
+ else do
+ alloca $ \errptr -> do
+ -- We can safely ignore errors here, a nullPtr is also a valid 'PCREExtra'.
+ -- XXX: The flag should be replaced with a proper PCRE_STUDY_JIT_COMPILE when pcre-light exports it.
+ st' <- c_pcre_study re' 1 errptr
+ st'' <- newForeignPtr c_pcre_free_study st'
+ return (Regex re st'' str)
+
+
+withRegex :: Regex -> (Ptr PCRE -> Ptr PCREExtra -> IO a) -> IO a
+withRegex (Regex re st _) f = withForeignPtr re $ \re' -> withForeignPtr st $ \st' -> f re' st'
+
+
+-- Based on pcre-heavy's rawMatch
+rawMatch :: Regex -> B.ByteString -> Int -> [PCREExecOption] -> Maybe [(Int, Int)]
+rawMatch regex s offset opts = unsafePerformIO $ do
+ withRegex regex $ \re st -> do
+ nCapt <- fullInfoInt re st info_capturecount
+ let ovecSize = (nCapt + 1) * 3
+ ovecBytes = ovecSize * size_of_cint
+ allocaBytes ovecBytes $ \ovec -> do
+ let (strFp, off, len) = B.toForeignPtr s
+ withForeignPtr strFp $ \strPtr -> do
+ results <- c_pcre_exec re st (strPtr `plusPtr` off) (fromIntegral len) (fromIntegral offset)
+ (combineExecOptions opts) ovec (fromIntegral ovecSize)
+ if results < 0 then return Nothing
+ else
+ let loop n o acc =
+ if n == results then return $ Just $ reverse acc
+ else do
+ i <- peekElemOff ovec $! o
+ j <- peekElemOff ovec (o + 1)
+ loop (n + 1) (o + 2) ((fromIntegral i, fromIntegral j) : acc)
+ in loop 0 0 []
+
+
+reMatch :: Regex -> B.ByteString -> Bool
+reMatch r s = isJust $ rawMatch r s 0 []
+
+
+substr :: B.ByteString -> (Int,Int) -> B.ByteString
+substr s (f, t) = B.take (t - f) . B.drop f $ s
+
+
+match :: Regex -> B.ByteString -> Maybe [B.ByteString]
+match r s = fmap (map (substr s)) $ rawMatch r s 0 []
+
+
+-- Rather simplified implementation of regex-heavy's gsub
+gsub :: Regex -> ([B.ByteString] -> B.ByteString) -> B.ByteString -> B.ByteString
+gsub r f = B.concat . rep
+ where
+ rep s = case rawMatch r s 0 [] of
+ Just m@((begin,end):_) -> substr s (0, begin) : f (map (substr s) m) : rep (substr s (end, B.length s))
+ _ -> [s]
+
+
+-- Wrapper around c_pcre_fullinfo for integer values
+fullInfoInt :: Num a => Ptr PCRE -> Ptr PCREExtra -> PCREInfo -> IO a
+fullInfoInt re st what =
+ alloca $ \n_ptr -> do
+ _ <- c_pcre_fullinfo re st what n_ptr
+ return . fromIntegral =<< peek (n_ptr :: Ptr CInt)
+
+
+captureCount :: Regex -> Int
+captureCount regex = unsafePerformIO $
+ withRegex regex $ \re st -> fullInfoInt re st info_capturecount
+
+
+captureNames :: Regex -> [(B.ByteString, Int)]
+captureNames regex = unsafePerformIO $
+ withRegex regex $ \re st -> do
+ count <- fullInfoInt re st info_namecount
+ entrysize <- fullInfoInt re st info_nameentrysize
+
+ buf <- alloca $ \n_ptr -> do
+ _ <- c_pcre_fullinfo re st info_nametable n_ptr
+ buf <- peek n_ptr
+ B.packCStringLen (buf, count*entrysize)
+
+ return $ split entrysize buf
+
+ where
+ -- Split the nametable buffer into entries. Each entry has a fixed size in
+ -- bytes. The first two bytes in each entry store the pattern number in
+ -- big-endian format, the bytes following that contain the nul-terminated
+ -- name of the subpattern.
+ split :: Int -> B.ByteString -> [(B.ByteString, Int)]
+ split entrysize buf
+ | B.null buf = []
+ | otherwise =
+ let
+ (entry, rest) = B.splitAt entrysize buf
+ idx = fromIntegral . B.index entry
+ num = idx 0 * 256 + idx 1
+ name = B.takeWhile (/= 0) $ B.drop 2 entry
+ in (name, num) : split entrysize rest