diff options
Diffstat (limited to 'Logstat/Regex.hs')
-rw-r--r-- | Logstat/Regex.hs | 167 |
1 files changed, 167 insertions, 0 deletions
diff --git a/Logstat/Regex.hs b/Logstat/Regex.hs new file mode 100644 index 0000000..b537342 --- /dev/null +++ b/Logstat/Regex.hs @@ -0,0 +1,167 @@ +-- This is (yet another) wrapper around PCRE. The pcre-light wrapper is missing +-- some functionality (named captures, study, replacement support) and +-- pcre-heavy also lacks two of those. I'd like this to be integrated with +-- pcre-light, but its maintainer is unresponsive. :( + +module Logstat.Regex + ( Regex(..) + + , PCREOption + , caseless + , extended + , no_auto_capture + + , compileM + , study + , reMatch + , match + , gsub + , captureCount + , captureNames + ) where + +import Data.Maybe (isJust) +import Foreign +import Foreign.C.Types +import Foreign.C.String +import System.IO.Unsafe (unsafePerformIO) +import Text.Regex.PCRE.Light.Base hiding (Regex(..)) +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as B + +type PCREExtra = () -- Not exported by pcre-light. + +data Regex = Regex !(ForeignPtr PCRE) !(ForeignPtr PCREExtra) !B.ByteString + +foreign import ccall unsafe "pcre.h pcre_study" + c_pcre_study :: Ptr PCRE -> CInt -> Ptr CString -> IO (Ptr PCREExtra) + +foreign import ccall unsafe "pcre.h &pcre_free_study" + c_pcre_free_study :: FunPtr (Ptr PCREExtra -> IO ()) + + +instance Eq Regex where (Regex _ _ a) == (Regex _ _ b) = a == b +instance Ord Regex where (Regex _ _ a) <= (Regex _ _ b) = a <= b +instance Show Regex where show (Regex _ _ a) = "Regex " ++ show a + + +-- Based on pcre-light's compileM +compileM :: B.ByteString -> [PCREOption] -> Either String Regex +compileM str os = unsafePerformIO $ + B.useAsCString str $ \pattern -> + alloca $ \errptr -> + alloca $ \erroffset -> do + re <- c_pcre_compile pattern (combineOptions os) errptr erroffset nullPtr + if re == nullPtr + then do + err <- peekCString =<< peek errptr + return (Left err) + else do + st' <- newForeignPtr finalizerFree nullPtr + re' <- newForeignPtr finalizerFree re + return (Right (Regex re' st' str)) + + +study :: Regex -> Regex +study regex@(Regex re _ str) = unsafePerformIO $ + withRegex regex $ \re' st -> + if st /= nullPtr + then return regex + else do + alloca $ \errptr -> do + -- We can safely ignore errors here, a nullPtr is also a valid 'PCREExtra'. + -- XXX: The flag should be replaced with a proper PCRE_STUDY_JIT_COMPILE when pcre-light exports it. + st' <- c_pcre_study re' 1 errptr + st'' <- newForeignPtr c_pcre_free_study st' + return (Regex re st'' str) + + +withRegex :: Regex -> (Ptr PCRE -> Ptr PCREExtra -> IO a) -> IO a +withRegex (Regex re st _) f = withForeignPtr re $ \re' -> withForeignPtr st $ \st' -> f re' st' + + +-- Based on pcre-heavy's rawMatch +rawMatch :: Regex -> B.ByteString -> Int -> [PCREExecOption] -> Maybe [(Int, Int)] +rawMatch regex s offset opts = unsafePerformIO $ do + withRegex regex $ \re st -> do + nCapt <- fullInfoInt re st info_capturecount + let ovecSize = (nCapt + 1) * 3 + ovecBytes = ovecSize * size_of_cint + allocaBytes ovecBytes $ \ovec -> do + let (strFp, off, len) = B.toForeignPtr s + withForeignPtr strFp $ \strPtr -> do + results <- c_pcre_exec re st (strPtr `plusPtr` off) (fromIntegral len) (fromIntegral offset) + (combineExecOptions opts) ovec (fromIntegral ovecSize) + if results < 0 then return Nothing + else + let loop n o acc = + if n == results then return $ Just $ reverse acc + else do + i <- peekElemOff ovec $! o + j <- peekElemOff ovec (o + 1) + loop (n + 1) (o + 2) ((fromIntegral i, fromIntegral j) : acc) + in loop 0 0 [] + + +reMatch :: Regex -> B.ByteString -> Bool +reMatch r s = isJust $ rawMatch r s 0 [] + + +substr :: B.ByteString -> (Int,Int) -> B.ByteString +substr s (f, t) = B.take (t - f) . B.drop f $ s + + +match :: Regex -> B.ByteString -> Maybe [B.ByteString] +match r s = fmap (map (substr s)) $ rawMatch r s 0 [] + + +-- Rather simplified implementation of regex-heavy's gsub +gsub :: Regex -> ([B.ByteString] -> B.ByteString) -> B.ByteString -> B.ByteString +gsub r f = B.concat . rep + where + rep s = case rawMatch r s 0 [] of + Just m@((begin,end):_) -> substr s (0, begin) : f (map (substr s) m) : rep (substr s (end, B.length s)) + _ -> [s] + + +-- Wrapper around c_pcre_fullinfo for integer values +fullInfoInt :: Num a => Ptr PCRE -> Ptr PCREExtra -> PCREInfo -> IO a +fullInfoInt re st what = + alloca $ \n_ptr -> do + _ <- c_pcre_fullinfo re st what n_ptr + return . fromIntegral =<< peek (n_ptr :: Ptr CInt) + + +captureCount :: Regex -> Int +captureCount regex = unsafePerformIO $ + withRegex regex $ \re st -> fullInfoInt re st info_capturecount + + +captureNames :: Regex -> [(B.ByteString, Int)] +captureNames regex = unsafePerformIO $ + withRegex regex $ \re st -> do + count <- fullInfoInt re st info_namecount + entrysize <- fullInfoInt re st info_nameentrysize + + buf <- alloca $ \n_ptr -> do + _ <- c_pcre_fullinfo re st info_nametable n_ptr + buf <- peek n_ptr + B.packCStringLen (buf, count*entrysize) + + return $ split entrysize buf + + where + -- Split the nametable buffer into entries. Each entry has a fixed size in + -- bytes. The first two bytes in each entry store the pattern number in + -- big-endian format, the bytes following that contain the nul-terminated + -- name of the subpattern. + split :: Int -> B.ByteString -> [(B.ByteString, Int)] + split entrysize buf + | B.null buf = [] + | otherwise = + let + (entry, rest) = B.splitAt entrysize buf + idx = fromIntegral . B.index entry + num = idx 0 * 256 + idx 1 + name = B.takeWhile (/= 0) $ B.drop 2 entry + in (name, num) : split entrysize rest |