1 {-# LANGUAGE RankNTypes #-} 2 {-# LANGUAGE MultiParamTypeClasses, FlexibleInstances #-} 3 4 module Text.RegExp.Data where 5 6 import Data.Semiring 7 8 -- | 9 -- Regular expressions are represented as values of type 'RegExp' @c@ 10 -- where @c@ is the character type of the underlying alphabet. Values 11 -- of type @RegExp@ @c@ can be matched against lists of type @[c]@. 12 -- 13 newtype RegExp c = RegExp (forall w . Semiring w => RegW w c) 14 15 data RegW w c = RegW { active :: !Bool, 16 empty :: !w, 17 final_ :: !w, 18 reg :: !(Reg w c) } 19 20 final :: Semiring w => RegW w c -> w 21 final r = if active r then final_ r else zero 22 23 data Reg w c = Eps 24 | Sym String (c -> w) 25 | Alt (RegW w c) (RegW w c) 26 | Seq (RegW w c) (RegW w c) 27 | Rep (RegW w c) 28 29 class Semiring w => Weight a b w where 30 symWeight :: (a -> w) -> b -> w 31 32 defaultSymWeight :: (a -> w) -> a -> w 33 defaultSymWeight = id 34 35 instance Weight c c Bool where 36 symWeight = defaultSymWeight 37 38 instance Num a => Weight c c (Numeric a) where 39 symWeight = defaultSymWeight 40 41 weighted :: Weight a b w => RegW w a -> RegW w b 42 weighted (RegW a e f r) = 43 case r of 44 Eps -> RegW a e f Eps 45 Sym s p -> RegW a e f (Sym s (symWeight p)) 46 Alt p q -> RegW a e f (Alt (weighted p) (weighted q)) 47 Seq p q -> RegW a e f (Seq (weighted p) (weighted q)) 48 Rep p -> RegW a e f (Rep (weighted p)) 49 50 -- | 51 -- Matches the empty word. 'eps' has no direct string representation 52 -- but is used to implement other constructs such as optional 53 -- components like @a?@. 54 -- 55 eps :: RegExp c 56 eps = RegExp epsW 57 58 epsW :: Semiring w => RegW w c 59 epsW = RegW False one zero Eps 60 61 -- | Matches the given character. 62 -- 63 char :: Char -> RegExp Char 64 char c = psym (quote c) (c==) 65 66 -- | Matches the given symbol. 67 -- 68 sym :: (Eq c, Show c) => c -> RegExp c 69 sym c = psym (show c) (c==) 70 71 quote :: Char -> String 72 quote c | c `elem` " \\|*+?.[]{}^" = '\\' : [c] 73 | otherwise = [c] 74 75 -- | Matches a symbol that satisfies the given predicate. 76 -- 77 psym :: String -> (c -> Bool) -> RegExp c 78 psym s p = RegExp (symW s (fromBool . p)) 79 80 symW :: Semiring w => String -> (c -> w) -> RegW w c 81 symW s p = RegW False zero zero $ Sym s p 82 83 -- | Matches an arbitrary symbol. 84 -- 85 anySym :: RegExp c 86 anySym = psym "." (const True) 87 88 -- | Does not match anything. 'noMatch' is an identity for 'alt'. 89 -- 90 noMatch :: RegExp c 91 noMatch = psym "[]" (const False) 92 93 -- | 94 -- Matches either of two regular expressions. For example @a+b@ 95 -- matches either the character @a@ or the character @b@. 96 -- 97 alt :: RegExp c -> RegExp c -> RegExp c 98 alt (RegExp p) (RegExp q) = 99 RegExp (RegW False (empty p .+. empty q) zero (Alt p q)) 100 101 altW :: Semiring w => RegW w c -> RegW w c -> RegW w c 102 altW p q = RegW (active p || active q) 103 (empty p .+. empty q) 104 (final p .+. final q) 105 (Alt p q) 106 107 -- | 108 -- Matches the sequence of two regular expressions. For example the 109 -- regular expressions @ab@ matches the word @ab@. 110 -- 111 seq_ :: RegExp c -> RegExp c -> RegExp c 112 seq_ (RegExp p) (RegExp q) = 113 RegExp (RegW False (empty p .*. empty q) zero (Seq p q)) 114 115 seqW :: Semiring w => RegW w c -> RegW w c -> RegW w c 116 seqW p q = RegW (active p || active q) 117 (empty p .*. empty q) 118 (final p .*. empty q .+. final q) 119 (Seq p q) 120 121 -- | Matches zero or more occurrences of the given regular 122 -- expression. For example @a*@ matches the character @a@ zero or 123 -- more times. 124 -- 125 rep :: RegExp c -> RegExp c 126 rep (RegExp r) = RegExp (RegW False one zero (Rep r)) 127 128 repW :: Semiring w => RegW w c -> RegW w c 129 repW r = RegW (active r) one (final r) (Rep r) 130 131 -- | Matches one or more occurrences of the given regular 132 -- expression. For example @a+@ matches the character @a@ one or 133 -- more times. 134 -- 135 rep1 :: RegExp c -> RegExp c 136 rep1 r = r `seq_` rep r 137 138 -- | 139 -- Matches the given regular expression or the empty word. Optional 140 -- expressions are usually written @a?@ but could also be written 141 -- @(|a)@, that is, as alternative between 'eps' and @a@. 142 -- 143 opt :: RegExp c -> RegExp c 144 opt r = eps `alt` r 145 146 -- | 147 -- Matches a regular expression a given number of times. For example, 148 -- the regular expression @a{4,7}@ matches the character @a@ four to 149 -- seven times. If the minimal and maximal occurences are identical, 150 -- one can be left out, that is, @a{2}@ matches two occurrences of the 151 -- character @a@. 152 -- 153 -- Numerical bounds are implemented via translation into ordinary 154 -- regular expressions. For example, @a{4,7}@ is translated into 155 -- @aaaa(a(a(a)?)?)?@. 156 -- 157 brep :: (Int,Int) -> RegExp c -> RegExp c 158 brep (n,m) r 159 | n < 0 || m < 0 || n > m = error msg 160 | n == 0 && m == 0 = eps 161 | n == m = foldr1 seq_ (replicate n r) 162 | otherwise = foldr seq_ rest (replicate n r) 163 where 164 rest = foldr nestopt (opt r) (replicate (m-n-1) r) 165 nestopt p q = opt (seq_ p q) 166 msg = "Text.RegExp.brep: invalid repetition bounds: " ++ show (n,m) 167 168 regW :: Semiring w => RegExp c -> RegW w c 169 regW (RegExp r) = r 170 171 instance Show (RegExp Char) where 172 showsPrec n r = showsPrec n (regW r :: RegW Bool Char) 173 174 instance Show (RegW Bool Char) where 175 showsPrec n r = showsPrec n (reg r) 176 177 instance Show (Reg Bool Char) where 178 showsPrec _ Eps = showString "()" 179 showsPrec _ (Sym s _) = showString s 180 showsPrec n (Alt p q) = showParen (n > 0) 181 $ showsPrec 1 p 182 . showString "|" 183 . shows q 184 showsPrec n (Seq p q) = showParen (n > 1) 185 $ showsPrec 2 p 186 . showsPrec 1 q 187 showsPrec _ (Rep r) = showsPrec 2 r . showString "*" 188 189 instance Eq (RegExp Char) where 190 p == q = regW p == (regW q :: RegW Bool Char) 191 192 instance Eq (RegW Bool Char) where 193 p == q = reg p == reg q 194 195 instance Eq (Reg Bool Char) where 196 Eps == Eps = True 197 Sym s _ == Sym t _ = s==t 198 Alt a b == Alt c d = a==c && b==d 199 Seq a b == Seq c d = a==c && b==d 200 Rep a == Rep b = a==b 201 _ == _ = False