1 {-# LANGUAGE RankNTypes #-}
    2 {-# LANGUAGE MultiParamTypeClasses, FlexibleInstances #-}
    3 
    4 module Text.RegExp.Data where
    5 
    6 import Data.Semiring
    7 
    8 -- |
    9 -- Regular expressions are represented as values of type 'RegExp' @c@
   10 -- where @c@ is the character type of the underlying alphabet. Values
   11 -- of type @RegExp@ @c@ can be matched against lists of type @[c]@.
   12 -- 
   13 newtype RegExp c = RegExp (forall w . Semiring w => RegW w c)
   14 
   15 data RegW w c = RegW { active :: !Bool,
   16                        empty  :: !w, 
   17                        final_ :: !w, 
   18                        reg    :: !(Reg w c) }
   19 
   20 final :: Semiring w => RegW w c -> w
   21 final r = if active r then final_ r else zero
   22 
   23 data Reg w c = Eps
   24              | Sym String (c -> w)
   25              | Alt (RegW w c) (RegW w c)
   26              | Seq (RegW w c) (RegW w c)
   27              | Rep (RegW w c)
   28 
   29 class Semiring w => Weight a b w where
   30   symWeight :: (a -> w) -> b -> w
   31 
   32 defaultSymWeight :: (a -> w) -> a -> w
   33 defaultSymWeight = id
   34 
   35 instance Weight c c Bool where
   36   symWeight = defaultSymWeight
   37 
   38 instance Num a => Weight c c (Numeric a) where
   39   symWeight = defaultSymWeight
   40 
   41 weighted :: Weight a b w => RegW w a -> RegW w b
   42 weighted (RegW a e f r) =
   43   case r of
   44     Eps     -> RegW a e f Eps
   45     Sym s p -> RegW a e f (Sym s (symWeight p))
   46     Alt p q -> RegW a e f (Alt (weighted p) (weighted q))
   47     Seq p q -> RegW a e f (Seq (weighted p) (weighted q))
   48     Rep p   -> RegW a e f (Rep (weighted p))
   49 
   50 -- |
   51 -- Matches the empty word. 'eps' has no direct string representation
   52 -- but is used to implement other constructs such as optional
   53 -- components like @a?@.
   54 -- 
   55 eps :: RegExp c
   56 eps = RegExp epsW
   57 
   58 epsW :: Semiring w => RegW w c
   59 epsW = RegW False one zero Eps
   60 
   61 -- | Matches the given character.
   62 -- 
   63 char :: Char -> RegExp Char
   64 char c = psym (quote c) (c==)
   65 
   66 -- | Matches the given symbol.
   67 -- 
   68 sym :: (Eq c, Show c) => c -> RegExp c
   69 sym c = psym (show c) (c==)
   70 
   71 quote :: Char -> String
   72 quote c | c `elem` " \\|*+?.[]{}^" = '\\' : [c]
   73         | otherwise                = [c]
   74 
   75 -- | Matches a symbol that satisfies the given predicate.
   76 -- 
   77 psym :: String -> (c -> Bool) -> RegExp c
   78 psym s p = RegExp (symW s (fromBool . p))
   79 
   80 symW :: Semiring w => String -> (c -> w) -> RegW w c
   81 symW s p = RegW False zero zero $ Sym s p
   82 
   83 -- | Matches an arbitrary symbol.
   84 -- 
   85 anySym :: RegExp c
   86 anySym = psym "." (const True)
   87 
   88 -- | Does not match anything. 'noMatch' is an identity for 'alt'.
   89 -- 
   90 noMatch :: RegExp c
   91 noMatch = psym "[]" (const False)
   92 
   93 -- |
   94 -- Matches either of two regular expressions. For example @a+b@
   95 -- matches either the character @a@ or the character @b@.
   96 -- 
   97 alt :: RegExp c -> RegExp c -> RegExp c
   98 alt (RegExp p) (RegExp q) =
   99   RegExp (RegW False (empty p .+. empty q) zero (Alt p q))
  100 
  101 altW :: Semiring w => RegW w c -> RegW w c -> RegW w c
  102 altW p q = RegW (active p || active q)
  103                 (empty p .+. empty q)
  104                 (final p .+. final q)
  105                 (Alt p q)
  106 
  107 -- |
  108 -- Matches the sequence of two regular expressions. For example the
  109 -- regular expressions @ab@ matches the word @ab@.
  110 -- 
  111 seq_ :: RegExp c -> RegExp c -> RegExp c
  112 seq_ (RegExp p) (RegExp q) =
  113   RegExp (RegW False (empty p .*. empty q) zero (Seq p q))
  114 
  115 seqW :: Semiring w => RegW w c -> RegW w c -> RegW w c
  116 seqW p q = RegW (active p || active q)
  117                 (empty p .*. empty q)
  118                 (final p .*. empty q .+. final q)
  119                 (Seq p q)
  120 
  121 -- | Matches zero or more occurrences of the given regular
  122 --   expression. For example @a*@ matches the character @a@ zero or
  123 --   more times.
  124 -- 
  125 rep :: RegExp c -> RegExp c
  126 rep (RegExp r) = RegExp (RegW False one zero (Rep r))
  127 
  128 repW :: Semiring w => RegW w c -> RegW w c
  129 repW r = RegW (active r) one (final r) (Rep r)
  130 
  131 -- | Matches one or more occurrences of the given regular
  132 --   expression. For example @a+@ matches the character @a@ one or
  133 --   more times.
  134 -- 
  135 rep1 :: RegExp c -> RegExp c
  136 rep1 r = r `seq_` rep r
  137 
  138 -- |
  139 -- Matches the given regular expression or the empty word. Optional
  140 -- expressions are usually written @a?@ but could also be written
  141 -- @(|a)@, that is, as alternative between 'eps' and @a@.
  142 -- 
  143 opt :: RegExp c -> RegExp c
  144 opt r = eps `alt` r
  145 
  146 -- |
  147 -- Matches a regular expression a given number of times. For example,
  148 -- the regular expression @a{4,7}@ matches the character @a@ four to
  149 -- seven times. If the minimal and maximal occurences are identical,
  150 -- one can be left out, that is, @a{2}@ matches two occurrences of the
  151 -- character @a@.
  152 -- 
  153 -- Numerical bounds are implemented via translation into ordinary
  154 -- regular expressions. For example, @a{4,7}@ is translated into
  155 -- @aaaa(a(a(a)?)?)?@.
  156 -- 
  157 brep :: (Int,Int) -> RegExp c -> RegExp c
  158 brep (n,m) r
  159   | n < 0 || m < 0 || n > m  =  error msg
  160   | n == 0 && m == 0         =  eps
  161   | n == m                   =  foldr1 seq_ (replicate n r)
  162   | otherwise                =  foldr seq_ rest (replicate n r)
  163  where
  164   rest = foldr nestopt (opt r) (replicate (m-n-1) r)
  165   nestopt p q = opt (seq_ p q)
  166   msg = "Text.RegExp.brep: invalid repetition bounds: " ++ show (n,m)
  167 
  168 regW :: Semiring w => RegExp c -> RegW w c
  169 regW (RegExp r) = r
  170 
  171 instance Show (RegExp Char) where
  172   showsPrec n r = showsPrec n (regW r :: RegW Bool Char)
  173 
  174 instance Show (RegW Bool Char) where
  175   showsPrec n r = showsPrec n (reg r)
  176 
  177 instance Show (Reg Bool Char) where
  178   showsPrec _ Eps        =  showString "()"
  179   showsPrec _ (Sym s _)  =  showString s
  180   showsPrec n (Alt p q)  =  showParen (n > 0)
  181                          $  showsPrec 1 p
  182                          .  showString "|"
  183                          .  shows q
  184   showsPrec n (Seq p q)  =  showParen (n > 1)
  185                          $  showsPrec 2 p
  186                          .  showsPrec 1 q
  187   showsPrec _ (Rep r)    =  showsPrec 2 r . showString "*"
  188 
  189 instance Eq (RegExp Char) where
  190   p == q  =  regW p == (regW q :: RegW Bool Char)
  191 
  192 instance Eq (RegW Bool Char) where
  193   p == q  =  reg p == reg q
  194 
  195 instance Eq (Reg Bool Char) where
  196   Eps     == Eps      =  True
  197   Sym s _ == Sym t _  =  s==t
  198   Alt a b == Alt c d  =  a==c && b==d
  199   Seq a b == Seq c d  =  a==c && b==d
  200   Rep a   == Rep b    =  a==b
  201   _       == _        =  False