1 
    2 > {-# LANGUAGE GeneralizedNewtypeDeriving #-}
    3 > {-# LANGUAGE MultiParamTypeClasses, FlexibleContexts, FlexibleInstances #-}
    4 > {-# LANGUAGE ScopedTypeVariables #-}
    5 > {-# LANGUAGE OverloadedStrings #-}
    6 > {-# OPTIONS_GHC -fno-warn-missing-methods -fno-warn-orphans #-}
    7 
    8 We specify a `Monoid` instance for a `newtype` of lists.
    9 
   10 > import Data.Monoid ( Monoid(..) )
   11 
   12 We use QuickCheck version 1 for testing because version 2 cannot be
   13 used in batch mode.
   14 
   15 > import Test.QuickCheck
   16 > import Test.QuickCheck.Batch
   17 > import Control.Monad ( ap, replicateM )
   18 > import Data.Char ( chr, ord )
   19 > import Data.List ( permutations )
   20 
   21 We import the semiring properties in order to check them for the
   22 defined instances. We also define our own `sum` function for
   23 semirings.
   24 
   25 > import Data.Semiring.Properties
   26 > import Prelude hiding ( sum )
   27 
   28 Finally, we need the `RegExp` datatype, the `symWeight` function from
   29 the `Weight` class, and the different semirings used for matching.
   30 
   31 > import Text.RegExp
   32 > import Text.RegExp.Data
   33 > import Text.RegExp.Matching.Leftmost.Type ( Leftmost(..) )
   34 > import Text.RegExp.Matching.Longest.Type  ( Longest(..) )
   35 > import Text.RegExp.Matching.LeftLong.Type ( LeftLong(..) )
   36 > import Text.RegExp.Matching.Leftmost ( getLeftmost )
   37 > import Text.RegExp.Matching.Longest  ( getLongest )
   38 > import Text.RegExp.Matching.LeftLong ( getLeftLong )
   39 > import qualified Text.RegExp.Matching.Leftmost as Leftmost
   40 > import qualified Text.RegExp.Matching.Longest  as Longest
   41 > import qualified Text.RegExp.Matching.LeftLong as LeftLong
   42 
   43 The `main` function runs all tests defined in this program.
   44 
   45 > main :: IO ()
   46 > main = 
   47 >  do runChecks "semiring laws (Bool)" (semiring'laws :: Checks Bool)
   48 >     runChecks "semiring laws (Int)" (semiring'laws :: Checks (Numeric Int))
   49 >     runChecks "semiring laws (Leftmost)" (semiring'laws :: Checks Leftmost)
   50 >     runChecks "semiring laws (Longest)" (semiring'laws :: Checks Longest)
   51 >     runChecks "semiring laws (LeftLong)" semiring'laws'LeftLong
   52 >     runTests (pad "full match") options $
   53 >       checks (full'match'spec acceptFull id :: Checks Bool) ++
   54 >       checks (full'match'spec matchingCount getNumeric
   55 >               :: Checks (Numeric Int))
   56 >     runTests (pad "partial match") options $
   57 >       checks (partial'match'spec acceptPartial id :: Checks Bool) ++
   58 >       checks (indexed'match'spec Leftmost.matching getLeftmost) ++
   59 >       checks (partial'match'spec Longest.matching getLongest) ++
   60 >       checks (indexed'match'spec LeftLong.matching getLeftLong)
   61 >     runTests (pad "parse printed regexp") options [run parse'printed]
   62 >     runChecks "lazy infinite regexps" infinite'regexp'checks
   63 >     runTests "permutation parsing" options [run perm'parser'check]
   64 >  where
   65 >   options = defOpt { no_of_tests = 1000, length_of_tests = 60 }
   66 >   runChecks s = runTests (pad s) options . checks
   67 >   pad s = replicate (25-length s) ' ' ++ s
   68 
   69 The `Arbitrary` instance for numeric types wraps the underlying
   70 instance. We also provide one for `Char` which is not predefined.
   71 
   72 > instance (Num a, Arbitrary a) => Arbitrary (Numeric a) where
   73 >   arbitrary = Numeric `fmap` arbitrary
   74 >
   75 > instance Arbitrary Char where
   76 >   arbitrary = elements "abcde \\|*+?.[]{}"
   77 
   78 We provide generic `Semiring` instances for the semirings used for
   79 matching.
   80 
   81 > instance Arbitrary Leftmost where
   82 >   arbitrary = frequency [ (1, return zero)
   83 >                         , (1, return one)
   84 >                         , (3, (Leftmost . abs) `fmap` arbitrary) ]
   85 >
   86 > instance Arbitrary Longest where
   87 >   arbitrary = frequency [ (1, return zero) 
   88 >                         , (1, return one)
   89 >                         , (3, (Longest . succ . abs) `fmap` arbitrary) ]
   90 >
   91 > instance Arbitrary LeftLong where
   92 >   arbitrary = frequency [ (1, return zero)
   93 >                         , (1, return one)
   94 >                         , (3, do x <- abs `fmap` arbitrary
   95 >                                  y <- abs `fmap` arbitrary
   96 >                                  return $ LeftLong (min x y) (max x y)) ]
   97 
   98 We define a list of `Checks` for the semiring laws.
   99 
  100 > semiring'laws :: (Arbitrary s, Show s, Semiring s) => Checks s
  101 > semiring'laws = mconcat [ prop2 plus'comm
  102 >                         , prop1 left'zero
  103 >                         , prop3 add'assoc
  104 >                         , prop1 left'one
  105 >                         , prop1 right'one
  106 >                         , prop3 mul'assoc
  107 >                         , prop3 left'distr
  108 >                         , prop3 right'distr
  109 >                         , prop1 left'ann
  110 >                         , prop1 right'ann
  111 >                         ]
  112 
  113 `Checks` is a `newtype` for a list of batch tests with a phantom type
  114 that can be used in definitions of the properties.
  115 
  116 > newtype Checks a = Checks { checks :: [TestOptions -> IO TestResult] }
  117 >  deriving ( Monoid )
  118 
  119 We define the auxiliary functions to create semiring properties with
  120 different arities.
  121 
  122 > prop1 :: (Arbitrary s, Show s, Testable a) => (s -> a) -> Checks s
  123 > prop1 prop = Checks [run prop]
  124 >
  125 > prop2 :: (Arbitrary s, Show s, Testable a) => (s -> s -> a) -> Checks s
  126 > prop2 prop = Checks [run prop]
  127 >
  128 > prop3 :: (Arbitrary s, Show s, Testable a) => (s-> s -> s -> a) -> Checks s
  129 > prop3 prop = Checks [run prop]
  130 
  131 The `LeftLong` type satisfies the distributive laws only with a
  132 precondition on all involved multiplications: multiplied matches must
  133 be adjacent and the start position must be smaller than the end
  134 position. This precondition is satisfied for all multiplications
  135 during regular expression matching.
  136 
  137 We define a variant of `semiring'laws` with this precondition on the
  138 distributive laws.
  139 
  140 > semiring'laws'LeftLong :: Checks LeftLong
  141 > semiring'laws'LeftLong = mconcat
  142 >   [ prop2 plus'comm
  143 >   , prop1 left'zero
  144 >   , prop3 add'assoc
  145 >   , prop1 left'one
  146 >   , prop1 right'one
  147 >   , prop3 mul'assoc
  148 >   , prop3 left'distr'LeftLong
  149 >   , prop3 right'distr'LeftLong
  150 >   , prop1 left'ann
  151 >   , prop1 right'ann
  152 >   ]
  153 
  154 For testing the distributive laws, we adjust the randomly generated
  155 `LeftLong` values such that the arguments of multiplications are
  156 adjacent.
  157 
  158 > left'distr'LeftLong :: LeftLong -> LeftLong -> LeftLong -> Bool
  159 > left'distr'LeftLong a b c = left'distr a (shift a b) (shift a c)
  160 >  where
  161 >   shift (LeftLong _ x) (LeftLong y z) = LeftLong (x+1) (z+x+1-y)
  162 >   shift _              x              = x
  163 >
  164 > right'distr'LeftLong :: LeftLong -> LeftLong -> LeftLong -> Bool
  165 > right'distr'LeftLong a b c = right'distr (shift a c) (shift b c) c
  166 >  where
  167 >   shift (LeftLong x y) (LeftLong z _) = LeftLong (x+z-1-y) (z-1)
  168 >   shift x              _              = x
  169 
  170 Now we turn to the correctness of the `match` function. In order to
  171 check it, we compare it with a executable specification which is
  172 correct by definition:
  173 
  174 > full'match'spec :: (Show a, Weight Char Char s)
  175 >                 => (RegExp Char -> String -> a)
  176 >                 -> (s -> a)
  177 >                 -> Checks s
  178 > full'match'spec = match'spec fullMatchSpec
  179 >
  180 > partial'match'spec :: (Show a, Weight Char Char s)
  181 >                    => (RegExp Char -> String -> a)
  182 >                    -> (s -> a)
  183 >                    -> Checks s
  184 > partial'match'spec = match'spec partialMatchSpec
  185 >
  186 > indexed'match'spec :: (Show a, Weight Char (Int,Char) s)
  187 >                    => (RegExp Char -> String -> a)
  188 >                    -> (s -> a)
  189 >                    -> Checks s
  190 > indexed'match'spec = match'spec (\r -> partialMatchSpec r . zip [(0::Int)..])
  191 >
  192 > match'spec :: (Show a, Semiring s)
  193 >            => (RegExp Char -> String -> s)
  194 >            -> (RegExp Char -> String -> a)
  195 >            -> (s -> a)
  196 >            -> Checks s
  197 > match'spec spec convmatch conv =
  198 >   Checks [run (check'match'spec spec convmatch conv)]
  199 >
  200 
  201 > check'match'spec :: (Show a, Semiring s)
  202 >                  => (RegExp Char -> String -> s)
  203 >                  -> (RegExp Char -> String -> a)
  204 >                  -> (s -> a)
  205 >                  -> RegExp Char -> String -> Bool
  206 > check'match'spec spec convmatch conv r s =
  207 >   show (convmatch r s') == show (conv (spec r s'))
  208 >  where s' = take 5 s
  209 
  210 To make this work, we need an `Arbitrary` instance for regular
  211 expressions.
  212 
  213 > instance Arbitrary (RegExp Char) where
  214 >   arbitrary = sized regexp
  215 >
  216 > regexp :: Int -> Gen (RegExp Char)
  217 > regexp 0 = frequency [ (1, return eps)
  218 >                      , (4, char `fmap` simpleChar) ]
  219 > regexp n = frequency [ (3, regexp 0)
  220 >                      , (1, alt  `fmap` subexp `ap` subexp)
  221 >                      , (2, seq_ `fmap` subexp `ap` subexp)
  222 >                      , (1, rep  `fmap` regexp (n-1))
  223 >                      , (2, fromString `fmap` parsedRegExp n) ]
  224 >  where subexp = regexp (n `div` 2)
  225 >
  226 > simpleChar :: Gen Char
  227 > simpleChar = elements "abcde"
  228 >
  229 > parsedRegExp :: Int -> Gen String
  230 > parsedRegExp n = frequency [ (4, symClass)
  231 >                            , (2, (++"?") `fmap` subexp)
  232 >                            , (2, (++"+") `fmap` subexp)
  233 >                            , (1, mkBrep1 =<< subexp)
  234 >                            , (1, mkBrep2 =<< subexp) ]
  235 >  where
  236 >   subexp = (($"") . showParen True . shows)
  237 >     `fmap` (resize (n-1) arbitrary :: Gen (RegExp Char))
  238 >
  239 >   mkBrep1 r = do x <- elements [0..3] :: Gen Int
  240 >                  return $ r ++ "{" ++ show x ++ "}"
  241 >
  242 >   mkBrep2 r = do x <- elements [0..2] :: Gen Int
  243 >                  y <- elements [0..2] :: Gen Int
  244 >                  return $ r ++ "{" ++ show x ++ "," ++ show (x+y) ++ "}"
  245 >
  246 > symClass :: Gen String
  247 > symClass = frequency [ (1, specialChar)
  248 >                      , (2, do n <- choose (0,3)
  249 >                               cs <- replicateM n charClass
  250 >                               s <- (["","^"]!!) `fmap` choose (0,1)
  251 >                               return $ "[" ++ s ++ concat cs ++ "]") ]
  252 >  where
  253 >   specialChar = elements (map (:[]) "." ++
  254 >                           map (\c -> '\\':[c]) "abcdewWdDsS \\|*+?.[]{}^")
  255 >   charClass   = oneof [ (:[]) `fmap` simpleChar
  256 >                       , specialChar
  257 >                       , do x <- simpleChar
  258 >                            y <- simpleChar
  259 >                            return $ x : '-' : [chr (ord x+ord y-ord 'a')] ]
  260 
  261 The specification of the matching function is defined inductively on
  262 the structure of a regular expression. It uses exhaustive search to
  263 find all possibilities to match a regexp against a word.
  264 
  265 > fullMatchSpec :: Weight a b s => RegExp a -> [b] -> s
  266 > fullMatchSpec (RegExp r) = matchSpec (reg (weighted r))
  267 >
  268 > matchSpec :: Semiring s => Reg s c -> [c] -> s
  269 > matchSpec Eps        u  =  if null u then one else zero
  270 > matchSpec (Sym _ f)  u  =  case u of [c] -> f c; _ -> zero
  271 > matchSpec (Alt p q)  u  =  matchSpec (reg p) u .+. matchSpec (reg q) u
  272 > matchSpec (Seq p q)  u  =
  273 >   sum [ matchSpec (reg p) u1 .*. matchSpec (reg q) u2 | (u1,u2) <- split u ]
  274 > matchSpec (Rep p)    u  =
  275 >   sum [ prod [ matchSpec (reg p) ui | ui <- ps] | ps <- parts u ]
  276 >
  277 > sum, prod :: Semiring s => [s] -> s
  278 > sum   =  foldr (.+.) zero
  279 > prod  =  foldr (.*.) one
  280 >
  281 > split :: [a] -> [([a],[a])]
  282 > split []      =  [([],[])]
  283 > split (c:cs)  =  ([],c:cs) : [ (c:s1,s2) | (s1,s2) <- split cs ]
  284 >
  285 > parts :: [a] ->  [[[a]]]
  286 > parts []      =  [[]]
  287 > parts [c]     =  [[[c]]]
  288 > parts (c:cs)  =  concat [ [(c:p):ps,[c]:p:ps] | p:ps <- parts cs ]
  289 
  290 We can perform a similar test for partial instead of full matches.
  291 
  292 > partialMatchSpec :: Weight a b s => RegExp a -> [b] -> s
  293 > partialMatchSpec (RegExp r) =
  294 >   matchSpec (reg (arb `seqW` weighted r `seqW` arb))
  295 >  where RegExp arb = rep anySym
  296 
  297 As a check for the parser, we check whether the representation
  298 generated by the `Show` instance of regular expressions can be parsed
  299 back and yields the original expression.
  300 
  301 > parse'printed :: RegExp Char -> Bool
  302 > parse'printed r = fromString (show r) == r
  303 
  304 We can also match infinite regular expressions lazily to recognize
  305 context-free or even context-sensitive languages.
  306 
  307 > infinite'regexp'checks :: Checks Bool
  308 > infinite'regexp'checks = Checks [run context'free, run context'sensitive]
  309 
  310 As an example for a context-free language, we recognize the language
  311  ${a^nb^n | n >= 0}$.
  312 
  313 > context'free :: String -> Bool
  314 > context'free s = isInAnBn s == (anbn =~ s)
  315 >
  316 > isInAnBn :: String -> Bool
  317 > isInAnBn s = all (=='a') xs && all (=='b') ys && length xs == length ys
  318 >  where (xs,ys) = break (=='b') s
  319 >
  320 > anbn :: RegExp Char
  321 > anbn = eps `alt` seq_ "a" (anbn `seq_` "b")
  322 
  323 As an example for a context-sensitive language we use the language
  324 ${a^nb^nc^n | n >= 0}$. To show that the alphabet cannot only contain
  325 characters, we use numbers instead of characters.
  326 
  327 > context'sensitive :: [Int] -> Bool
  328 > context'sensitive s = isInAnBnCn s == acceptFull anbncn s
  329 >
  330 > isInAnBnCn :: [Int] -> Bool
  331 > isInAnBnCn s = all (==1) xs && all (==2) ys && all (==3) zs
  332 >             && length xs == length ys && length ys == length zs
  333 >  where (xs,l)  = break (==2) s
  334 >        (ys,zs) = break (==3) l
  335 >
  336 > anbncn :: RegExp Int
  337 > anbncn = mkAnBnCn 0
  338 >  where
  339 >   mkAnBnCn n = brep (n,n) (sym 2) `seq_` brep (n,n) (sym 3)
  340 >          `alt` seq_ (sym 1) (mkAnBnCn (n+1))
  341 
  342 The library provides a combinator that matches a list of regular
  343 expressions in sequence, each occurring once in any order.
  344 
  345 > perm'parser'check :: String -> Bool
  346 > perm'parser'check cs = all (acceptFull (perm (map char s))) (permutations s)
  347 >  where s = take 5 cs
  348 
  349 We restrict the test to at most 5! (that is five factorial)
  350 permutations because otherwise there are too many. Note that it is
  351 possible to match much longer permutations:
  352 
  353     ghci> accept (perm (map char ['a'..'z'])) $ reverse ['a'..'z']
  354     True
  355     (0.05 secs, 8706356 bytes)
  356 
  357 But matching `perm (map char ['a'..'z'])` against *all* permutations
  358 of ['a'..'z'] takes too long.
  359