1 {-# LANGUAGE FlexibleInstances #-}
    2 {-# OPTIONS -fno-warn-orphans #-}
    3 
    4 -- |
    5 -- Module      : Text.RegExp
    6 -- Copyright   : Thomas Wilke, Frank Huch, and Sebastian Fischer
    7 -- License     : BSD3
    8 -- Maintainer  : Sebastian Fischer <mailto:sebf@informatik.uni-kiel.de>
    9 -- Stability   : experimental
   10 -- 
   11 -- This library provides a simple and fast regular expression matcher
   12 -- that is implemented in Haskell without binding to external
   13 -- libraries.
   14 -- 
   15 -- There are different ways to implement regular expression
   16 -- matching. Backtracking algorithms are simple but need bookkeeping
   17 -- overhead for nondeterministic search. One can use deterministic
   18 -- finite automata (DFA, see
   19 -- <http://swtch.com/~rsc/regexp/regexp1.html>) to match regular
   20 -- expressions faster. But for certain regular expressions these DFA
   21 -- are exponentially large which sometimes leads to prohibitive memory
   22 -- requirements.
   23 -- 
   24 -- We use a smart and simple algorithm to generate a DFA from a
   25 -- regular expression and do not generate the DFA completely but on
   26 -- the fly while parsing. This leads to a linear-time deterministic
   27 -- algorithm with constant space requirements. More specifically, the
   28 -- run time is limited by the product of the sizes of the regular
   29 -- expression and the string and the memory is limited by the size of
   30 -- the regular expression.
   31 -- 
   32 module Text.RegExp (
   33 
   34   module Data.Semiring, Weight(..),
   35 
   36   -- * Constructing regular expressions
   37 
   38   RegExp, fromString,
   39 
   40   eps, char, sym, psym, anySym, noMatch, alt, seq_, rep, rep1, opt, brep,
   41 
   42   perm,
   43 
   44   -- * Matching
   45 
   46   (=~), acceptFull, acceptPartial, matchingCount, fullMatch, partialMatch
   47 
   48   ) where
   49 
   50 import Data.Semiring
   51 import qualified Data.String
   52 
   53 import Text.RegExp.Data
   54 import Text.RegExp.Parser
   55 import Text.RegExp.Matching
   56 
   57 -- |
   58 -- Parses a regular expression from its string representation. If the
   59 -- 'OverloadedStrings' language extension is enabled, string literals
   60 -- can be used as regular expressions without using 'fromString'
   61 -- explicitly. Implicit conversion is especially useful in combination
   62 -- with functions like '=~' that take a value of type @RegExp Char@ as
   63 -- argument.
   64 -- 
   65 -- Here are some examples of supported regular expressions along with
   66 -- an explanation what they mean:
   67 -- 
   68 --  * @a@ matches the character @a@
   69 -- 
   70 --  * @[abc]@ matches any of the characters @a@, @b@, or @c@. It is
   71 --    equivalent to @(a|b|c)@, but @|@ can be used to specify
   72 --    alternatives between arbitrary regular expressions, not only
   73 --    characters.
   74 -- 
   75 --  * @[^abc]@ matches anything but the characters @a@, @b@, or @c@.
   76 -- 
   77 --  * @\\d@ matches a digit and is equivalent to @[0-9]@. Moreover,
   78 --    @\\D@ matches any non-digit character, @\\s@ and @\\S@ match
   79 --    space and non-space characters and @\\w@ and @\\W@ match word
   80 --    characters and non-word characters, that is, @\\w@ abbreviates
   81 --    @[a-zA-Z_]@.
   82 -- 
   83 --  * @a?@ matches the empty word or the character @a@, @a*@ matches
   84 --    zero or more occurrences of @a@, and @a+@ matches one or more
   85 --    @a@'s.
   86 -- 
   87 --  * @.@ (the dot) matches one arbitrary character.
   88 -- 
   89 --  * @a{4,7}@ matches four to seven occurrences of @a@, @a{2}@
   90 --    matches two.
   91 -- 
   92 fromString :: String -> RegExp Char
   93 fromString = Data.String.fromString
   94 
   95 instance Data.String.IsString (RegExp Char) where
   96   fromString = parse
   97 
   98 -- |
   99 -- Matches a sequence of the given regular expressions in any
  100 -- order. For example, the regular expression
  101 -- 
  102 -- @
  103 -- perm (map char \"abc\")
  104 -- @
  105 -- 
  106 -- has the same meaning as
  107 -- 
  108 -- @
  109 -- abc|acb|bca|bac|cba|cab
  110 -- @
  111 -- 
  112 -- and is represented as
  113 -- 
  114 -- @
  115 -- a(bc|cb)|b(ca|ac)|c(ba|ab)
  116 -- @
  117 -- 
  118 perm :: [RegExp c] -> RegExp c
  119 perm []  = eps
  120 perm [r] = r
  121 perm rs  = go rs []
  122  where
  123   go [p]    qs = p `seq_` perm qs
  124   go (p:ps) qs = (p `seq_` perm (ps ++ qs)) `alt` go ps (p:qs)
  125 
  126 -- | 
  127 -- Alias for 'acceptFull' specialized for Strings. Useful in combination
  128 -- with the 'IsString' instance for 'RegExp' 'Char'
  129 -- 
  130 (=~) :: RegExp Char -> String -> Bool
  131 (=~) = acceptFull
  132