1 {-# LANGUAGE FlexibleInstances #-} 2 {-# OPTIONS -fno-warn-orphans #-} 3 4 -- | 5 -- Module : Text.RegExp 6 -- Copyright : Thomas Wilke, Frank Huch, and Sebastian Fischer 7 -- License : BSD3 8 -- Maintainer : Sebastian Fischer <mailto:sebf@informatik.uni-kiel.de> 9 -- Stability : experimental 10 -- 11 -- This library provides a simple and fast regular expression matcher 12 -- that is implemented in Haskell without binding to external 13 -- libraries. 14 -- 15 -- There are different ways to implement regular expression 16 -- matching. Backtracking algorithms are simple but need bookkeeping 17 -- overhead for nondeterministic search. One can use deterministic 18 -- finite automata (DFA, see 19 -- <http://swtch.com/~rsc/regexp/regexp1.html>) to match regular 20 -- expressions faster. But for certain regular expressions these DFA 21 -- are exponentially large which sometimes leads to prohibitive memory 22 -- requirements. 23 -- 24 -- We use a smart and simple algorithm to generate a DFA from a 25 -- regular expression and do not generate the DFA completely but on 26 -- the fly while parsing. This leads to a linear-time deterministic 27 -- algorithm with constant space requirements. More specifically, the 28 -- run time is limited by the product of the sizes of the regular 29 -- expression and the string and the memory is limited by the size of 30 -- the regular expression. 31 -- 32 module Text.RegExp ( 33 34 module Data.Semiring, Weight(..), 35 36 -- * Constructing regular expressions 37 38 RegExp, fromString, 39 40 eps, char, sym, psym, anySym, noMatch, alt, seq_, rep, rep1, opt, brep, 41 42 perm, 43 44 -- * Matching 45 46 (=~), acceptFull, acceptPartial, matchingCount, fullMatch, partialMatch 47 48 ) where 49 50 import Data.Semiring 51 import qualified Data.String 52 53 import Text.RegExp.Data 54 import Text.RegExp.Parser 55 import Text.RegExp.Matching 56 57 -- | 58 -- Parses a regular expression from its string representation. If the 59 -- 'OverloadedStrings' language extension is enabled, string literals 60 -- can be used as regular expressions without using 'fromString' 61 -- explicitly. Implicit conversion is especially useful in combination 62 -- with functions like '=~' that take a value of type @RegExp Char@ as 63 -- argument. 64 -- 65 -- Here are some examples of supported regular expressions along with 66 -- an explanation what they mean: 67 -- 68 -- * @a@ matches the character @a@ 69 -- 70 -- * @[abc]@ matches any of the characters @a@, @b@, or @c@. It is 71 -- equivalent to @(a|b|c)@, but @|@ can be used to specify 72 -- alternatives between arbitrary regular expressions, not only 73 -- characters. 74 -- 75 -- * @[^abc]@ matches anything but the characters @a@, @b@, or @c@. 76 -- 77 -- * @\\d@ matches a digit and is equivalent to @[0-9]@. Moreover, 78 -- @\\D@ matches any non-digit character, @\\s@ and @\\S@ match 79 -- space and non-space characters and @\\w@ and @\\W@ match word 80 -- characters and non-word characters, that is, @\\w@ abbreviates 81 -- @[a-zA-Z_]@. 82 -- 83 -- * @a?@ matches the empty word or the character @a@, @a*@ matches 84 -- zero or more occurrences of @a@, and @a+@ matches one or more 85 -- @a@'s. 86 -- 87 -- * @.@ (the dot) matches one arbitrary character. 88 -- 89 -- * @a{4,7}@ matches four to seven occurrences of @a@, @a{2}@ 90 -- matches two. 91 -- 92 fromString :: String -> RegExp Char 93 fromString = Data.String.fromString 94 95 instance Data.String.IsString (RegExp Char) where 96 fromString = parse 97 98 -- | 99 -- Matches a sequence of the given regular expressions in any 100 -- order. For example, the regular expression 101 -- 102 -- @ 103 -- perm (map char \"abc\") 104 -- @ 105 -- 106 -- has the same meaning as 107 -- 108 -- @ 109 -- abc|acb|bca|bac|cba|cab 110 -- @ 111 -- 112 -- and is represented as 113 -- 114 -- @ 115 -- a(bc|cb)|b(ca|ac)|c(ba|ab) 116 -- @ 117 -- 118 perm :: [RegExp c] -> RegExp c 119 perm [] = eps 120 perm [r] = r 121 perm rs = go rs [] 122 where 123 go [p] qs = p `seq_` perm qs 124 go (p:ps) qs = (p `seq_` perm (ps ++ qs)) `alt` go ps (p:qs) 125 126 -- | 127 -- Alias for 'acceptFull' specialized for Strings. Useful in combination 128 -- with the 'IsString' instance for 'RegExp' 'Char' 129 -- 130 (=~) :: RegExp Char -> String -> Bool 131 (=~) = acceptFull 132