summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFelipeLessa <>2010-12-27 12:30:55 (GMT)
committerLuite Stegeman <luite@luite.com>2010-12-27 12:30:55 (GMT)
commit78259eb6d84853c8090b96e494a6858d9bf07b5b (patch)
tree5f49b44c34c9846e53d4a51afc0454673895f2e3
version 0.8.0.00.8.0.0
-rw-r--r--Data/Attoparsec/Text.hs222
-rw-r--r--Data/Attoparsec/Text/FastSet.hs63
-rw-r--r--Data/Attoparsec/Text/Internal.hs458
-rw-r--r--Data/Attoparsec/Text/Lazy.hs89
-rw-r--r--LICENSE30
-rw-r--r--README.markdown29
-rw-r--r--Setup.lhs3
-rw-r--r--attoparsec-text.cabal49
-rw-r--r--benchmarks/Makefile10
-rw-r--r--benchmarks/Tiny.hs49
-rw-r--r--benchmarks/med.txt.bz2bin0 -> 518 bytes
-rw-r--r--tests/Makefile14
-rw-r--r--tests/QC.hs114
-rw-r--r--tests/QCSupport.hs53
-rw-r--r--tests/TestFastSet.hs11
15 files changed, 1194 insertions, 0 deletions
diff --git a/Data/Attoparsec/Text.hs b/Data/Attoparsec/Text.hs
new file mode 100644
index 0000000..6636ad0
--- /dev/null
+++ b/Data/Attoparsec/Text.hs
@@ -0,0 +1,222 @@
+-- |
+-- Module : Data.Attoparsec.Text
+-- Copyright : Felipe Lessa 2010, Bryan O'Sullivan 2007-2010
+-- License : BSD3
+--
+-- Maintainer : felipe.lessa@gmail.com
+-- Stability : experimental
+-- Portability : unknown
+--
+-- Simple, efficient combinator parsing for 'T.Text' strings,
+-- loosely based on the Parsec library.
+
+module Data.Attoparsec.Text
+ (
+ -- * Differences from Parsec
+ -- $parsec
+
+ -- * Performance considerations
+ -- $performance
+
+ -- * Parser types
+ I.Parser
+ , Result(..)
+
+ -- ** Typeclass instances
+ -- $instances
+
+ -- * Running parsers
+ , parse
+ , feed
+ , parseWith
+ , parseTest
+
+ -- ** Result conversion
+ , maybeResult
+ , eitherResult
+
+ -- * Combinators
+ , (I.<?>)
+ , I.try
+ , module Data.Attoparsec.Combinator
+
+ -- * Parsing individual characters
+ , I.char
+ , I.anyChar
+ , I.notChar
+ , I.satisfy
+ , I.satisfyWith
+ , I.skip
+
+ -- ** Character classes
+ , I.inClass
+ , I.notInClass
+
+ -- * Efficient string handling
+ , I.string
+ , I.skipWhile
+ , I.take
+ , I.takeWhile
+ , I.takeWhile1
+ , I.takeTill
+
+ -- * State observation and manipulation functions
+ , I.endOfInput
+ , I.ensure
+ ) where
+
+import Data.Attoparsec.Combinator
+import qualified Data.Attoparsec.Text.Internal as I
+import qualified Data.Text as T
+
+-- $parsec
+--
+-- Compared to Parsec 3, Attoparsec makes several tradeoffs. It is
+-- not intended for, or ideal for, all possible uses.
+--
+-- * While Attoparsec can consume input incrementally, Parsec cannot.
+-- Incremental input is a huge deal for efficient and secure network
+-- and system programming, since it gives much more control to users
+-- of the library over matters such as resource usage and the I/O
+-- model to use.
+--
+-- * Much of the performance advantage of Attoparsec is gained via
+-- high-performance parsers such as 'I.takeWhile' and 'I.string'.
+-- If you use complicated combinators that return lists of
+-- characters, there really isn't much performance difference the
+-- two libraries.
+--
+-- * Unlike Parsec 3, Attoparsec does not support being used as a
+-- monad transformer. This is mostly a matter of the implementor
+-- not having needed that functionality.
+--
+-- * Attoparsec is specialised to deal only with strict 'T.Text'
+-- input. Efficiency concernts rule out both lists and lazy
+-- texts. The usual use for lazy texts would be to allow
+-- consumption of very large input without a large footprint.
+-- For this need, Attoparsec's incremental input provides an
+-- excellent substitute, with much more control over when input
+-- takes place.
+--
+-- * Parsec parsers can produce more helpful error messages than
+-- Attoparsec parsers. This is a matter of focus: Attoparsec avoids
+-- the extra book-keeping in favour of higher performance.
+
+-- $performance
+--
+-- To actually achieve high performance, there are a few guidelines
+-- that it is useful to follow.
+--
+-- Use the 'T.Text'-oriented parsers whenever possible,
+-- e.g. 'I.takeWhile1' instead of 'many1' 'I.anyChar'. There is
+-- about a factor of 100 difference in performance between the
+-- two kinds of parser.
+--
+-- For very simple character-testing predicates, write them by
+-- hand instead of using 'I.inClass' or 'I.notInClass'. For
+-- instance, both of these predicates test for an end-of-line
+-- character, but the first is much faster than the second:
+--
+-- >endOfLine_fast c = w == '\r' || c == '\n'
+-- >endOfLine_slow = inClass "\r\n"
+--
+-- Make active use of benchmarking and profiling tools to measure,
+-- find the problems with, and improve the performance of your parser.
+
+-- $instances
+--
+-- The 'I.Parser' type is an instance of the following classes:
+--
+-- * 'Monad', where 'fail' throws an exception (i.e. fails) with an
+-- error message.
+--
+-- * 'Functor' and 'Applicative', which follow the usual definitions.
+--
+-- * 'MonadPlus', where 'mzero' fails (with no error message) and
+-- 'mplus' executes the right-hand parser if the left-hand one
+-- fails.
+--
+-- * 'Alternative', which follows 'MonadPlus'.
+--
+-- The 'Result' type is an instance of 'Functor', where 'fmap'
+-- transforms the value in a 'Done' result.
+
+-- | The result of a parse.
+data Result r = Fail !T.Text [String] String
+ -- ^ The parse failed. The 'T.Text' is the input
+ -- that had not yet been consumed when the failure
+ -- occurred. The @[@'String'@]@ is a list of contexts
+ -- in which the error occurred. The 'String' is the
+ -- message describing the error, if any.
+ | Partial (T.Text -> Result r)
+ -- ^ Supply this continuation with more input so that
+ -- the parser can resume. To indicate that no more
+ -- input is available, use an 'T.empty' string.
+ | Done !T.Text r
+ -- ^ The parse succeeded. The 'T.Text' is the
+ -- input that had not yet been consumed (if any) when
+ -- the parse succeeded.
+
+instance Show r => Show (Result r) where
+ show (Fail bs stk msg) =
+ "Fail " ++ show bs ++ " " ++ show stk ++ " " ++ show msg
+ show (Partial _) = "Partial _"
+ show (Done bs r) = "Done " ++ show bs ++ " " ++ show r
+
+-- | If a parser has returned a 'Partial' result, supply it with more
+-- input.
+feed :: Result r -> T.Text -> Result r
+feed f@(Fail _ _ _) _ = f
+feed (Partial k) d = k d
+feed (Done bs r) d = Done (T.append bs d) r
+
+fmapR :: (a -> b) -> Result a -> Result b
+fmapR _ (Fail st stk msg) = Fail st stk msg
+fmapR f (Partial k) = Partial (fmapR f . k)
+fmapR f (Done bs r) = Done bs (f r)
+
+instance Functor Result where
+ fmap = fmapR
+
+-- | Run a parser and print its result to standard output.
+parseTest :: (Show a) => I.Parser a -> T.Text -> IO ()
+parseTest p s = print (parse p s)
+
+translate :: I.Result a -> Result a
+translate (I.Fail st stk msg) = Fail (I.input st) stk msg
+translate (I.Partial k) = Partial (translate . k)
+translate (I.Done st r) = Done (I.input st) r
+
+-- | Run a parser and return its result.
+parse :: I.Parser a -> T.Text -> Result a
+parse m s = translate (I.parse m s)
+{-# INLINE parse #-}
+
+-- | Run a parser with an initial input string, and a monadic action
+-- that can supply more input if needed.
+parseWith :: Monad m =>
+ (m T.Text)
+ -- ^ An action that will be executed to provide the parser
+ -- with more input, if necessary. The action must return an
+ -- 'T.empty' string when there is no more input available.
+ -> I.Parser a
+ -> T.Text
+ -- ^ Initial input for the parser.
+ -> m (Result a)
+parseWith refill p s = step $ I.parse p s
+ where step (I.Fail st stk msg) = return $! Fail (I.input st) stk msg
+ step (I.Partial k) = (step . k) =<< refill
+ step (I.Done st r) = return $! Done (I.input st) r
+
+-- | Convert a 'Result' value to a 'Maybe' value. A 'Partial' result
+-- is treated as failure.
+maybeResult :: Result r -> Maybe r
+maybeResult (Done _ r) = Just r
+maybeResult _ = Nothing
+
+-- | Convert a 'Result' value to an 'Either' value. A 'Partial' result
+-- is treated as failure.
+eitherResult :: Result r -> Either String r
+eitherResult (Done _ r) = Right r
+eitherResult (Fail _ _ msg) = Left msg
+eitherResult _ = Left "Result: incomplete input"
diff --git a/Data/Attoparsec/Text/FastSet.hs b/Data/Attoparsec/Text/FastSet.hs
new file mode 100644
index 0000000..a97bc6b
--- /dev/null
+++ b/Data/Attoparsec/Text/FastSet.hs
@@ -0,0 +1,63 @@
+-----------------------------------------------------------------------------
+-- |
+-- Module : Data.Attoparsec.FastSet
+-- Copyright : Felipe Lessa 2010, Bryan O'Sullivan 2008
+-- License : BSD3
+--
+-- Maintainer : felipe.lessa@gmail.com
+-- Stability : experimental
+-- Portability : unknown
+--
+-- Fast set membership tests for 'Char' values. The set
+-- representation is unboxed for efficiency. We test for
+-- membership using a binary search.
+--
+-----------------------------------------------------------------------------
+module Data.Attoparsec.Text.FastSet
+ (
+ -- * Data type
+ FastSet
+ -- * Construction
+ , fromList
+ , set
+ -- * Lookup
+ , member
+ -- * Handy interface
+ , charClass
+ ) where
+
+import Data.List (sort)
+import qualified Data.Array.Base as AB
+import qualified Data.Array.Unboxed as A
+import qualified Data.Text as T
+
+newtype FastSet = FastSet (A.UArray Int Char)
+ deriving (Eq, Ord, Show)
+
+-- | Create a set.
+set :: T.Text -> FastSet
+set t = mkSet (T.length t) (sort $ T.unpack t)
+
+fromList :: [Char] -> FastSet
+fromList cs = mkSet (length cs) (sort cs)
+
+mkSet :: Int -> [Char] -> FastSet
+mkSet l = FastSet . A.listArray (0,l-1)
+
+-- | Check the set for membership.
+member :: Char -> FastSet -> Bool
+member c (FastSet a) = uncurry search (A.bounds a)
+ where search lo hi
+ | hi < lo = False
+ | otherwise =
+ let mid = (lo + hi) `div` 2
+ in case compare c (AB.unsafeAt a mid) of
+ GT -> search (mid + 1) hi
+ LT -> search lo (mid - 1)
+ _ -> True
+
+charClass :: String -> FastSet
+charClass = fromList . go
+ where go (a:'-':b:xs) = [a..b] ++ go xs
+ go (x:xs) = x : go xs
+ go _ = ""
diff --git a/Data/Attoparsec/Text/Internal.hs b/Data/Attoparsec/Text/Internal.hs
new file mode 100644
index 0000000..8b2ed98
--- /dev/null
+++ b/Data/Attoparsec/Text/Internal.hs
@@ -0,0 +1,458 @@
+{-# LANGUAGE Rank2Types, RecordWildCards #-}
+-- |
+-- Module : Data.Attoparsec.Text.Internal
+-- Copyright : Felipe Lessa 2010, Bryan O'Sullivan 2007-2010
+-- License : BSD3
+--
+-- Maintainer : felipe.lessa@gmail.com
+-- Stability : experimental
+-- Portability : unknown
+--
+-- Simple, efficient parser combinators for 'T.Text' strings,
+-- loosely based on the Parsec library, heavily based on attoparsec.
+
+module Data.Attoparsec.Text.Internal
+ (
+ -- * Parser types
+ Parser
+ , Result(..)
+ , S(input)
+
+ -- * Running parsers
+ , parse
+
+ -- * Combinators
+ , (<?>)
+ , try
+ , module Data.Attoparsec.Combinator
+
+ -- * Parsing individual characters
+ , satisfy
+ , satisfyWith
+ , anyChar
+ , skip
+ , char
+ , notChar
+
+ -- ** Character classes
+ , inClass
+ , notInClass
+
+ -- * Efficient string handling
+ , skipWhile
+ , string
+ , stringTransform
+ , take
+ , takeWhile
+ , takeWhile1
+ , takeTill
+
+ -- * State observation and manipulation functions
+ , endOfInput
+ , ensure
+
+ -- * Utilities
+ , endOfLine
+ ) where
+
+import Control.Applicative (Alternative(..), Applicative(..), (<$>))
+import Control.Monad (MonadPlus(..), when)
+import Data.Attoparsec.Combinator
+import Data.Attoparsec.Text.FastSet (charClass, member)
+import Data.Monoid (Monoid(..))
+import Prelude hiding (getChar, take, takeWhile)
+import qualified Data.Text as T
+
+data Result r = Fail S [String] String
+ | Partial (T.Text -> Result r)
+ | Done S r
+
+-- | The 'Parser' type is a monad.
+newtype Parser a = Parser {
+ runParser :: forall r. S
+ -> Failure r
+ -> Success a r
+ -> Result r
+ }
+
+type Failure r = S -> [String] -> String -> Result r
+type Success a r = S -> a -> Result r
+
+-- | Have we read all available input?
+data More = Complete | Incomplete
+ deriving (Eq, Show)
+
+plusMore :: More -> More -> More
+plusMore Complete _ = Complete
+plusMore _ Complete = Complete
+plusMore _ _ = Incomplete
+{-# INLINE plusMore #-}
+
+instance Monoid More where
+ mempty = Incomplete
+ mappend = plusMore
+
+data S = S {
+ input :: !T.Text
+ , _added :: !T.Text
+ , more :: !More
+ } deriving (Show)
+
+instance Show r => Show (Result r) where
+ show (Fail _ stack msg) = "Fail " ++ show stack ++ " " ++ show msg
+ show (Partial _) = "Partial _"
+ show (Done bs r) = "Done " ++ show bs ++ " " ++ show r
+
+addS :: S -> S -> S
+addS (S s0 a0 c0) (S _s1 a1 c1) = S (s0 +++ a1) (a0 +++ a1) (mappend c0 c1)
+{-# INLINE addS #-}
+
+instance Monoid S where
+ mempty = S T.empty T.empty Incomplete
+ mappend = addS
+
+bindP :: Parser a -> (a -> Parser b) -> Parser b
+bindP m g =
+ Parser (\st0 kf ks -> runParser m st0 kf (\s a -> runParser (g a) s kf ks))
+{-# INLINE bindP #-}
+
+returnP :: a -> Parser a
+returnP a = Parser (\st0 _kf ks -> ks st0 a)
+{-# INLINE returnP #-}
+
+instance Monad Parser where
+ return = returnP
+ (>>=) = bindP
+ fail = failDesc
+
+noAdds :: S -> S
+noAdds (S s0 _a0 c0) = S s0 T.empty c0
+{-# INLINE noAdds #-}
+
+plus :: Parser a -> Parser a -> Parser a
+plus a b = Parser $ \st0 kf ks ->
+ let kf' st1 _ _ = runParser b (mappend st0 st1) kf ks
+ !st2 = noAdds st0
+ in runParser a st2 kf' ks
+{-# INLINE plus #-}
+
+instance MonadPlus Parser where
+ mzero = failDesc "mzero"
+ mplus = plus
+
+fmapP :: (a -> b) -> Parser a -> Parser b
+fmapP p m = Parser (\st0 f k -> runParser m st0 f (\s a -> k s (p a)))
+{-# INLINE fmapP #-}
+
+instance Functor Parser where
+ fmap = fmapP
+
+apP :: Parser (a -> b) -> Parser a -> Parser b
+apP d e = do
+ b <- d
+ a <- e
+ return (b a)
+{-# INLINE apP #-}
+
+instance Applicative Parser where
+ pure = returnP
+ (<*>) = apP
+
+ -- These definitions are equal to the defaults, but this
+ -- way the optimizer doesn't have to work so hard to figure
+ -- that out.
+ (*>) = (>>)
+ x <* y = x >>= \a -> y >> return a
+
+instance Alternative Parser where
+ empty = failDesc "empty"
+ (<|>) = plus
+
+failDesc :: String -> Parser a
+failDesc err = Parser (\st0 kf _ks -> kf st0 [] msg)
+ where msg = "Failed reading: " ++ err
+{-# INLINE failDesc #-}
+
+-- | Succeed only if at least @n@ characters of input are available.
+ensure :: Int -> Parser ()
+ensure n = Parser $ \st0@(S s0 _a0 _c0) kf ks ->
+ if T.length s0 >= n
+ then ks st0 ()
+ else runParser (demandInput >> ensure n) st0 kf ks
+
+-- | Ask for input. If we receive any, pass it to a success
+-- continuation, otherwise to a failure continuation.
+prompt :: S -> (S -> Result r) -> (S -> Result r) -> Result r
+prompt (S s0 a0 _c0) kf ks = Partial $ \s ->
+ if T.null s
+ then kf $! S s0 a0 Complete
+ else ks $! S (s0 +++ s) (a0 +++ s) Incomplete
+
+-- | Immediately demand more input via a 'Partial' continuation
+-- result.
+demandInput :: Parser ()
+demandInput = Parser $ \st0 kf ks ->
+ if more st0 == Complete
+ then kf st0 ["demandInput"] "not enough characters"
+ else prompt st0 (\st -> kf st ["demandInput"] "not enough characters") (`ks` ())
+
+-- | This parser always succeeds. It returns 'True' if any input is
+-- available either immediately or on demand, and 'False' if the end
+-- of all input has been reached.
+wantInput :: Parser Bool
+wantInput = Parser $ \st0@(S s0 _a0 c0) _kf ks ->
+ case () of
+ _ | not (T.null s0) -> ks st0 True
+ | c0 == Complete -> ks st0 False
+ | otherwise -> prompt st0 (`ks` False) (`ks` True)
+
+get :: Parser T.Text
+get = Parser (\st0 _kf ks -> ks st0 (input st0))
+
+put :: T.Text -> Parser ()
+put s = Parser (\(S _s0 a0 c0) _kf ks -> ks (S s a0 c0) ())
+
+(+++) :: T.Text -> T.Text -> T.Text
+(+++) = T.append
+{-# INLINE (+++) #-}
+
+-- | Attempt a parse, and if it fails, rewind the input so that no
+-- input appears to have been consumed.
+--
+-- This combinator is useful in cases where a parser might consume
+-- some input before failing, i.e. the parser needs arbitrary
+-- lookahead. The downside to using this combinator is that it can
+-- retain input for longer than is desirable.
+try :: Parser a -> Parser a
+try p = Parser $ \st0 kf ks ->
+ runParser p (noAdds st0) (kf . mappend st0) ks
+
+-- | The parser @satisfy p@ succeeds for any character for which
+-- the predicate @p@ returns 'True'. Returns the character that
+-- is actually parsed.
+--
+-- >import Data.Char (isDigit)
+-- >digit = satisfy isDigit
+satisfy :: (Char -> Bool) -> Parser Char
+satisfy p = do
+ ensure 1
+ s <- get
+ case T.uncons s of
+ Just (h,t) | p h -> put t >> return h
+ | otherwise -> fail "satisfy"
+
+-- | The parser @skip p@ succeeds for any character for which the
+-- predicate @p@ returns 'True'.
+--
+-- >import Data.Char (isDigit)
+-- >digit = satisfy isDigit
+skip :: (Char -> Bool) -> Parser ()
+skip p = do
+ ensure 1
+ s <- get
+ case T.uncons s of
+ Just (h,t) | p h -> put t
+ | otherwise -> fail "skip"
+
+
+-- | The parser @satisfyWith f p@ transforms a character, and
+-- succeeds if the predicate @p@ returns 'True' on the
+-- transformed value. The parser returns the transformed
+-- character that was parsed.
+satisfyWith :: (Char -> a) -> (a -> Bool) -> Parser a
+satisfyWith f p = do
+ ensure 1
+ s <- get
+ let Just (h,t) = T.uncons s
+ c = f h
+ if p c
+ then put t >> return c
+ else fail "satisfyWith"
+
+-- | Consume @n@ characters of input, but succeed only if the
+-- predicate returns 'True'.
+takeWith :: Int -> (T.Text -> Bool) -> Parser T.Text
+takeWith n p = do
+ ensure n
+ s <- get
+ let (h,t) = T.splitAt n s
+ if p h
+ then put t >> return h
+ else failDesc "takeWith"
+
+-- | Consume exactly @n@ characters of input.
+take :: Int -> Parser T.Text
+take n = takeWith n (const True)
+{-# INLINE take #-}
+
+-- | @string s@ parses a sequence of characters that identically
+-- match @s@. Returns the parsed string (i.e. @s@). This parser
+-- consumes no input if it fails (even if a partial match).
+--
+-- /Note/: The behaviour of this parser is different to that of the
+-- similarly-named parser in Parsec, as this one is all-or-nothing.
+-- To illustrate the difference, the following parser will fail under
+-- Parsec given an input of @"for"@:
+--
+-- >string "foo" <|> string "for"
+--
+-- The reason for its failure is that that the first branch is a
+-- partial match, and will consume the letters @\'f\'@ and
+-- @\'o\'@ before failing. In Attoparsec, both the original on
+-- bytestrings and this one on texts, the above parser will
+-- /succeed/ on that input, because the failed first branch will
+-- consume nothing.
+string :: T.Text -> Parser T.Text
+string s = takeWith (T.length s) (==s)
+{-# INLINE string #-}
+
+stringTransform :: (T.Text -> T.Text) -> T.Text
+ -> Parser T.Text
+stringTransform f s = takeWith (T.length s) ((==f s) . f)
+{-# INLINE stringTransform #-}
+
+-- | Skip past input for as long as the predicate returns 'True'.
+skipWhile :: (Char -> Bool) -> Parser ()
+skipWhile p = go
+ where
+ go = do
+ input <- wantInput
+ when input $ do
+ t <- T.dropWhile p <$> get
+ put t
+ when (T.null t) go
+
+-- | Consume input as long as the predicate returns 'False'
+-- (i.e. until it returns 'True'), and return the consumed input.
+--
+-- This parser does not fail. It will return an empty string if the
+-- predicate returns 'True' on the first character of input.
+--
+-- /Note/: Because this parser does not fail, do not use it with
+-- combinators such as 'many', because such parsers loop until a
+-- failure occurs. Careless use will thus result in an infinite loop.
+takeTill :: (Char -> Bool) -> Parser T.Text
+takeTill p = takeWhile (not . p)
+{-# INLINE takeTill #-}
+
+-- | Consume input as long as the predicate returns 'True', and return
+-- the consumed input.
+--
+-- This parser does not fail. It will return an empty string if the
+-- predicate returns 'False' on the first character of input.
+--
+-- /Note/: Because this parser does not fail, do not use it with
+-- combinators such as 'many', because such parsers loop until a
+-- failure occurs. Careless use will thus result in an infinite loop.
+takeWhile :: (Char -> Bool) -> Parser T.Text
+takeWhile p = go []
+ where
+ go acc = do
+ input <- wantInput
+ if input
+ then do
+#if MIN_VERSION_text(0,11,0)
+ (h,t) <- T.span p <$> get
+#else
+ (h,t) <- T.spanBy p <$> get
+#endif
+ put t
+ if T.null t
+ then go (h:acc)
+ else return $ if null acc then h else T.concat $ reverse (h:acc)
+ else return $ case acc of
+ [] -> T.empty
+ [x] -> x
+ _ -> T.concat $ reverse acc
+
+-- | Consume input as long as the predicate returns 'True', and return
+-- the consumed input.
+--
+-- This parser requires the predicate to succeed on at least one
+-- character of input: it will fail if the predicate never
+-- returns 'True' or if there is no input left.
+takeWhile1 :: (Char -> Bool) -> Parser T.Text
+takeWhile1 p = do
+ (`when` demandInput) =<< T.null <$> get
+#if MIN_VERSION_text(0,11,0)
+ (h,t) <- T.span p <$> get
+#else
+ (h,t) <- T.spanBy p <$> get
+#endif
+ when (T.null h) $ failDesc "takeWhile1"
+ put t
+ if T.null t
+ then (h+++) `fmapP` takeWhile p
+ else return h
+
+-- | Match any character in a set.
+--
+-- >vowel = inClass "aeiou"
+--
+-- Range notation is supported.
+--
+-- >halfAlphabet = inClass "a-nA-N"
+--
+-- To add a literal @\'-\'@ to a set, place it at the beginning or end
+-- of the string.
+inClass :: String -> Char -> Bool
+inClass s = (`member` mySet)
+ where mySet = charClass s
+{-# INLINE inClass #-}
+
+-- | Match any character not in a set.
+notInClass :: String -> Char -> Bool
+notInClass s = not . inClass s
+{-# INLINE notInClass #-}
+
+-- | Match any character.
+anyChar :: Parser Char
+anyChar = satisfy $ const True
+{-# INLINE anyChar #-}
+
+-- | Match a specific character.
+char :: Char -> Parser Char
+char c = satisfy (== c) <?> show c
+{-# INLINE char #-}
+
+-- | Match any character except the given one.
+notChar :: Char -> Parser Char
+notChar c = satisfy (/= c) <?> "not " ++ show c
+{-# INLINE notChar #-}
+
+-- | Match only if all input has been consumed.
+endOfInput :: Parser ()
+endOfInput = Parser $ \st0@S{..} kf ks ->
+ if T.null input
+ then if more == Complete
+ then ks st0 ()
+ else let kf' st1 _ _ = ks (mappend st0 st1) ()
+ ks' st1 _ = kf (mappend st0 st1) [] "endOfInput"
+ in runParser demandInput st0 kf' ks'
+ else kf st0 [] "endOfInput"
+
+-- | Match either a single newline character @\'\\n\'@, or a carriage
+-- return followed by a newline character @\"\\r\\n\"@.
+endOfLine :: Parser ()
+endOfLine = (char '\n' >> return ()) <|> (string (T.pack "\r\n") >> return ())
+
+--- | Name the parser, in case failure occurs.
+(<?>) :: Parser a
+ -> String -- ^ the name to use if parsing fails
+ -> Parser a
+p <?> msg = Parser $ \s kf ks -> runParser p s (\s' strs m -> kf s' (msg:strs) m) ks
+{-# INLINE (<?>) #-}
+infix 0 <?>
+
+-- | Terminal failure continuation.
+failK :: Failure a
+failK st0 stack msg = Fail st0 stack msg
+
+-- | Terminal success continuation.
+successK :: Success a a
+successK state a = Done state a
+
+-- | Run a parser.
+parse :: Parser a -> T.Text -> Result a
+parse m s = runParser m (S s T.empty Incomplete) failK successK
+{-# INLINE parse #-}
diff --git a/Data/Attoparsec/Text/Lazy.hs b/Data/Attoparsec/Text/Lazy.hs
new file mode 100644
index 0000000..3e53a52
--- /dev/null
+++ b/Data/Attoparsec/Text/Lazy.hs
@@ -0,0 +1,89 @@
+-- |
+-- Module : Data.Attoparsec.Text.Lazy
+-- Copyright : Felipe Lessa 2010, Bryan O'Sullivan 2010
+-- License : BSD3
+--
+-- Maintainer : felipe.lessa@gmail.com
+-- Stability : experimental
+-- Portability : unknown
+--
+-- Simple, efficient combinator parsing for lazy 'Text'
+-- strings, loosely based on the Parsec library.
+--
+-- This is essentially the same code as in the 'Data.Attoparsec'
+-- module, only with a 'parse' function that can consume a lazy
+-- 'Text' incrementally, and a 'Result' type that does not allow
+-- more input to be fed in. Think of this as suitable for use with a
+-- lazily read file, e.g. via 'L.readFile' or 'L.hGetContents'.
+--
+-- Behind the scenes, strict 'T.Text' values are still used
+-- internally to store parser input and manipulate it efficiently.
+-- High-performance parsers such as 'string' still expect strict
+-- 'T.Text' parameters.
+
+module Data.Attoparsec.Text.Lazy
+ (
+ Result(..)
+ , module Data.Attoparsec.Text
+ -- * Running parsers
+ , parse
+ , parseTest
+ -- ** Result conversion
+ , maybeResult
+ , eitherResult
+ ) where
+
+import Data.Text.Lazy (Text, fromChunks, toChunks)
+import qualified Data.Text as T
+import qualified Data.Attoparsec.Text as A
+import Data.Attoparsec.Text hiding (Result(..), eitherResult, maybeResult,
+ parse, parseWith, parseTest)
+
+-- | The result of a parse.
+data Result r = Fail Text [String] String
+ -- ^ The parse failed. The 'Text' is the input
+ -- that had not yet been consumed when the failure
+ -- occurred. The @[@'String'@]@ is a list of contexts
+ -- in which the error occurred. The 'String' is the
+ -- message describing the error, if any.
+ | Done Text r
+ -- ^ The parse succeeded. The 'Text' is the
+ -- input that had not yet been consumed (if any) when
+ -- the parse succeeded.
+
+instance Show r => Show (Result r) where
+ show (Fail bs stk msg) =
+ "Fail " ++ show bs ++ " " ++ show stk ++ " " ++ show msg
+ show (Done bs r) = "Done " ++ show bs ++ " " ++ show r
+
+fmapR :: (a -> b) -> Result a -> Result b
+fmapR _ (Fail st stk msg) = Fail st stk msg
+fmapR f (Done bs r) = Done bs (f r)
+
+instance Functor Result where
+ fmap = fmapR
+
+-- | Run a parser and return its result.
+parse :: A.Parser a -> Text -> Result a
+parse p s = case toChunks s of
+ x:xs -> go (A.parse p x) xs
+ [] -> go (A.parse p T.empty) []
+ where
+ go (A.Fail x stk msg) ys = Fail (fromChunks $ x:ys) stk msg
+ go (A.Done x r) ys = Done (fromChunks $ x:ys) r
+ go (A.Partial k) (y:ys) = go (k y) ys
+ go (A.Partial k) [] = go (k T.empty) []
+
+-- | Run a parser and print its result to standard output.
+parseTest :: (Show a) => A.Parser a -> Text -> IO ()
+parseTest p s = print (parse p s)
+
+-- | Convert a 'Result' value to a 'Maybe' value.
+maybeResult :: Result r -> Maybe r
+maybeResult (Done _ r) = Just r
+maybeResult _ = Nothing
+
+-- | Convert a 'Result' value to an 'Either' value.
+eitherResult :: Result r -> Either String r
+eitherResult (Done _ r) = Right r
+eitherResult (Fail _ _ msg) = Left msg
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..8164f53
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,30 @@
+Copyright (c) Felipe Lessa, Lennart Kolmodin
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the author nor the names of his contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.markdown b/README.markdown
new file mode 100644
index 0000000..751ba1c
--- /dev/null
+++ b/README.markdown
@@ -0,0 +1,29 @@
+# Welcome to attoparsec-text
+
+attoparsec-text is a fast Haskell parser combinator library. It
+is designed after attoparsec library but working with Text
+instead of ByteStrings.
+
+# Join in!
+
+I'm happy to receive bug reports, fixes, documentation enhancements,
+and other improvements.
+
+Please report bugs via the
+[bitbucket issue tracker](http://bitbucket.org/bos/attoparsec/issues).
+
+Master [Mercurial repository](http://bitbucket.org/bos/attoparsec):
+
+* `hg clone http://bitbucket.org/bos/attoparsec`
+
+There's also a [git mirror](http://github.com/bos/attoparsec):
+
+* `git clone git://github.com/bos/attoparsec.git`
+
+(You can create and contribute changes using either Mercurial or git.)
+
+Authors
+-------
+
+This library is written and maintained by Bryan O'Sullivan,
+<bos@serpentine.com>.
diff --git a/Setup.lhs b/Setup.lhs
new file mode 100644
index 0000000..5bde0de
--- /dev/null
+++ b/Setup.lhs
@@ -0,0 +1,3 @@
+#!/usr/bin/env runhaskell
+> import Distribution.Simple
+> main = defaultMain
diff --git a/attoparsec-text.cabal b/attoparsec-text.cabal
new file mode 100644
index 0000000..5cfdf4f
--- /dev/null
+++ b/attoparsec-text.cabal
@@ -0,0 +1,49 @@
+name: attoparsec-text
+version: 0.8.0.0
+license: BSD3
+license-file: LICENSE
+category: Text, Parsing
+author: Felipe Lessa <felipe.lessa@gmail.com>, Bryan O'Sullivan <bos@serpentine.com>
+maintainer: Felipe Lessa <felipe.lessa@gmail.com>
+stability: experimental
+tested-with: GHC == 6.12.1
+synopsis: Fast combinator parsing for texts
+cabal-version: >= 1.6
+-- homepage: http://bitbucket.org/bos/attoparsec
+-- bug-reports: http://bitbucket.org/bos/attoparsec/issues
+build-type: Simple
+description:
+ A fast parser combinator library, aimed particularly at dealing
+ efficiently with network protocols and complicated text/binary
+ file formats.
+ .
+ This library is basically a translation of the original
+ attoparsec library to use text instead of bytestrings.
+extra-source-files:
+ README.markdown
+ benchmarks/Makefile
+ benchmarks/Tiny.hs
+ benchmarks/med.txt.bz2
+ tests/Makefile
+ tests/QC.hs
+ tests/QCSupport.hs
+ tests/TestFastSet.hs
+-- examples/Makefile
+-- examples/Parsec_RFC2616.hs
+-- examples/RFC2616.hs
+-- examples/TestRFC2616.hs
+-- examples/rfc2616.c
+
+library
+ build-depends: base >= 3 && < 5,
+ attoparsec >= 0.7 && < 0.9,
+ text >= 0.10 && < 0.12,
+ containers >= 0.1.0.1 && < 0.4,
+ array >= 0.1 && < 0.4
+ extensions: CPP
+ exposed-modules: Data.Attoparsec.Text
+ Data.Attoparsec.Text.FastSet
+ Data.Attoparsec.Text.Lazy
+ other-modules: Data.Attoparsec.Text.Internal
+ ghc-options: -Wall
+ ghc-prof-options: -auto-all
diff --git a/benchmarks/Makefile b/benchmarks/Makefile
new file mode 100644
index 0000000..545ec86
--- /dev/null
+++ b/benchmarks/Makefile
@@ -0,0 +1,10 @@
+all: med.txt tiny
+
+tiny: Tiny.hs
+ ghc -O --make -o $@ $<
+
+%: %.bz2
+ bunzip2 -k $<
+
+clean:
+ -rm -f *.o *.hi tiny
diff --git a/benchmarks/Tiny.hs b/benchmarks/Tiny.hs
new file mode 100644
index 0000000..5135482
--- /dev/null
+++ b/benchmarks/Tiny.hs
@@ -0,0 +1,49 @@
+import Control.Applicative ((<|>))
+import Control.Monad (forM_)
+import Data.Char (isDigit, isLetter)
+import System.Environment (getArgs)
+import qualified Data.Attoparsec.Char8 as AB
+import qualified Data.ByteString.Char8 as B
+import qualified Data.Attoparsec.Text as A
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified Text.Parsec as P
+import qualified Text.Parsec.ByteString as P
+
+
+attoparsec_bytestring args = do
+ forM_ args $ \arg -> do
+ input <- B.readFile arg
+ case AB.parse p input `AB.feed` B.empty of
+ AB.Done _ xs -> print (length xs)
+ what -> print what
+ where
+ slow = AB.many (AB.many1 AB.letter_ascii <|> AB.many1 AB.digit)
+ fast = AB.many (AB.takeWhile1 isLetter <|> AB.takeWhile1 isDigit)
+ isDigit c = c >= '0' && c <= '9'
+ isLetter c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
+ p = fast
+
+attoparsec_text args = do
+ forM_ args $ \arg -> do
+ input <- T.readFile arg
+ case A.parse p input `A.feed` T.empty of
+ A.Done _ xs -> print (length xs)
+ what -> print what
+ where
+ p = A.many (A.takeWhile1 isLetter <|> A.takeWhile1 isDigit)
+
+parsec args =
+ forM_ args $ \arg -> do
+ input <- readFile arg
+ case P.parse (P.many (P.many1 P.letter P.<|> P.many1 P.digit)) "" input of
+ Left err -> print err
+ Right xs -> print (length xs)
+
+main = do
+ args <- getArgs
+ case args of
+ ("attoparsec_bytestring":xs) -> attoparsec_bytestring xs
+ ("attoparsec_text":xs) -> attoparsec_text xs
+ ("parsec":xs) -> parsec xs
+ [] -> putStrLn "Usage: ... [parsec|attoparsec_text|attoparsec_bytestring] inputs"
diff --git a/benchmarks/med.txt.bz2 b/benchmarks/med.txt.bz2
new file mode 100644
index 0000000..862f915
--- /dev/null
+++ b/benchmarks/med.txt.bz2
Binary files differ
diff --git a/tests/Makefile b/tests/Makefile
new file mode 100644
index 0000000..f30c11f
--- /dev/null
+++ b/tests/Makefile
@@ -0,0 +1,14 @@
+all: TestFastSet.out qc.out
+
+%.out: %.exe
+ ./$< | tee $<.tmp
+ mv $<.tmp $@
+
+qc.exe: QC.hs
+ ghc -O -fno-warn-orphans --make -o $@ $<
+
+%.exe: %.hs
+ ghc -O -fno-warn-orphans --make -o $@ $<
+
+clean:
+ -rm -f *.hi *.o *.exe *.out
diff --git a/tests/QC.hs b/tests/QC.hs
new file mode 100644
index 0000000..f302a03
--- /dev/null
+++ b/tests/QC.hs
@@ -0,0 +1,114 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Main (main) where
+
+import Control.Monad (forM_)
+import Data.Maybe (isJust)
+import Prelude hiding (takeWhile)
+import QCSupport
+import Test.Framework (defaultMain, testGroup)
+import Test.Framework.Providers.QuickCheck2 (testProperty)
+import Test.QuickCheck hiding (NonEmpty)
+import qualified Data.Attoparsec.Text as P
+import qualified Data.Text as T
+
+-- Make sure that structures whose types claim they are non-empty
+-- really are.
+
+nonEmptyList l = length (nonEmpty l) > 0
+ where types = l :: NonEmpty [Int]
+nonEmptyBS l = T.length (nonEmpty l) > 0
+
+-- Naming.
+
+{-
+label (NonEmpty s) = case parse (anyChar <?> s) T.empty of
+ (_, Left err) -> s `isInfixOf` err
+ _ -> False
+-}
+
+-- Basic byte-level combinators.
+
+maybeP p s = case P.parse p s `P.feed` T.empty of
+ P.Done _ i -> Just i
+ _ -> Nothing
+
+defP p s = P.parse p s `P.feed` T.empty
+
+satisfy w s = maybeP (P.satisfy (<=w)) (T.cons w s) == Just w
+
+char w s = maybeP (P.char w) (T.cons w s) == Just w
+
+anyChar s = maybeP P.anyChar s == if T.null s
+ then Nothing
+ else Just (T.head s)
+
+notChar w (NonEmpty s) = maybeP (P.notChar w) s == if v == w
+ then Nothing
+ else Just v
+ where v = T.head s
+
+string s = maybeP (P.string s) s == Just s
+
+skipWhile w s =
+ let t = T.dropWhile (<=w) s
+ in case defP (P.skipWhile (<=w)) s of
+ P.Done t' () -> t == t'
+ _ -> False
+
+takeCount (Positive k) s =
+ case maybeP (P.take k) s of
+ Nothing -> k > T.length s
+ Just s' -> k <= T.length s
+
+takeWhile w s =
+ let (h,t) = T.spanBy (==w) s
+ in case defP (P.takeWhile (==w)) s of
+ P.Done t' h' -> t == t' && h == h'
+ _ -> False
+
+takeWhile1 w s =
+ let s' = T.cons w s
+ (h,t) = T.spanBy (<=w) s'
+ in case defP (P.takeWhile1 (<=w)) s' of
+ P.Done t' h' -> t == t' && h == h'
+ _ -> False
+
+takeTill w s =
+ let (h,t) = T.breakBy (==w) s
+ in case defP (P.takeTill (==w)) s of
+ P.Done t' h' -> t == t' && h == h'
+ _ -> False
+
+ensure n s = case defP (P.ensure m) s of
+ P.Done _ () -> T.length s >= m
+ _ -> T.length s < m
+ where m = (n `mod` 220) - 20
+
+takeWhile1_empty = maybeP (P.takeWhile1 undefined) T.empty == Nothing
+
+endOfInput s = maybeP P.endOfInput s == if T.null s
+ then Just ()
+ else Nothing
+
+main = defaultMain tests
+
+tests = [
+ testGroup "fnord" [
+ testProperty "nonEmptyList" nonEmptyList,
+ testProperty "nonEmptyBS" nonEmptyBS,
+ testProperty "satisfy" satisfy,
+ testProperty "char" char,
+ testProperty "notChar" notChar,
+ testProperty "anyChar" anyChar,
+ testProperty "string" string,
+ testProperty "skipWhile" skipWhile,
+ testProperty "takeCount" takeCount,
+ testProperty "takeWhile" takeWhile,
+ testProperty "takeWhile1" takeWhile1,
+ testProperty "takeWhile1_empty" takeWhile1_empty,
+ testProperty "takeTill" takeTill,
+ testProperty "endOfInput" endOfInput,
+ testProperty "ensure" ensure
+ ]
+
+ ]
diff --git a/tests/QCSupport.hs b/tests/QCSupport.hs
new file mode 100644
index 0000000..c1b6292
--- /dev/null
+++ b/tests/QCSupport.hs
@@ -0,0 +1,53 @@
+{-# LANGUAGE FlexibleContexts, FlexibleInstances #-}
+module QCSupport
+ (
+ NonEmpty(..)
+ ) where
+
+import Control.Applicative
+import Data.Attoparsec.Text
+import System.Random (RandomGen, Random(..))
+import Test.QuickCheck hiding (NonEmpty)
+import qualified Data.Text as T
+import qualified Data.Text.Lazy as L
+
+integralRandomR :: (Integral a, RandomGen g) => (a,a) -> g -> (a,g)
+integralRandomR (a,b) g = case randomR (fromIntegral a :: Integer,
+ fromIntegral b :: Integer) g of
+ (x,g') -> (fromIntegral x, g')
+
+newtype NonEmpty a = NonEmpty { nonEmpty :: a }
+ deriving (Eq, Ord, Read, Show)
+
+instance Functor NonEmpty where
+ fmap f (NonEmpty a) = NonEmpty (f a)
+
+instance Applicative NonEmpty where
+ NonEmpty f <*> NonEmpty a = NonEmpty (f a)
+ pure a = NonEmpty a
+
+instance Arbitrary a => Arbitrary (NonEmpty [a]) where
+ arbitrary = NonEmpty <$> sized (\n -> choose (1,n+1) >>= vector)
+
+instance Arbitrary T.Text where
+ arbitrary = T.pack <$> arbitrary
+
+instance Arbitrary (NonEmpty T.Text) where
+ arbitrary = fmap T.pack <$> arbitrary
+
+instance Arbitrary L.Text where
+ arbitrary = sized $ \n -> resize (round (sqrt (toEnum n :: Double)))
+ ((L.fromChunks . map nonEmpty) <$> arbitrary)
+
+instance Arbitrary (NonEmpty L.Text) where
+ arbitrary = sized $ \n -> resize (round (sqrt (toEnum n :: Double)))
+ (fmap (L.fromChunks . map nonEmpty) <$> arbitrary)
+
+{-
+instance Random Word8 where
+ randomR = integralRandomR
+ random = randomR (minBound,maxBound)
+
+instance Arbitrary Word8 where
+ arbitrary = choose (minBound, maxBound)
+-} \ No newline at end of file
diff --git a/tests/TestFastSet.hs b/tests/TestFastSet.hs
new file mode 100644
index 0000000..8a98b35
--- /dev/null
+++ b/tests/TestFastSet.hs
@@ -0,0 +1,11 @@
+module Main (main) where
+
+import qualified Data.Attoparsec.Text.FastSet as F
+import Test.QuickCheck
+
+prop_AllMembers s =
+ let set = F.fromList s
+ in all (`F.member` set) s
+
+main = do
+ quickCheck prop_AllMembers