summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHerbertValerioRiedel <>2018-05-14 00:08:00 (GMT)
committerhdiff <hdiff@hdiff.luite.com>2018-05-14 00:08:00 (GMT)
commit7faeafadfbb752f200b17b9804e787adeb754f34 (patch)
tree1712162faf60a7547b42f0d2e15771b6f8f44bb8
version 1.2.3.0HEAD1.2.3.0master
-rw-r--r--Data/Text.hs1875
-rw-r--r--Data/Text/Array.hs329
-rw-r--r--Data/Text/Encoding.hs477
-rw-r--r--Data/Text/Encoding/Error.hs124
-rw-r--r--Data/Text/Foreign.hs158
-rw-r--r--Data/Text/IO.hs341
-rw-r--r--Data/Text/Internal.hs186
-rw-r--r--Data/Text/Internal/Builder.hs329
-rw-r--r--Data/Text/Internal/Builder/Functions.hs40
-rw-r--r--Data/Text/Internal/Builder/Int/Digits.hs26
-rw-r--r--Data/Text/Internal/Builder/RealFloat/Functions.hs57
-rw-r--r--Data/Text/Internal/Encoding/Fusion.hs209
-rw-r--r--Data/Text/Internal/Encoding/Fusion/Common.hs123
-rw-r--r--Data/Text/Internal/Encoding/Utf16.hs45
-rw-r--r--Data/Text/Internal/Encoding/Utf32.hs26
-rw-r--r--Data/Text/Internal/Encoding/Utf8.hs255
-rw-r--r--Data/Text/Internal/Functions.hs31
-rw-r--r--Data/Text/Internal/Fusion.hs231
-rw-r--r--Data/Text/Internal/Fusion/CaseMapping.hs1002
-rw-r--r--Data/Text/Internal/Fusion/Common.hs945
-rw-r--r--Data/Text/Internal/Fusion/Size.hs187
-rw-r--r--Data/Text/Internal/Fusion/Types.hs122
-rw-r--r--Data/Text/Internal/IO.hs166
-rw-r--r--Data/Text/Internal/Lazy.hs119
-rw-r--r--Data/Text/Internal/Lazy/Encoding/Fusion.hs324
-rw-r--r--Data/Text/Internal/Lazy/Fusion.hs120
-rw-r--r--Data/Text/Internal/Lazy/Search.hs134
-rw-r--r--Data/Text/Internal/Private.hs37
-rw-r--r--Data/Text/Internal/Read.hs62
-rw-r--r--Data/Text/Internal/Search.hs89
-rw-r--r--Data/Text/Internal/Unsafe.hs56
-rw-r--r--Data/Text/Internal/Unsafe/Char.hs119
-rw-r--r--Data/Text/Internal/Unsafe/Shift.hs72
-rw-r--r--Data/Text/Lazy.hs1729
-rw-r--r--Data/Text/Lazy/Builder.hs57
-rw-r--r--Data/Text/Lazy/Builder/Int.hs264
-rw-r--r--Data/Text/Lazy/Builder/RealFloat.hs245
-rw-r--r--Data/Text/Lazy/Encoding.hs250
-rw-r--r--Data/Text/Lazy/IO.hs195
-rw-r--r--Data/Text/Lazy/Internal.hs20
-rw-r--r--Data/Text/Lazy/Read.hs192
-rw-r--r--Data/Text/Read.hs200
-rw-r--r--Data/Text/Show.hs91
-rw-r--r--Data/Text/Unsafe.hs113
-rw-r--r--LICENSE26
-rw-r--r--Setup.lhs3
-rw-r--r--benchmarks/Setup.hs2
-rw-r--r--benchmarks/cbits/time_iconv.c35
-rw-r--r--benchmarks/haskell/Benchmarks.hs79
-rw-r--r--benchmarks/haskell/Benchmarks/Builder.hs75
-rw-r--r--benchmarks/haskell/Benchmarks/DecodeUtf8.hs67
-rw-r--r--benchmarks/haskell/Benchmarks/EncodeUtf8.hs33
-rw-r--r--benchmarks/haskell/Benchmarks/Equality.hs38
-rw-r--r--benchmarks/haskell/Benchmarks/FileRead.hs33
-rw-r--r--benchmarks/haskell/Benchmarks/FoldLines.hs58
-rw-r--r--benchmarks/haskell/Benchmarks/Mul.hs138
-rw-r--r--benchmarks/haskell/Benchmarks/Programs/BigTable.hs42
-rw-r--r--benchmarks/haskell/Benchmarks/Programs/Cut.hs98
-rw-r--r--benchmarks/haskell/Benchmarks/Programs/Fold.hs68
-rw-r--r--benchmarks/haskell/Benchmarks/Programs/Sort.hs71
-rw-r--r--benchmarks/haskell/Benchmarks/Programs/StripTags.hs53
-rw-r--r--benchmarks/haskell/Benchmarks/Programs/Throughput.hs41
-rw-r--r--benchmarks/haskell/Benchmarks/Pure.hs486
-rw-r--r--benchmarks/haskell/Benchmarks/ReadNumbers.hs93
-rw-r--r--benchmarks/haskell/Benchmarks/Replace.hs43
-rw-r--r--benchmarks/haskell/Benchmarks/Search.hs48
-rw-r--r--benchmarks/haskell/Benchmarks/Stream.hs104
-rw-r--r--benchmarks/haskell/Benchmarks/WordFrequencies.hs36
-rw-r--r--benchmarks/haskell/Multilang.hs32
-rw-r--r--benchmarks/haskell/Timer.hs30
-rw-r--r--benchmarks/python/cut.py12
-rwxr-xr-xbenchmarks/python/multilang.py50
-rw-r--r--benchmarks/python/sort.py13
-rw-r--r--benchmarks/python/strip_tags.py25
-rwxr-xr-xbenchmarks/python/utils.py18
-rw-r--r--benchmarks/ruby/cut.rb16
-rw-r--r--benchmarks/ruby/fold.rb50
-rw-r--r--benchmarks/ruby/sort.rb15
-rw-r--r--benchmarks/ruby/strip_tags.rb22
-rw-r--r--benchmarks/ruby/utils.rb14
-rw-r--r--benchmarks/text-benchmarks.cabal139
-rw-r--r--cbits/cbits.c179
-rw-r--r--changelog.md3
-rw-r--r--include/text_cbits.h11
-rw-r--r--scripts/ApiCompare.hs28
-rw-r--r--scripts/Arsec.hs44
-rw-r--r--scripts/CaseFolding.hs46
-rw-r--r--scripts/CaseMapping.hs38
-rw-r--r--scripts/SpecialCasing.hs56
-rw-r--r--tests-and-benchmarks.markdown68
-rw-r--r--tests/.ghci1
-rw-r--r--tests/LiteralRuleTest.hs21
-rw-r--r--tests/Makefile45
-rw-r--r--tests/Tests.hs13
-rw-r--r--tests/Tests/IO.hs34
-rw-r--r--tests/Tests/Properties.hs1400
-rw-r--r--tests/Tests/Properties/Mul.hs40
-rw-r--r--tests/Tests/QuickCheckUtils.hs368
-rw-r--r--tests/Tests/Regressions.hs93
-rw-r--r--tests/Tests/SlowFunctions.hs39
-rw-r--r--tests/Tests/Utils.hs52
-rw-r--r--tests/cabal.config6
-rwxr-xr-xtests/scripts/cover-stdio.sh62
-rw-r--r--tests/text-tests.cabal158
-rw-r--r--text-utf8.cabal266
105 files changed, 17049 insertions, 0 deletions
diff --git a/Data/Text.hs b/Data/Text.hs
new file mode 100644
index 0000000..890d008
--- /dev/null
+++ b/Data/Text.hs
@@ -0,0 +1,1875 @@
+{-# LANGUAGE BangPatterns, CPP, MagicHash, Rank2Types, UnboxedTuples #-}
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+#if __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+#if __GLASGOW_HASKELL__ >= 708
+{-# LANGUAGE TypeFamilies #-}
+#endif
+
+-- |
+-- Module : Data.Text
+-- Copyright : (c) 2009, 2010, 2011, 2012 Bryan O'Sullivan,
+-- (c) 2009 Duncan Coutts,
+-- (c) 2008, 2009 Tom Harper
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Portability : GHC
+--
+-- A time and space-efficient implementation of Unicode text.
+-- Suitable for performance critical use, both in terms of large data
+-- quantities and high speed.
+--
+-- /Note/: Read below the synopsis for important notes on the use of
+-- this module.
+--
+-- This module is intended to be imported @qualified@, to avoid name
+-- clashes with "Prelude" functions, e.g.
+--
+-- > import qualified Data.Text as T
+--
+-- To use an extended and very rich family of functions for working
+-- with Unicode text (including normalization, regular expressions,
+-- non-standard encodings, text breaking, and locales), see the
+-- <http://hackage.haskell.org/package/text-icu text-icu package >.
+--
+
+module Data.Text
+ (
+ -- * Strict vs lazy types
+ -- $strict
+
+ -- * Acceptable data
+ -- $replacement
+
+ -- * Definition of character
+ -- $character_definition
+
+ -- * Fusion
+ -- $fusion
+
+ -- * Types
+ Text
+
+ -- * Creation and elimination
+ , pack
+ , unpack
+ , singleton
+ , empty
+
+ -- * Basic interface
+ , cons
+ , snoc
+ , append
+ , uncons
+ , unsnoc
+ , head
+ , last
+ , tail
+ , init
+ , null
+ , length
+ , compareLength
+
+ -- * Transformations
+ , map
+ , intercalate
+ , intersperse
+ , transpose
+ , reverse
+ , replace
+
+ -- ** Case conversion
+ -- $case
+ , toCaseFold
+ , toLower
+ , toUpper
+ , toTitle
+
+ -- ** Justification
+ , justifyLeft
+ , justifyRight
+ , center
+
+ -- * Folds
+ , foldl
+ , foldl'
+ , foldl1
+ , foldl1'
+ , foldr
+ , foldr1
+
+ -- ** Special folds
+ , concat
+ , concatMap
+ , any
+ , all
+ , maximum
+ , minimum
+
+ -- * Construction
+
+ -- ** Scans
+ , scanl
+ , scanl1
+ , scanr
+ , scanr1
+
+ -- ** Accumulating maps
+ , mapAccumL
+ , mapAccumR
+
+ -- ** Generation and unfolding
+ , replicate
+ , unfoldr
+ , unfoldrN
+
+ -- * Substrings
+
+ -- ** Breaking strings
+ , take
+ , takeEnd
+ , drop
+ , dropEnd
+ , takeWhile
+ , takeWhileEnd
+ , dropWhile
+ , dropWhileEnd
+ , dropAround
+ , strip
+ , stripStart
+ , stripEnd
+ , splitAt
+ , breakOn
+ , breakOnEnd
+ , break
+ , span
+ , group
+ , groupBy
+ , inits
+ , tails
+
+ -- ** Breaking into many substrings
+ -- $split
+ , splitOn
+ , split
+ , chunksOf
+
+ -- ** Breaking into lines and words
+ , lines
+ --, lines'
+ , words
+ , unlines
+ , unwords
+
+ -- * Predicates
+ , isPrefixOf
+ , isSuffixOf
+ , isInfixOf
+
+ -- ** View patterns
+ , stripPrefix
+ , stripSuffix
+ , commonPrefixes
+
+ -- * Searching
+ , filter
+ , breakOnAll
+ , find
+ , partition
+
+ -- , findSubstring
+
+ -- * Indexing
+ -- $index
+ , index
+ , findIndex
+ , count
+
+ -- * Zipping
+ , zip
+ , zipWith
+
+ -- -* Ordered text
+ -- , sort
+
+ -- * Low level operations
+ , copy
+ , unpackCString#
+ ) where
+
+import Prelude (Char, Bool(..), Int, Maybe(..), String,
+ Eq(..), Ord(..), Ordering(..), (++),
+ Read(..),
+ (&&), (||), (+), (-), (.), ($), ($!), (>>),
+ not, return, otherwise, quot)
+#if defined(HAVE_DEEPSEQ)
+import Control.DeepSeq (NFData(rnf))
+#endif
+#if defined(ASSERTS)
+import Control.Exception (assert)
+#endif
+import Data.Char (isSpace)
+import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf), constrIndex,
+ Constr, mkConstr, DataType, mkDataType, Fixity(Prefix))
+import Control.Monad (foldM)
+import Control.Monad.ST (ST)
+import qualified Data.Text.Array as A
+import qualified Data.List as L
+import Data.Binary (Binary(get, put))
+import Data.Monoid (Monoid(..))
+#if MIN_VERSION_base(4,9,0)
+import Data.Semigroup (Semigroup(..))
+#endif
+import Data.String (IsString(..))
+import qualified Data.Text.Internal.Fusion as S
+import qualified Data.Text.Internal.Fusion.Common as S
+import Data.Text.Encoding (decodeUtf8', encodeUtf8)
+import Data.Text.Internal.Fusion (stream, reverseStream, unstream)
+import Data.Text.Internal.Private (span_)
+import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text)
+import Data.Text.Show (singleton, unpack, unpackCString#)
+import qualified Prelude as P
+import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter,
+ reverseIter_, unsafeHead, unsafeTail, takeWord8)
+import qualified Data.Text.Internal.Functions as F
+import qualified Data.Text.Internal.Encoding.Utf8 as U8
+import Data.Text.Internal.Search (indices)
+#if defined(__HADDOCK__)
+import Data.ByteString (ByteString)
+import qualified Data.Text.Lazy as L
+import Data.Int (Int64)
+#endif
+import GHC.Base (eqInt, neInt, gtInt, geInt, ltInt, leInt)
+#if __GLASGOW_HASKELL__ >= 708
+import qualified GHC.Exts as Exts
+#endif
+#if MIN_VERSION_base(4,7,0)
+import Text.Printf (PrintfArg, formatArg, formatString)
+#endif
+
+-- $character_definition
+--
+-- This package uses the term /character/ to denote Unicode /code points/.
+--
+-- Note that this is not the same thing as a grapheme (e.g. a
+-- composition of code points that form one visual symbol). For
+-- instance, consider the grapheme \"&#x00e4;\". This symbol has two
+-- Unicode representations: a single code-point representation
+-- @U+00E4@ (the @LATIN SMALL LETTER A WITH DIAERESIS@ code point),
+-- and a two code point representation @U+0061@ (the \"@A@\" code
+-- point) and @U+0308@ (the @COMBINING DIAERESIS@ code point).
+
+-- $strict
+--
+-- This package provides both strict and lazy 'Text' types. The
+-- strict type is provided by the "Data.Text" module, while the lazy
+-- type is provided by the "Data.Text.Lazy" module. Internally, the
+-- lazy @Text@ type consists of a list of strict chunks.
+--
+-- The strict 'Text' type requires that an entire string fit into
+-- memory at once. The lazy 'Data.Text.Lazy.Text' type is capable of
+-- streaming strings that are larger than memory using a small memory
+-- footprint. In many cases, the overhead of chunked streaming makes
+-- the lazy 'Data.Text.Lazy.Text' type slower than its strict
+-- counterpart, but this is not always the case. Sometimes, the time
+-- complexity of a function in one module may be different from the
+-- other, due to their differing internal structures.
+--
+-- Each module provides an almost identical API, with the main
+-- difference being that the strict module uses 'Int' values for
+-- lengths and counts, while the lazy module uses 'Data.Int.Int64'
+-- lengths.
+
+-- $replacement
+--
+-- A 'Text' value is a sequence of Unicode scalar values, as defined
+-- in
+-- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35 §3.9, definition D76 of the Unicode 5.2 standard >.
+-- As such, a 'Text' cannot contain values in the range U+D800 to
+-- U+DFFF inclusive. Haskell implementations admit all Unicode code
+-- points
+-- (<http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=13 §3.4, definition D10 >)
+-- as 'Char' values, including code points from this invalid range.
+-- This means that there are some 'Char' values that are not valid
+-- Unicode scalar values, and the functions in this module must handle
+-- those cases.
+--
+-- Within this module, many functions construct a 'Text' from one or
+-- more 'Char' values. Those functions will substitute 'Char' values
+-- that are not valid Unicode scalar values with the replacement
+-- character \"&#xfffd;\" (U+FFFD). Functions that perform this
+-- inspection and replacement are documented with the phrase
+-- \"Performs replacement on invalid scalar values\".
+--
-- (One reason for this policy of replacement is that internally, a
-- 'Text' value is represented as packed UTF-8 data. Code points in the
-- range U+D800 through U+DFFF are reserved by Unicode as surrogates
-- for use by UTF-16, and well-formed UTF-8 cannot encode them. The
-- functions replace invalid scalar values, instead of dropping them,
-- as a security measure. For details, see
-- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters Unicode Technical Report 36, §3.5 >.)
+
+-- $fusion
+--
+-- Most of the functions in this module are subject to /fusion/,
+-- meaning that a pipeline of such functions will usually allocate at
+-- most one 'Text' value.
+--
+-- As an example, consider the following pipeline:
+--
+-- > import Data.Text as T
+-- > import Data.Text.Encoding as E
+-- > import Data.ByteString (ByteString)
+-- >
+-- > countChars :: ByteString -> Int
+-- > countChars = T.length . T.toUpper . E.decodeUtf8
+--
+-- From the type signatures involved, this looks like it should
+-- allocate one 'Data.ByteString.ByteString' value, and two 'Text'
+-- values. However, when a module is compiled with optimisation
+-- enabled under GHC, the two intermediate 'Text' values will be
+-- optimised away, and the function will be compiled down to a single
+-- loop over the source 'Data.ByteString.ByteString'.
+--
+-- Functions that can be fused by the compiler are documented with the
+-- phrase \"Subject to fusion\".
+
-- | Equality on the packed representation: two values are equal iff
-- their byte lengths match and the underlying array slices are
-- byte-for-byte identical.
instance Eq Text where
    Text arrA offA lenA == Text arrB offB lenB =
        lenA == lenB && A.equal arrA offA arrB offB lenA
    {-# INLINE (==) #-}
+
-- | Lexicographic ordering; delegates to 'compareText'.
instance Ord Text where
    compare = compareText

-- | Parses via the 'String' 'Read' instance, then 'pack's the result.
instance Read Text where
    readsPrec p str = [(pack x,y) | (x,y) <- readsPrec p str]
+
#if MIN_VERSION_base(4,9,0)
-- | Non-orphan 'Semigroup' instance only defined for
-- @base-4.9.0.0@ and later; orphan instances for older GHCs are
-- provided by
-- the [semigroups](http://hackage.haskell.org/package/semigroups)
-- package
--
-- @since 1.2.2.0
instance Semigroup Text where
    (<>) = append  -- O(n): copies both arguments; see 'append'
#endif
+
-- | 'mempty' is the empty 'Text'; 'mappend' copies both arguments into
-- a fresh 'Text' (see 'append'); 'mconcat' builds the result in a
-- single pass via 'concat'.
instance Monoid Text where
    mempty = empty
#if MIN_VERSION_base(4,9,0)
    mappend = (<>) -- future-proof definition
#else
    mappend = append
#endif
    mconcat = concat
+
-- | Enables 'Text' literals under the @OverloadedStrings@ extension.
instance IsString Text where
    fromString = pack

#if __GLASGOW_HASKELL__ >= 708
-- | Enables list syntax for 'Text' under @OverloadedLists@.
--
-- @since 1.2.0.0
instance Exts.IsList Text where
    type Item Text = Char
    fromList = pack
    toList = unpack
#endif

#if defined(HAVE_DEEPSEQ)
-- | Forcing to WHNF is all that is done here.
-- NOTE(review): this assumes a 'Text' in WHNF holds no thunks (strict
-- fields in "Data.Text.Internal") -- confirm against that module.
instance NFData Text where rnf !_ = ()
#endif
+
-- | Serialized as UTF-8 bytes; decoding failures abort the 'get'
-- parser with the decoder's error message.
--
-- @since 1.2.1.0
instance Binary Text where
    put = put . encodeUtf8
    get = do
        bytes <- get
        P.either (P.fail . P.show) P.return (decodeUtf8' bytes)
+
+-- | This instance preserves data abstraction at the cost of inefficiency.
+-- We omit reflection services for the sake of data abstraction.
+--
+-- This instance was created by copying the updated behavior of
+-- @"Data.Set".@'Data.Set.Set' and @"Data.Map".@'Data.Map.Map'. If you
+-- feel a mistake has been made, please feel free to submit
+-- improvements.
+--
+-- The original discussion is archived here:
+-- <http://groups.google.com/group/haskell-cafe/browse_thread/thread/b5bbb1b28a7e525d/0639d46852575b93 could we get a Data instance for Data.Text.Text? >
+--
+-- The followup discussion that changed the behavior of 'Data.Set.Set'
+-- and 'Data.Map.Map' is archived here:
+-- <http://markmail.org/message/trovdc6zkphyi3cr#query:+page:1+mid:a46der3iacwjcf6n+state:results Proposal: Allow gunfold for Data.Map, ... >
+
-- | Presents 'Text' as if it had a single constructor
-- @pack :: String -> Text@, preserving data abstraction (see the
-- discussion links above).
instance Data Text where
    gfoldl f z txt = z pack `f` (unpack txt)
    toConstr _ = packConstr
    gunfold k z c = case constrIndex c of
        -- constructor index 1 is the synthetic "pack" constructor
        1 -> k (z pack)
        _ -> P.error "gunfold"
    dataTypeOf _ = textDataType
+
#if MIN_VERSION_base(4,7,0)
-- | Only defined for @base-4.7.0.0@ and later
--
-- Formats via the 'String' representation ('unpack'), so all @%s@
-- formatting directives apply.
--
-- @since 1.2.2.0
instance PrintfArg Text where
    formatArg txt = formatString $ unpack txt
#endif
+
-- | The sole (synthetic) constructor reported by the 'Data' instance:
-- 'Text' is presented as if built by @pack :: String -> Text@.
packConstr :: Constr
packConstr = mkConstr textDataType "pack" [] Prefix

-- | 'DataType' descriptor for the 'Data' instance; see 'packConstr'.
textDataType :: DataType
textDataType = mkDataType "Data.Text.Text" [packConstr]
+
-- | /O(n)/ Compare two 'Text' values lexicographically.
--
-- Compares the shared prefix byte-wise, then breaks ties by length.
compareText :: Text -> Text -> Ordering
compareText (Text arrA offA lenA) (Text arrB offB lenB)
    | lenA == 0 || lenB == 0 = compare lenA lenB
    | otherwise              = prefixOrd `mappend` compare lenA lenB
  where
    prefixOrd = A.cmp arrA offA arrB offB (min lenA lenB)
+
+-- -----------------------------------------------------------------------------
+-- * Conversion to/from 'Text'
+
-- | /O(n)/ Convert a 'String' into a 'Text'. Subject to
-- fusion. Performs replacement on invalid scalar values.
pack :: String -> Text
pack = unstream . S.map safe . S.streamList
{-# INLINE [1] pack #-}
-- Inlined late ([1]) so rewrite rules mentioning 'pack' get a chance
-- to fire in earlier simplifier phases.
+
+-- -----------------------------------------------------------------------------
+-- * Basic functions
+
-- | /O(n)/ Adds a character to the front of a 'Text'. This function
-- is more costly than its 'List' counterpart because it requires
-- copying a new array. Subject to fusion. Performs replacement on
-- invalid scalar values.
cons :: Char -> Text -> Text
cons c t = unstream (S.cons (safe c) (stream t))
{-# INLINE cons #-}

-- Same fixity as list (:): right-associative, precedence 5.
infixr 5 `cons`
+
-- | /O(n)/ Adds a character to the end of a 'Text'. This copies the
-- entire array in the process, unless fused. Subject to fusion.
-- Performs replacement on invalid scalar values.
snoc :: Text -> Char -> Text
snoc t c = unstream (S.snoc (stream t) (safe c))
{-# INLINE snoc #-}
+
-- | /O(n)/ Appends one 'Text' to the other by copying both of them
-- into a new 'Text'. Subject to fusion.
append :: Text -> Text -> Text
append a@(Text arr1 off1 len1) b@(Text arr2 off2 len2)
    | len1 == 0 = b
    | len2 == 0 = a
    | len > 0 = Text (A.run x) 0 len
    -- len1+len2 wrapped around; both inputs were non-empty.
    | otherwise = overflowError "append"
  where
    len = len1+len2
    x :: ST s (A.MArray s)
    x = do
      arr <- A.new len
      A.copyI arr 0 arr1 off1 len1
      -- NOTE(review): copyI's final argument appears to be the first
      -- destination index /not/ written (here the total length), not a
      -- count -- inferred from usage; confirm against Data.Text.Array.
      A.copyI arr len1 arr2 off2 len
      return arr
{-# NOINLINE append #-}
-- NOINLINE so the rules below can reliably match calls to 'append'.

{-# RULES
"TEXT append -> fused" [~1] forall t1 t2.
    append t1 t2 = unstream (S.append (stream t1) (stream t2))
"TEXT append -> unfused" [1] forall t1 t2.
    unstream (S.append (stream t1) (stream t2)) = append t1 t2
 #-}
+
-- | /O(1)/ Returns the first character of a 'Text', which must be
-- non-empty. Subject to fusion.
--
-- NOTE(review): partial -- presumably errors on empty input inside
-- 'S.head'; confirm against Data.Text.Internal.Fusion.Common.
head :: Text -> Char
head t = S.head (stream t)
{-# INLINE head #-}
+
-- | /O(1)/ Returns the first character and rest of a 'Text', or
-- 'Nothing' if empty. Subject to fusion.
uncons :: Text -> Maybe (Char, Text)
uncons t@(Text arr off len)
    | len <= 0 = Nothing
    -- Iter c d: c is the decoded first character, d its encoded width,
    -- so the remainder starts d units further into the array.
    | otherwise = Just $ let !(Iter c d) = iter t 0
                         in (c, text arr (off+d) (len-d))
{-# INLINE [1] uncons #-}
+
-- | Lifted from Control.Arrow and specialized: apply a function to the
-- second component of a pair, leaving the first untouched.
second :: (b -> c) -> (a,b) -> (a,c)
second g (x, y) = (x, g y)
+
-- | /O(1)/ Returns the last character of a 'Text', which must be
-- non-empty. Subject to fusion.
last :: Text -> Char
last (Text arr off len)
    | len <= 0 = emptyError "last"
    -- Decode backwards from the final unit; the continuation receives
    -- the decoded character (its encoded size is ignored here).
    | otherwise = U8.reverseDecodeCharIndex (\c _ -> c) idx (off + len - 1)
  where
    idx = A.unsafeIndex arr
{-# INLINE [1] last #-}

{-# RULES
"TEXT last -> fused" [~1] forall t.
    last t = S.last (stream t)
"TEXT last -> unfused" [1] forall t.
    S.last (stream t) = last t
 #-}
+
-- | /O(1)/ Returns all characters after the head of a 'Text', which
-- must be non-empty. Subject to fusion.
tail :: Text -> Text
tail t@(Text arr off len)
    | len <= 0 = emptyError "tail"
    | otherwise = text arr (off+d) (len-d)
  where d = iter_ t 0  -- encoded width of the first character
{-# INLINE [1] tail #-}

{-# RULES
"TEXT tail -> fused" [~1] forall t.
    tail t = unstream (S.tail (stream t))
"TEXT tail -> unfused" [1] forall t.
    unstream (S.tail (stream t)) = tail t
 #-}
+
-- | /O(1)/ Returns all but the last character of a 'Text', which must
-- be non-empty. Subject to fusion.
init :: Text -> Text
init t@(Text arr off len)
    | len <= 0 = emptyError "init"
    -- Decode the final character backwards; the continuation receives
    -- its encoded size s, and we keep the first (len - s) units.
    | otherwise = U8.reverseDecodeCharIndex
                  (\_ s -> takeWord8 (len - s) t) idx (off + len - 1)
  where
    idx = A.unsafeIndex arr
{-# INLINE [1] init #-}

{-# RULES
"TEXT init -> fused" [~1] forall t.
    init t = unstream (S.init (stream t))
"TEXT init -> unfused" [1] forall t.
    unstream (S.init (stream t)) = init t
 #-}
+
-- | /O(1)/ Returns all but the last character and the last character of a
-- 'Text', or 'Nothing' if empty.
--
-- Decodes the final character only once and shares the result between
-- both components, instead of decoding it separately in 'init' and
-- 'last' (resolves the previous TODO).
--
-- @since 1.2.3.0
unsnoc :: Text -> Maybe (Text, Char)
unsnoc t@(Text arr off len)
    | len <= 0 = Nothing
    -- Same backwards-decode pattern as 'init'/'last': the continuation
    -- receives the decoded character c and its encoded size s; keep the
    -- first (len - s) units as the prefix.
    | otherwise = U8.reverseDecodeCharIndex
                  (\c s -> Just (takeWord8 (len - s) t, c))
                  idx (off + len - 1)
  where
    idx = A.unsafeIndex arr
{-# INLINE [1] unsnoc #-}
+
-- | /O(1)/ Tests whether a 'Text' is empty or not. Subject to
-- fusion.
null :: Text -> Bool
null (Text _arr _off len) =
#if defined(ASSERTS)
    -- A negative length is a representation invariant violation.
    assert (len >= 0) $
#endif
    len <= 0
{-# INLINE [1] null #-}

{-# RULES
"TEXT null -> fused" [~1] forall t.
    null t = S.null (stream t)
"TEXT null -> unfused" [1] forall t.
    S.null (stream t) = null t
 #-}
+
-- | /O(1)/ Tests whether a 'Text' contains exactly one character.
-- Subject to fusion.
isSingleton :: Text -> Bool
isSingleton t = S.isSingleton (stream t)
{-# INLINE isSingleton #-}
+
-- | /O(n)/ Returns the number of characters in a 'Text'.
-- Subject to fusion.
length :: Text -> Int
length t = S.length (stream t)
{-# INLINE [0] length #-}
-- length needs to be phased after the compareN/length rules otherwise
-- it may inline before the rules have an opportunity to fire.
+
-- | /O(n)/ Compare the count of characters in a 'Text' to a number.
-- Subject to fusion.
--
-- This function gives the same answer as comparing against the result
-- of 'length', but can short circuit if the count of characters is
-- greater than the number, and hence be more efficient.
compareLength :: Text -> Int -> Ordering
compareLength t n = S.compareLengthI (stream t) n
{-# INLINE [1] compareLength #-}

-- The rules below rewrite every comparison of @length t@ against a
-- number into the short-circuiting 'compareLength', covering each of
-- the six Int comparison primitives.

{-# RULES
"TEXT compareN/length -> compareLength" [~1] forall t n.
    compare (length t) n = compareLength t n
 #-}

{-# RULES
"TEXT ==N/length -> compareLength/==EQ" [~1] forall t n.
    eqInt (length t) n = compareLength t n == EQ
 #-}

{-# RULES
"TEXT /=N/length -> compareLength//=EQ" [~1] forall t n.
    neInt (length t) n = compareLength t n /= EQ
 #-}

{-# RULES
"TEXT <N/length -> compareLength/==LT" [~1] forall t n.
    ltInt (length t) n = compareLength t n == LT
 #-}

{-# RULES
"TEXT <=N/length -> compareLength//=GT" [~1] forall t n.
    leInt (length t) n = compareLength t n /= GT
 #-}

{-# RULES
"TEXT >N/length -> compareLength/==GT" [~1] forall t n.
    gtInt (length t) n = compareLength t n == GT
 #-}

{-# RULES
"TEXT >=N/length -> compareLength//=LT" [~1] forall t n.
    geInt (length t) n = compareLength t n /= LT
 #-}
+
+-- -----------------------------------------------------------------------------
+-- * Transformations
-- | /O(n)/ 'map' @f@ @t@ is the 'Text' obtained by applying @f@ to
-- each element of @t@.
--
-- Example:
--
-- >>> let message = pack "I am not angry. Not at all."
-- >>> T.map (\c -> if c == '.' then '!' else c) message
-- "I am not angry! Not at all!"
--
-- Subject to fusion. Performs replacement on invalid scalar values.
map :: (Char -> Char) -> Text -> Text
map f t = unstream (S.map (safe . f) (stream t))
{-# INLINE [1] map #-}
+
-- | /O(n)/ The 'intercalate' function takes a 'Text' and a list of
-- 'Text's and concatenates the list after interspersing the first
-- argument between each element of the list.
--
-- Example:
--
-- >>> T.intercalate "NI!" ["We", "seek", "the", "Holy", "Grail"]
-- "WeNI!seekNI!theNI!HolyNI!Grail"
intercalate :: Text -> [Text] -> Text
intercalate sep chunks = concat (F.intersperse sep chunks)
{-# INLINE intercalate #-}
+
-- | /O(n)/ The 'intersperse' function takes a character and places it
-- between the characters of a 'Text'.
--
-- Example:
--
-- >>> T.intersperse '.' "SHIELD"
-- "S.H.I.E.L.D"
--
-- Subject to fusion. Performs replacement on invalid scalar values.
intersperse :: Char -> Text -> Text
intersperse c t = unstream (S.intersperse (safe c) (stream t))
{-# INLINE intersperse #-}
+
-- | /O(n)/ Reverse the characters of a string.
--
-- Example:
--
-- >>> T.reverse "desrever"
-- "reversed"
--
-- Subject to fusion.
reverse :: Text -> Text
reverse t = S.reverse (stream t)
{-# INLINE reverse #-}
+
-- | /O(m+n)/ Replace every non-overlapping occurrence of @needle@ in
-- @haystack@ with @replacement@.
--
-- This function behaves as though it was defined as follows:
--
-- @
-- replace needle replacement haystack =
--   'intercalate' replacement ('splitOn' needle haystack)
-- @
--
-- As this suggests, each occurrence is replaced exactly once. So if
-- @needle@ occurs in @replacement@, that occurrence will /not/ itself
-- be replaced recursively:
--
-- >>> replace "oo" "foo" "oo"
-- "foo"
--
-- In cases where several instances of @needle@ overlap, only the
-- first one will be replaced:
--
-- >>> replace "ofo" "bar" "ofofo"
-- "barfo"
--
-- In (unlikely) bad cases, this function's time complexity degrades
-- towards /O(n*m)/.
replace :: Text
        -- ^ @needle@ to search for. If this string is empty, an
        -- error will occur.
        -> Text
        -- ^ @replacement@ to replace @needle@ with.
        -> Text
        -- ^ @haystack@ in which to search.
        -> Text
replace needle@(Text _ _ neeLen)
        (Text repArr repOff repLen)
        haystack@(Text hayArr hayOff hayLen)
    | neeLen == 0 = emptyError "replace"
    | L.null ixs = haystack
    | len > 0 = Text (A.run x) 0 len
    | otherwise = empty
  where
    ixs = indices needle haystack
    -- Result length; NOTE(review): 'mul' is presumably an
    -- overflow-checked multiply (imported from Data.Text.Internal).
    len = hayLen - (neeLen - repLen) `mul` L.length ixs
    x :: ST s (A.MArray s)
    x = do
      marr <- A.new len
      -- Walk the match offsets, alternately copying the haystack run
      -- before the next match and then the replacement.
      --   i : haystack index of the next needle occurrence
      --   o : haystack index of the first unit not yet copied
      --   d : destination index of the next unit to write
      -- (copyI's last argument is a destination end index, not a count.)
      let loop (i:is) o d = do
            let d0 = d + i - o   -- destination end of the haystack run
                d1 = d0 + repLen -- destination end of the replacement
            A.copyI marr d hayArr (hayOff+o) d0
            A.copyI marr d0 repArr repOff d1
            loop is (i + neeLen) d1
          loop [] o d = A.copyI marr d hayArr (hayOff+o) len
      loop ixs 0 0
      return marr
+
+-- ----------------------------------------------------------------------------
+-- ** Case conversions (folds)
+
+-- $case
+--
+-- When case converting 'Text' values, do not use combinators like
+-- @map toUpper@ to case convert each character of a string
+-- individually, as this gives incorrect results according to the
+-- rules of some writing systems. The whole-string case conversion
+-- functions from this module, such as @toUpper@, obey the correct
+-- case conversion rules. As a result, these functions may map one
+-- input character to two or three output characters. For examples,
+-- see the documentation of each function.
+--
+-- /Note/: In some languages, case conversion is a locale- and
+-- context-dependent operation. The case conversion functions in this
+-- module are /not/ locale sensitive. Programs that require locale
+-- sensitivity should use appropriate versions of the
+-- <http://hackage.haskell.org/package/text-icu-0.6.3.7/docs/Data-Text-ICU.html#g:4 case mapping functions from the text-icu package >.
+
-- | /O(n)/ Convert a string to folded case. Subject to fusion.
--
-- This function is mainly useful for performing caseless (also known
-- as case insensitive) string comparisons.
--
-- A string @x@ is a caseless match for a string @y@ if and only if:
--
-- @toCaseFold x == toCaseFold y@
--
-- The result string may be longer than the input string, and may
-- differ from applying 'toLower' to the input string. For instance,
-- the Armenian small ligature \"&#xfb13;\" (men now, U+FB13) is case
-- folded to the sequence \"&#x574;\" (men, U+0574) followed by
-- \"&#x576;\" (now, U+0576), while the Greek \"&#xb5;\" (micro sign,
-- U+00B5) is case folded to \"&#x3bc;\" (small letter mu, U+03BC)
-- instead of itself.
toCaseFold :: Text -> Text
toCaseFold t = unstream (S.toCaseFold (stream t))
{-# INLINE toCaseFold #-}

-- | /O(n)/ Convert a string to lower case, using simple case
-- conversion. Subject to fusion.
--
-- The result string may be longer than the input string. For
-- instance, \"&#x130;\" (Latin capital letter I with dot above,
-- U+0130) maps to the sequence \"i\" (Latin small letter i, U+0069)
-- followed by \" &#x307;\" (combining dot above, U+0307).
toLower :: Text -> Text
toLower t = unstream (S.toLower (stream t))
{-# INLINE toLower #-}

-- | /O(n)/ Convert a string to upper case, using simple case
-- conversion. Subject to fusion.
--
-- The result string may be longer than the input string. For
-- instance, the German \"&#xdf;\" (eszett, U+00DF) maps to the
-- two-letter sequence \"SS\".
toUpper :: Text -> Text
toUpper t = unstream (S.toUpper (stream t))
{-# INLINE toUpper #-}

-- | /O(n)/ Convert a string to title case, using simple case
-- conversion. Subject to fusion.
--
-- The first letter of the input is converted to title case, as is
-- every subsequent letter that immediately follows a non-letter.
-- Every letter that immediately follows another letter is converted
-- to lower case.
--
-- The result string may be longer than the input string. For example,
-- the Latin small ligature &#xfb02; (U+FB02) is converted to the
-- sequence Latin capital letter F (U+0046) followed by Latin small
-- letter l (U+006C).
--
-- /Note/: this function does not take language or culture specific
-- rules into account. For instance, in English, different style
-- guides disagree on whether the book name \"The Hill of the Red
-- Fox\" is correctly title cased&#x2014;but this function will
-- capitalize /every/ word.
--
-- @since 1.0.0.0
toTitle :: Text -> Text
toTitle t = unstream (S.toTitle (stream t))
{-# INLINE toTitle #-}
+
-- | /O(n)/ Left-justify a string to the given length, using the
-- specified fill character on the right. Subject to fusion.
-- Performs replacement on invalid scalar values.
--
-- Examples:
--
-- >>> justifyLeft 7 'x' "foo"
-- "fooxxxx"
--
-- >>> justifyLeft 3 'x' "foobar"
-- "foobar"
justifyLeft :: Int -> Char -> Text -> Text
justifyLeft k c t
    | len >= k = t
    | otherwise = t `append` replicateChar (k-len) c
  where len = length t
{-# INLINE [1] justifyLeft #-}

{-# RULES
"TEXT justifyLeft -> fused" [~1] forall k c t.
    justifyLeft k c t = unstream (S.justifyLeftI k c (stream t))
"TEXT justifyLeft -> unfused" [1] forall k c t.
    unstream (S.justifyLeftI k c (stream t)) = justifyLeft k c t
 #-}
+
-- | /O(n)/ Right-justify a string to the given length, using the
-- specified fill character on the left. Performs replacement on
-- invalid scalar values.
--
-- Examples:
--
-- >>> justifyRight 7 'x' "bar"
-- "xxxxbar"
--
-- >>> justifyRight 3 'x' "foobar"
-- "foobar"
justifyRight :: Int -> Char -> Text -> Text
justifyRight k c t
    | pad <= 0  = t
    | otherwise = replicateChar pad c `append` t
  where pad = k - length t
{-# INLINE justifyRight #-}
+
-- | /O(n)/ Center a string to the given length, using the specified
-- fill character on either side. Performs replacement on invalid
-- scalar values.
--
-- When the padding is odd, the extra fill character goes on the left.
--
-- Examples:
--
-- >>> center 8 'x' "HS"
-- "xxxHSxxx"
center :: Int -> Char -> Text -> Text
center k c t
    | missing <= 0 = t
    | otherwise    =
        replicateChar left c `append` t `append` replicateChar right c
  where
    missing = k - length t
    right   = missing `quot` 2
    left    = missing - right
{-# INLINE center #-}
+
-- | /O(n)/ The 'transpose' function transposes the rows and columns
-- of its 'Text' argument. Note that this function uses 'pack',
-- 'unpack', and the list version of transpose, and is thus not very
-- efficient.
--
-- Examples:
--
-- >>> transpose ["green","orange"]
-- ["go","rr","ea","en","ng","e"]
--
-- >>> transpose ["blue","red"]
-- ["br","le","ud","e"]
transpose :: [Text] -> [Text]
transpose = P.map pack . L.transpose . P.map unpack
+
+-- -----------------------------------------------------------------------------
+-- * Reducing 'Text's (folds)
+
+-- | /O(n)/ 'foldl', applied to a binary operator, a starting value
+-- (typically the left-identity of the operator), and a 'Text',
+-- reduces the 'Text' using the binary operator, from left to right.
+-- Subject to fusion.
+foldl :: (a -> Char -> a) -> a -> Text -> a
+foldl f z t = S.foldl f z (stream t)
+{-# INLINE foldl #-}
+
+-- | /O(n)/ A strict version of 'foldl'. Subject to fusion.
+foldl' :: (a -> Char -> a) -> a -> Text -> a
+foldl' f z t = S.foldl' f z (stream t)
+{-# INLINE foldl' #-}
+
+-- | /O(n)/ A variant of 'foldl' that has no starting value argument,
+-- and thus must be applied to a non-empty 'Text'. Subject to fusion.
+foldl1 :: (Char -> Char -> Char) -> Text -> Char
+-- Partial: delegates to 'S.foldl1', which errors on an empty stream,
+-- per the non-empty contract documented above.
+foldl1 f t = S.foldl1 f (stream t)
+{-# INLINE foldl1 #-}
+
+-- | /O(n)/ A strict version of 'foldl1'. Subject to fusion.
+foldl1' :: (Char -> Char -> Char) -> Text -> Char
+foldl1' f t = S.foldl1' f (stream t)
+{-# INLINE foldl1' #-}
+
+-- | /O(n)/ 'foldr', applied to a binary operator, a starting value
+-- (typically the right-identity of the operator), and a 'Text',
+-- reduces the 'Text' using the binary operator, from right to left.
+-- Subject to fusion.
+foldr :: (Char -> a -> a) -> a -> Text -> a
+foldr f z t = S.foldr f z (stream t)
+{-# INLINE foldr #-}
+
+-- | /O(n)/ A variant of 'foldr' that has no starting value argument,
+-- and thus must be applied to a non-empty 'Text'. Subject to
+-- fusion.
+foldr1 :: (Char -> Char -> Char) -> Text -> Char
+-- Partial: errors on an empty input, like 'foldl1'.
+foldr1 f t = S.foldr1 f (stream t)
+{-# INLINE foldr1 #-}
+
+-- -----------------------------------------------------------------------------
+-- ** Special folds
+
+-- | /O(n)/ Concatenate a list of 'Text's.
+concat :: [Text] -> Text
+concat ts = case ts' of
+              []  -> empty
+              [t] -> t
+              _   -> Text (A.run go) 0 len
+  where
+    -- Drop empty chunks up front; this also lets the singleton case
+    -- above reuse the sole non-empty input without copying.
+    ts' = L.filter (not . null) ts
+    -- Fix: the destination array is allocated and addressed in Word16
+    -- code units (see 'A.new'/'A.copyI'), so the total size must be
+    -- summed with 'lengthWord16', not 'lengthWord8'.  'sumP' raises
+    -- an error on Int overflow rather than silently wrapping.
+    len = sumP "concat" $ L.map lengthWord16 ts'
+    go :: ST s (A.MArray s)
+    go = do
+      arr <- A.new len
+      -- Copy each chunk directly after the previous one; the
+      -- accumulator is the next free destination offset.
+      let step i (Text a o l) =
+            let !j = i + l in A.copyI arr i a o j >> return j
+      foldM step 0 ts' >> return arr
+
+-- | /O(n)/ Map a function over a 'Text' that results in a 'Text', and
+-- concatenate the results.
+concatMap :: (Char -> Text) -> Text -> Text
+-- Built on 'foldr' so that the intermediate list of chunks can fuse
+-- away; 'concat' then performs a single allocation.
+concatMap f = concat . foldr ((:) . f) []
+{-# INLINE concatMap #-}
+
+-- | /O(n)/ 'any' @p@ @t@ determines whether any character in the
+-- 'Text' @t@ satisfies the predicate @p@. Subject to fusion.
+any :: (Char -> Bool) -> Text -> Bool
+any p t = S.any p (stream t)
+{-# INLINE any #-}
+
+-- | /O(n)/ 'all' @p@ @t@ determines whether all characters in the
+-- 'Text' @t@ satisfy the predicate @p@. Subject to fusion.
+all :: (Char -> Bool) -> Text -> Bool
+all p t = S.all p (stream t)
+{-# INLINE all #-}
+
+-- | /O(n)/ 'maximum' returns the maximum value from a 'Text', which
+-- must be non-empty. Subject to fusion.
+maximum :: Text -> Char
+-- Partial: errors on the empty 'Text', per the contract above.
+maximum t = S.maximum (stream t)
+{-# INLINE maximum #-}
+
+-- | /O(n)/ 'minimum' returns the minimum value from a 'Text', which
+-- must be non-empty. Subject to fusion.
+minimum :: Text -> Char
+-- Partial: errors on the empty 'Text', per the contract above.
+minimum t = S.minimum (stream t)
+{-# INLINE minimum #-}
+
+-- -----------------------------------------------------------------------------
+-- * Building 'Text's
+
+-- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of
+-- successive reduced values from the left. Subject to fusion.
+-- Performs replacement on invalid scalar values.
+--
+-- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
+--
+-- Note that
+--
+-- > last (scanl f z xs) == foldl f z xs.
+scanl :: (Char -> Char -> Char) -> Char -> Text -> Text
+-- 'safe' sanitises whatever @f@ produces, so user functions cannot
+-- smuggle invalid scalar values (e.g. surrogates) into the result.
+scanl f z t = unstream (S.scanl g z (stream t))
+ where g a b = safe (f a b)
+{-# INLINE scanl #-}
+
+-- | /O(n)/ 'scanl1' is a variant of 'scanl' that has no starting
+-- value argument. Subject to fusion. Performs replacement on
+-- invalid scalar values.
+--
+-- > scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...]
+scanl1 :: (Char -> Char -> Char) -> Text -> Text
+scanl1 f t | null t = empty
+ | otherwise = scanl f (unsafeHead t) (unsafeTail t)
+{-# INLINE scanl1 #-}
+
+-- | /O(n)/ 'scanr' is the right-to-left dual of 'scanl'. Performs
+-- replacement on invalid scalar values.
+--
+-- > scanr f v == reverse . scanl (flip f) v . reverse
+scanr :: (Char -> Char -> Char) -> Char -> Text -> Text
+-- Implemented by scanning the reversed stream and reversing back,
+-- avoiding any intermediate 'Text'.
+scanr f z = S.reverse . S.reverseScanr g z . reverseStream
+ where g a b = safe (f a b)
+{-# INLINE scanr #-}
+
+-- | /O(n)/ 'scanr1' is a variant of 'scanr' that has no starting
+-- value argument. Subject to fusion. Performs replacement on
+-- invalid scalar values.
+scanr1 :: (Char -> Char -> Char) -> Text -> Text
+scanr1 f t | null t = empty
+ | otherwise = scanr f (last t) (init t)
+{-# INLINE scanr1 #-}
+
+-- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a
+-- function to each element of a 'Text', passing an accumulating
+-- parameter from left to right, and returns a final 'Text'. Performs
+-- replacement on invalid scalar values.
+mapAccumL :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
+-- 'second safe' sanitises only the produced 'Char', leaving the
+-- accumulator untouched.
+mapAccumL f z0 = S.mapAccumL g z0 . stream
+ where g a b = second safe (f a b)
+{-# INLINE mapAccumL #-}
+
+-- | The 'mapAccumR' function behaves like a combination of 'map' and
+-- a strict 'foldr'; it applies a function to each element of a
+-- 'Text', passing an accumulating parameter from right to left, and
+-- returning a final value of this accumulator together with the new
+-- 'Text'.
+-- Performs replacement on invalid scalar values.
+mapAccumR :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
+-- Runs the left-to-right accumulator over the reversed stream, then
+-- reverses the produced 'Text' back into order.
+mapAccumR f z0 = second reverse . S.mapAccumL g z0 . reverseStream
+ where g a b = second safe (f a b)
+{-# INLINE mapAccumR #-}
+
+-- -----------------------------------------------------------------------------
+-- ** Generating and unfolding 'Text's
+
+-- | /O(n*m)/ 'replicate' @n@ @t@ is a 'Text' consisting of the input
+-- @t@ repeated @n@ times.
+replicate :: Int -> Text -> Text
+replicate n t@(Text a o l)
+ | n <= 0 || l <= 0 = empty
+ | n == 1 = t
+ | isSingleton t = replicateChar n (unsafeHead t)
+ | otherwise = Text (A.run x) 0 len
+ where
+ -- NOTE(review): `mul` appears to be the overflow-checked multiply
+ -- from Data.Text.Internal, guarding the allocation size — confirm.
+ len = l `mul` n
+ x :: ST s (A.MArray s)
+ x = do
+ arr <- A.new len
+ -- Copy the source once per repetition; @d@ is the destination
+ -- offset, @i@ the repetition count.
+ let loop !d !i | i >= n = return arr
+ | otherwise = let m = d + l
+ in A.copyI arr d a o m >> loop m (i+1)
+ loop 0 0
+{-# INLINE [1] replicate #-}
+
+{-# RULES
+"TEXT replicate/singleton -> replicateChar" [~1] forall n c.
+ replicate n (singleton c) = replicateChar n c
+ #-}
+
+-- | /O(n)/ 'replicateChar' @n@ @c@ is a 'Text' of length @n@ with @c@ the
+-- value of every element. Subject to fusion.
+replicateChar :: Int -> Char -> Text
+replicateChar n c = unstream (S.replicateCharI n (safe c))
+{-# INLINE replicateChar #-}
+
+-- | /O(n)/, where @n@ is the length of the result. The 'unfoldr'
+-- function is analogous to the List 'L.unfoldr'. 'unfoldr' builds a
+-- 'Text' from a seed value. The function takes the element and
+-- returns 'Nothing' if it is done producing the 'Text', otherwise
+-- 'Just' @(a,b)@. In this case, @a@ is the next 'Char' in the
+-- string, and @b@ is the seed value for further production. Subject
+-- to fusion. Performs replacement on invalid scalar values.
+unfoldr :: (a -> Maybe (Char,a)) -> a -> Text
+-- 'firstf safe' sanitises the produced 'Char' without touching the seed.
+unfoldr f s = unstream (S.unfoldr (firstf safe . f) s)
+{-# INLINE unfoldr #-}
+
+-- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a 'Text' from a seed
+-- value. However, the length of the result should be limited by the
+-- first argument to 'unfoldrN'. This function is more efficient than
+-- 'unfoldr' when the maximum length of the result is known and
+-- correct, otherwise its performance is similar to 'unfoldr'. Subject
+-- to fusion. Performs replacement on invalid scalar values.
+unfoldrN :: Int -> (a -> Maybe (Char,a)) -> a -> Text
+unfoldrN n f s = unstream (S.unfoldrN n (firstf safe . f) s)
+{-# INLINE unfoldrN #-}
+
+-- -----------------------------------------------------------------------------
+-- * Substrings
+
+-- | /O(n)/ 'take' @n@, applied to a 'Text', returns the prefix of the
+-- 'Text' of length @n@, or the 'Text' itself if @n@ is greater than
+-- the length of the Text. Subject to fusion.
+take :: Int -> Text -> Text
+take n t@(Text arr off len)
+ | n <= 0 = empty
+ | n >= len = t
+ | otherwise = text arr off (iterN n t)
+{-# INLINE [1] take #-}
+
+-- | Return the array (code-unit) offset just past the first @n@
+-- characters of @t@, or @len@ if @t@ has fewer than @n@ characters.
+-- @i@ advances in code units, @cnt@ counts characters.
+iterN :: Int -> Text -> Int
+iterN n t@(Text _arr _off len) = loop 0 0
+ where loop !i !cnt
+ | i >= len || cnt >= n = i
+ | otherwise = loop (i+d) (cnt+1)
+ where d = iter_ t i
+
+{-# RULES
+"TEXT take -> fused" [~1] forall n t.
+ take n t = unstream (S.take n (stream t))
+"TEXT take -> unfused" [1] forall n t.
+ unstream (S.take n (stream t)) = take n t
+ #-}
+
+-- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix remaining after
+-- taking @n@ characters from the end of @t@.
+--
+-- Examples:
+--
+-- >>> takeEnd 3 "foobar"
+-- "bar"
+--
+-- @since 1.1.1.0
+takeEnd :: Int -> Text -> Text
+takeEnd n t@(Text arr off len)
+ | n <= 0 = empty
+ | n >= len = t
+ | otherwise = text arr (off+i) (len-i)
+ where i = iterNEnd n t
+
+-- | Return the array (code-unit) offset of the start of the last @n@
+-- characters of @t@ (0 if @t@ has at most @n@ characters).  Walks
+-- backwards one character at a time via 'reverseIter_'.
+iterNEnd :: Int -> Text -> Int
+iterNEnd n t@(Text _arr _off len) = loop (len-1) n
+ where loop i !m
+ | m <= 0 = i+1
+ | i <= 0 = 0
+ | otherwise = loop (i+d) (m-1)
+ where d = reverseIter_ t i
+
+-- | /O(n)/ 'drop' @n@, applied to a 'Text', returns the suffix of the
+-- 'Text' after the first @n@ characters, or the empty 'Text' if @n@
+-- is greater than the length of the 'Text'. Subject to fusion.
+drop :: Int -> Text -> Text
+-- The result is a slice of the original array: only the offset and
+-- length change, no characters are copied.
+drop n t@(Text arr off len)
+ | n <= 0 = t
+ | n >= len = empty
+ | otherwise = text arr (off+i) (len-i)
+ where i = iterN n t
+{-# INLINE [1] drop #-}
+
+{-# RULES
+"TEXT drop -> fused" [~1] forall n t.
+ drop n t = unstream (S.drop n (stream t))
+"TEXT drop -> unfused" [1] forall n t.
+ unstream (S.drop n (stream t)) = drop n t
+ #-}
+
+-- | /O(n)/ 'dropEnd' @n@ @t@ returns the prefix remaining after
+-- dropping @n@ characters from the end of @t@.
+--
+-- Examples:
+--
+-- >>> dropEnd 3 "foobar"
+-- "foo"
+--
+-- @since 1.1.1.0
+dropEnd :: Int -> Text -> Text
+dropEnd n t@(Text arr off len)
+ | n <= 0 = t
+ | n >= len = empty
+ | otherwise = text arr off (iterNEnd n t)
+
+-- | /O(n)/ 'takeWhile', applied to a predicate @p@ and a 'Text',
+-- returns the longest prefix (possibly empty) of elements that
+-- satisfy @p@. Subject to fusion.
+takeWhile :: (Char -> Bool) -> Text -> Text
+-- @i@ is the code-unit offset of the first character failing @p@;
+-- the result is a zero-copy slice up to that offset.
+takeWhile p t@(Text arr off len) = loop 0
+ where loop !i | i >= len = t
+ | p c = loop (i+d)
+ | otherwise = text arr off i
+ where Iter c d = iter t i
+{-# INLINE [1] takeWhile #-}
+
+{-# RULES
+"TEXT takeWhile -> fused" [~1] forall p t.
+ takeWhile p t = unstream (S.takeWhile p (stream t))
+"TEXT takeWhile -> unfused" [1] forall p t.
+ unstream (S.takeWhile p (stream t)) = takeWhile p t
+ #-}
+
+-- | /O(n)/ 'takeWhileEnd', applied to a predicate @p@ and a 'Text',
+-- returns the longest suffix (possibly empty) of elements that
+-- satisfy @p@. Subject to fusion.
+--
+-- Examples:
+--
+-- >>> takeWhileEnd (=='o') "foo"
+-- "oo"
+--
+-- @since 1.2.2.0
+takeWhileEnd :: (Char -> Bool) -> Text -> Text
+-- Walks backwards from the last character; @i@ is the current
+-- code-unit index and @l@ the code-unit offset of the suffix start.
+takeWhileEnd p t@(Text arr off len) = loop (len-1) len
+ where loop !i !l | l <= 0 = t
+ | p c = loop (i+d) (l+d)
+ | otherwise = text arr (off+l) (len-l)
+ where (c,d) = reverseIter t i
+{-# INLINE [1] takeWhileEnd #-}
+
+{-# RULES
+"TEXT takeWhileEnd -> fused" [~1] forall p t.
+ takeWhileEnd p t = S.reverse (S.takeWhile p (S.reverseStream t))
+"TEXT takeWhileEnd -> unfused" [1] forall p t.
+ S.reverse (S.takeWhile p (S.reverseStream t)) = takeWhileEnd p t
+ #-}
+
+-- | /O(n)/ 'dropWhile' @p@ @t@ returns the suffix remaining after
+-- 'takeWhile' @p@ @t@. Subject to fusion.
+dropWhile :: (Char -> Bool) -> Text -> Text
+-- Zero-copy: the result is a slice starting at the first character
+-- that fails @p@.
+dropWhile p t@(Text arr off len) = loop 0 0
+ where loop !i !l | l >= len = empty
+ | p c = loop (i+d) (l+d)
+ | otherwise = Text arr (off+i) (len-l)
+ where Iter c d = iter t i
+{-# INLINE [1] dropWhile #-}
+
+{-# RULES
+"TEXT dropWhile -> fused" [~1] forall p t.
+ dropWhile p t = unstream (S.dropWhile p (stream t))
+"TEXT dropWhile -> unfused" [1] forall p t.
+ unstream (S.dropWhile p (stream t)) = dropWhile p t
+ #-}
+
+-- | /O(n)/ 'dropWhileEnd' @p@ @t@ returns the prefix remaining after
+-- dropping characters that satisfy the predicate @p@ from the end of
+-- @t@. Subject to fusion.
+--
+-- Examples:
+--
+-- >>> dropWhileEnd (=='.') "foo..."
+-- "foo"
+dropWhileEnd :: (Char -> Bool) -> Text -> Text
+-- Mirror image of 'dropWhile': scans backwards with 'reverseIter'
+-- and keeps the prefix of length @l@ as a zero-copy slice.
+dropWhileEnd p t@(Text arr off len) = loop (len-1) len
+ where loop !i !l | l <= 0 = empty
+ | p c = loop (i+d) (l+d)
+ | otherwise = Text arr off l
+ where (c,d) = reverseIter t i
+{-# INLINE [1] dropWhileEnd #-}
+
+{-# RULES
+"TEXT dropWhileEnd -> fused" [~1] forall p t.
+ dropWhileEnd p t = S.reverse (S.dropWhile p (S.reverseStream t))
+"TEXT dropWhileEnd -> unfused" [1] forall p t.
+ S.reverse (S.dropWhile p (S.reverseStream t)) = dropWhileEnd p t
+ #-}
+
+-- | /O(n)/ 'dropAround' @p@ @t@ returns the substring remaining after
+-- dropping characters that satisfy the predicate @p@ from both the
+-- beginning and end of @t@. Subject to fusion.
+dropAround :: (Char -> Bool) -> Text -> Text
+-- Trailing side first, then leading: both are zero-copy slices.
+dropAround p = dropWhile p . dropWhileEnd p
+{-# INLINE [1] dropAround #-}
+
+-- | /O(n)/ Remove leading white space from a string. Equivalent to:
+--
+-- > dropWhile isSpace
+stripStart :: Text -> Text
+stripStart = dropWhile isSpace
+{-# INLINE [1] stripStart #-}
+
+-- | /O(n)/ Remove trailing white space from a string. Equivalent to:
+--
+-- > dropWhileEnd isSpace
+stripEnd :: Text -> Text
+stripEnd = dropWhileEnd isSpace
+{-# INLINE [1] stripEnd #-}
+
+-- | /O(n)/ Remove leading and trailing white space from a string.
+-- Equivalent to:
+--
+-- > dropAround isSpace
+strip :: Text -> Text
+strip = dropAround isSpace
+{-# INLINE [1] strip #-}
+
+-- | /O(n)/ 'splitAt' @n t@ returns a pair whose first element is a
+-- prefix of @t@ of length @n@, and whose second is the remainder of
+-- the string. It is equivalent to @('take' n t, 'drop' n t)@.
+splitAt :: Int -> Text -> (Text, Text)
+-- Computes the split offset once with 'iterN' (rather than running
+-- 'take' and 'drop' separately); both halves are zero-copy slices.
+splitAt n t@(Text arr off len)
+ | n <= 0 = (empty, t)
+ | n >= len = (t, empty)
+ | otherwise = let k = iterN n t
+ in (text arr off k, text arr (off+k) (len-k))
+
+-- | /O(n)/ 'span', applied to a predicate @p@ and text @t@, returns
+-- a pair whose first element is the longest prefix (possibly empty)
+-- of @t@ of elements that satisfy @p@, and whose second is the
+-- remainder of the list.
+span :: (Char -> Bool) -> Text -> (Text, Text)
+-- 'span_' returns an unboxed pair, avoiding a tuple allocation in
+-- the worker; we rebox only at this public boundary.
+span p t = case span_ p t of
+ (# hd,tl #) -> (hd,tl)
+{-# INLINE span #-}
+
+-- | /O(n)/ 'break' is like 'span', but the prefix returned is
+-- over elements that fail the predicate @p@.
+break :: (Char -> Bool) -> Text -> (Text, Text)
+break p = span (not . p)
+{-# INLINE break #-}
+
+-- | /O(n)/ Group characters in a string according to a predicate.
+groupBy :: (Char -> Char -> Bool) -> Text -> [Text]
+-- @n@ is the code-unit width of the current group: the first
+-- character's width @d@ plus the width of the run matching @p c@.
+groupBy p = loop
+ where
+ loop t@(Text arr off len)
+ | null t = []
+ | otherwise = text arr off n : loop (text arr (off+n) (len-n))
+ where Iter c d = iter t 0
+ n = d + findAIndexOrEnd (not . p c) (Text arr (off+d) (len-d))
+
+-- | Returns the /array/ index (in units of 'Word16') at which a
+-- character may be found. This is /not/ the same as the logical
+-- index returned by e.g. 'findIndex'.
+findAIndexOrEnd :: (Char -> Bool) -> Text -> Int
+-- Returns @len@ when no character satisfies @q@.
+findAIndexOrEnd q t@(Text _arr _off len) = go 0
+ where go !i | i >= len || q c = i
+ | otherwise = go (i+d)
+ where Iter c d = iter t i
+
+-- | /O(n)/ Group characters in a string by equality.
+group :: Text -> [Text]
+group = groupBy (==)
+
+-- | /O(n)/ Return all initial segments of the given 'Text', shortest
+-- first.
+inits :: Text -> [Text]
+-- Each prefix is a zero-copy slice of the shared array.
+inits t@(Text arr off len) = loop 0
+ where loop i | i >= len = [t]
+ | otherwise = Text arr off i : loop (i + iter_ t i)
+
+-- | /O(n)/ Return all final segments of the given 'Text', longest
+-- first.
+tails :: Text -> [Text]
+tails t | null t = [empty]
+ | otherwise = t : tails (unsafeTail t)
+
+-- $split
+--
+-- Splitting functions in this library do not perform character-wise
+-- copies to create substrings; they just construct new 'Text's that
+-- are slices of the original.
+
+-- | /O(m+n)/ Break a 'Text' into pieces separated by the first 'Text'
+-- argument (which cannot be empty), consuming the delimiter. An empty
+-- delimiter is invalid, and will cause an error to be raised.
+--
+-- Examples:
+--
+-- >>> splitOn "\r\n" "a\r\nb\r\nd\r\ne"
+-- ["a","b","d","e"]
+--
+-- >>> splitOn "aaa" "aaaXaaaXaaaXaaa"
+-- ["","X","X","X",""]
+--
+-- >>> splitOn "x" "x"
+-- ["",""]
+--
+-- and
+--
+-- > intercalate s . splitOn s == id
+-- > splitOn (singleton c) == split (==c)
+--
+-- (Note: the string @s@ to split on above cannot be empty.)
+--
+-- In (unlikely) bad cases, this function's time complexity degrades
+-- towards /O(n*m)/.
+splitOn :: Text
+ -- ^ String to split on. If this string is empty, an error
+ -- will occur.
+ -> Text
+ -- ^ Input text.
+ -> [Text]
+-- @go s xs@: @s@ is the start offset of the current piece, @xs@ the
+-- remaining match offsets from 'indices'.  Each piece is a zero-copy
+-- slice; the delimiter (width @l@) is skipped between pieces.
+splitOn pat@(Text _ _ l) src@(Text arr off len)
+ | l <= 0 = emptyError "splitOn"
+ | isSingleton pat = split (== unsafeHead pat) src
+ | otherwise = go 0 (indices pat src)
+ where
+ go !s (x:xs) = text arr (s+off) (x-s) : go (x+l) xs
+ go s _ = [text arr (s+off) (len-s)]
+{-# INLINE [1] splitOn #-}
+
+{-# RULES
+"TEXT splitOn/singleton -> split/==" [~1] forall c t.
+ splitOn (singleton c) t = split (==c) t
+ #-}
+
+-- | /O(n)/ Splits a 'Text' into components delimited by separators,
+-- where the predicate returns True for a separator element. The
+-- resulting components do not contain the separators. Two adjacent
+-- separators result in an empty component in the output. eg.
+--
+-- >>> split (=='a') "aabbaca"
+-- ["","","bb","c",""]
+--
+-- >>> split (=='a') ""
+-- [""]
+split :: (Char -> Bool) -> Text -> [Text]
+-- Empty input yields a singleton list (matching the doctest above).
+-- Otherwise 'span_' peels one piece at a time; 'unsafeTail' skips the
+-- separator character.
+split _ t@(Text _off _arr 0) = [t]
+split p t = loop t
+ where loop s | null s' = [l]
+ | otherwise = l : loop (unsafeTail s')
+ where (# l, s' #) = span_ (not . p) s
+{-# INLINE split #-}
+
+-- | /O(n)/ Splits a 'Text' into components of length @k@. The last
+-- element may be shorter than the other chunks, depending on the
+-- length of the input. Examples:
+--
+-- >>> chunksOf 3 "foobarbaz"
+-- ["foo","bar","baz"]
+--
+-- >>> chunksOf 4 "haskell.org"
+-- ["hask","ell.","org"]
+chunksOf :: Int -> Text -> [Text]
+-- Terminates when 'splitAt' yields an empty prefix (input exhausted).
+chunksOf k = go
+ where
+ go t = case splitAt k t of
+ (a,b) | null a -> []
+ | otherwise -> a : go b
+{-# INLINE chunksOf #-}
+
+-- ----------------------------------------------------------------------------
+-- * Searching
+
+-------------------------------------------------------------------------------
+-- ** Searching with a predicate
+
+-- | /O(n)/ The 'find' function takes a predicate and a 'Text', and
+-- returns the first element matching the predicate, or 'Nothing' if
+-- there is no such element.
+find :: (Char -> Bool) -> Text -> Maybe Char
+find p t = S.findBy p (stream t)
+{-# INLINE find #-}
+
+-- | /O(n)/ The 'partition' function takes a predicate and a 'Text',
+-- and returns the pair of 'Text's with elements which do and do not
+-- satisfy the predicate, respectively; i.e.
+--
+-- > partition p t == (filter p t, filter (not . p) t)
+partition :: (Char -> Bool) -> Text -> (Text, Text)
+-- Traverses the input twice (once per 'filter'); each pass can still
+-- fuse independently.
+partition p t = (filter p t, filter (not . p) t)
+{-# INLINE partition #-}
+
+-- | /O(n)/ 'filter', applied to a predicate and a 'Text',
+-- returns a 'Text' containing those characters that satisfy the
+-- predicate.
+filter :: (Char -> Bool) -> Text -> Text
+filter p t = unstream (S.filter p (stream t))
+{-# INLINE filter #-}
+
+-- | /O(n+m)/ Find the first instance of @needle@ (which must be
+-- non-'null') in @haystack@. The first element of the returned tuple
+-- is the prefix of @haystack@ before @needle@ is matched. The second
+-- is the remainder of @haystack@, starting with the match.
+--
+-- Examples:
+--
+-- >>> breakOn "::" "a::b::c"
+-- ("a","::b::c")
+--
+-- >>> breakOn "/" "foobar"
+-- ("foobar","")
+--
+-- Laws:
+--
+-- > append prefix match == haystack
+-- > where (prefix, match) = breakOn needle haystack
+--
+-- If you need to break a string by a substring repeatedly (e.g. you
+-- want to break on every instance of a substring), use 'breakOnAll'
+-- instead, as it has lower startup overhead.
+--
+-- In (unlikely) bad cases, this function's time complexity degrades
+-- towards /O(n*m)/.
+breakOn :: Text -> Text -> (Text, Text)
+-- Only the first offset from 'indices' is demanded; the rest of the
+-- lazy list is never forced.  Both halves are zero-copy slices.
+breakOn pat src@(Text arr off len)
+ | null pat = emptyError "breakOn"
+ | otherwise = case indices pat src of
+ [] -> (src, empty)
+ (x:_) -> (text arr off x, text arr (off+x) (len-x))
+{-# INLINE breakOn #-}
+
+-- | /O(n+m)/ Similar to 'breakOn', but searches from the end of the
+-- string.
+--
+-- The first element of the returned tuple is the prefix of @haystack@
+-- up to and including the last match of @needle@. The second is the
+-- remainder of @haystack@, following the match.
+--
+-- >>> breakOnEnd "::" "a::b::c"
+-- ("a::b::","c")
+breakOnEnd :: Text -> Text -> (Text, Text)
+-- Reduces to a forward search on the reversed strings; the extra
+-- reversals cost O(n+m) and do not change the complexity bound.
+breakOnEnd pat src = (reverse b, reverse a)
+ where (a,b) = breakOn (reverse pat) (reverse src)
+{-# INLINE breakOnEnd #-}
+
+-- | /O(n+m)/ Find all non-overlapping instances of @needle@ in
+-- @haystack@. Each element of the returned list consists of a pair:
+--
+-- * The entire string prior to the /k/th match (i.e. the prefix)
+--
+-- * The /k/th match, followed by the remainder of the string
+--
+-- Examples:
+--
+-- >>> breakOnAll "::" ""
+-- []
+--
+-- >>> breakOnAll "/" "a/b/c/"
+-- [("a","/b/c/"),("a/b","/c/"),("a/b/c","/")]
+--
+-- In (unlikely) bad cases, this function's time complexity degrades
+-- towards /O(n*m)/.
+--
+-- The @needle@ parameter may not be empty.
+breakOnAll :: Text -- ^ @needle@ to search for
+ -> Text -- ^ @haystack@ in which to search
+ -> [(Text, Text)]
+-- Every pair shares the original array; no character data is copied.
+breakOnAll pat src@(Text arr off slen)
+ | null pat = emptyError "breakOnAll"
+ | otherwise = L.map step (indices pat src)
+ where
+ step x = (chunk 0 x, chunk x (slen-x))
+ chunk !n !l = text arr (n+off) l
+{-# INLINE breakOnAll #-}
+
+-------------------------------------------------------------------------------
+-- ** Indexing 'Text's
+
+-- $index
+--
+-- If you think of a 'Text' value as an array of 'Char' values (which
+-- it is not), you run the risk of writing inefficient code.
+--
+-- An idiom that is common in some languages is to find the numeric
+-- offset of a character or substring, then use that number to split
+-- or trim the searched string. With a 'Text' value, this approach
+-- would require two /O(n)/ operations: one to perform the search, and
+-- one to operate from wherever the search ended.
+--
+-- For example, suppose you have a string that you want to split on
+-- the substring @\"::\"@, such as @\"foo::bar::quux\"@. Instead of
+-- searching for the index of @\"::\"@ and taking the substrings
+-- before and after that index, you would instead use @breakOnAll \"::\"@.
+
+-- | /O(n)/ 'Text' index (subscript) operator, starting from 0.
+index :: Text -> Int -> Char
+-- Partial: 'S.index' errors when @n@ is negative or past the end.
+index t n = S.index (stream t) n
+{-# INLINE index #-}
+
+-- | /O(n)/ The 'findIndex' function takes a predicate and a 'Text'
+-- and returns the index of the first element in the 'Text' satisfying
+-- the predicate. Subject to fusion.
+findIndex :: (Char -> Bool) -> Text -> Maybe Int
+findIndex p t = S.findIndex p (stream t)
+{-# INLINE findIndex #-}
+
+-- | /O(n+m)/ The 'count' function returns the number of times the
+-- query string appears in the given 'Text'. An empty query string is
+-- invalid, and will cause an error to be raised.
+--
+-- In (unlikely) bad cases, this function's time complexity degrades
+-- towards /O(n*m)/.
+count :: Text -> Text -> Int
+-- Single-character patterns take the cheaper fusible 'countChar' path;
+-- the rule below performs the same specialisation at compile time.
+count pat src
+ | null pat = emptyError "count"
+ | isSingleton pat = countChar (unsafeHead pat) src
+ | otherwise = L.length (indices pat src)
+{-# INLINE [1] count #-}
+
+{-# RULES
+"TEXT count/singleton -> countChar" [~1] forall c t.
+ count (singleton c) t = countChar c t
+ #-}
+
+-- | /O(n)/ The 'countChar' function returns the number of times the
+-- query element appears in the given 'Text'. Subject to fusion.
+countChar :: Char -> Text -> Int
+countChar c t = S.countChar c (stream t)
+{-# INLINE countChar #-}
+
+-------------------------------------------------------------------------------
+-- * Zipping
+
+-- | /O(n)/ 'zip' takes two 'Text's and returns a list of
+-- corresponding pairs of characters. If one input 'Text' is short,
+-- excess elements of the longer 'Text' are discarded. This is
+-- equivalent to a pair of 'unpack' operations.
+zip :: Text -> Text -> [(Char,Char)]
+zip a b = S.unstreamList $ S.zipWith (,) (stream a) (stream b)
+{-# INLINE zip #-}
+
+-- | /O(n)/ 'zipWith' generalises 'zip' by zipping with the function
+-- given as the first argument, instead of a tupling function.
+-- Performs replacement on invalid scalar values.
+zipWith :: (Char -> Char -> Char) -> Text -> Text -> Text
+-- 'safe' sanitises whatever @f@ produces before it enters the result.
+zipWith f t1 t2 = unstream (S.zipWith g (stream t1) (stream t2))
+ where g a b = safe (f a b)
+{-# INLINE zipWith #-}
+
+-- | /O(n)/ Breaks a 'Text' up into a list of words, delimited by 'Char's
+-- representing white space.
+words :: Text -> [Text]
+-- Single pass: @start@ is the code-unit offset of the current word,
+-- @n@ the scan position.  @start == n@ means we are between words, so
+-- runs of whitespace produce no empty words.  Each word is a
+-- zero-copy slice of the original array.
+words t@(Text arr off len) = loop 0 0
+ where
+ loop !start !n
+ | n >= len = if start == n
+ then []
+ else [Text arr (start+off) (n-start)]
+ | isSpace c =
+ if start == n
+ then loop (start+d) (start+d)
+ else Text arr (start+off) (n-start) : loop (n+d) (n+d)
+ | otherwise = loop start (n+d)
+ where Iter c d = iter t n
+{-# INLINE words #-}
+
+-- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at
+-- newline 'Char's. The resulting strings do not contain newlines.
+lines :: Text -> [Text]
+-- 'span_' yields an unboxed pair (no allocation); 'unsafeTail' skips
+-- the @'\n'@ itself, so it never appears in the output.
+lines ps | null ps = []
+ | otherwise = h : if null t
+ then []
+ else lines (unsafeTail t)
+ where (# h,t #) = span_ (/= '\n') ps
+{-# INLINE lines #-}
+
+{-
+-- | /O(n)/ Portably breaks a 'Text' up into a list of 'Text's at line
+-- boundaries.
+--
+-- A line boundary is considered to be either a line feed, a carriage
+-- return immediately followed by a line feed, or a carriage return.
+-- This accounts for both Unix and Windows line ending conventions,
+-- and for the old convention used on Mac OS 9 and earlier.
+lines' :: Text -> [Text]
+lines' ps | null ps = []
+ | otherwise = h : case uncons t of
+ Nothing -> []
+ Just (c,t')
+ | c == '\n' -> lines t'
+ | c == '\r' -> case uncons t' of
+ Just ('\n',t'') -> lines t''
+ _ -> lines t'
+ where (h,t) = span notEOL ps
+ notEOL c = c /= '\n' && c /= '\r'
+{-# INLINE lines' #-}
+-}
+
+-- | /O(n)/ Joins lines, after appending a terminating newline to
+-- each.
+unlines :: [Text] -> Text
+-- Appends the newline per line first, then makes a single 'concat'
+-- allocation for the final result.
+unlines = concat . L.map (`snoc` '\n')
+{-# INLINE unlines #-}
+
+-- | /O(n)/ Joins words using single space characters.
+unwords :: [Text] -> Text
+unwords = intercalate (singleton ' ')
+{-# INLINE unwords #-}
+
+-- | /O(n)/ The 'isPrefixOf' function takes two 'Text's and returns
+-- 'True' iff the first is a prefix of the second. Subject to fusion.
+isPrefixOf :: Text -> Text -> Bool
+-- The cheap length comparison short-circuits before any character
+-- comparison when the candidate prefix is too long.
+isPrefixOf a@(Text _ _ alen) b@(Text _ _ blen) =
+ alen <= blen && S.isPrefixOf (stream a) (stream b)
+{-# INLINE [1] isPrefixOf #-}
+
+{-# RULES
+"TEXT isPrefixOf -> fused" [~1] forall s t.
+ isPrefixOf s t = S.isPrefixOf (stream s) (stream t)
+ #-}
+
+-- | /O(n)/ The 'isSuffixOf' function takes two 'Text's and returns
+-- 'True' iff the first is a suffix of the second.
+isSuffixOf :: Text -> Text -> Bool
+-- Compares @a@ against the last @alen@ code units of @b@, taken as a
+-- zero-copy slice; @d < 0@ means @a@ is longer than @b@.
+isSuffixOf a@(Text _aarr _aoff alen) b@(Text barr boff blen) =
+ d >= 0 && a == b'
+ where d = blen - alen
+ b' | d == 0 = b
+ | otherwise = Text barr (boff+d) alen
+{-# INLINE isSuffixOf #-}
+
+-- | /O(n+m)/ The 'isInfixOf' function takes two 'Text's and returns
+-- 'True' iff the first is contained, wholly and intact, anywhere
+-- within the second.
+--
+-- In (unlikely) bad cases, this function's time complexity degrades
+-- towards /O(n*m)/.
+isInfixOf :: Text -> Text -> Bool
+-- Single-character needles use the fusible 'S.elem' path; the rule
+-- below applies the same specialisation at compile time.
+isInfixOf needle haystack
+ | null needle = True
+ | isSingleton needle = S.elem (unsafeHead needle) . S.stream $ haystack
+ | otherwise = not . L.null . indices needle $ haystack
+{-# INLINE [1] isInfixOf #-}
+
+{-# RULES
+"TEXT isInfixOf/singleton -> S.elem/S.stream" [~1] forall n h.
+ isInfixOf (singleton n) h = S.elem n (S.stream h)
+ #-}
+
+-------------------------------------------------------------------------------
+-- * View patterns
+
+-- | /O(n)/ Return the suffix of the second string if its prefix
+-- matches the entire first string.
+--
+-- Examples:
+--
+-- >>> stripPrefix "foo" "foobar"
+-- Just "bar"
+--
+-- >>> stripPrefix "" "baz"
+-- Just "baz"
+--
+-- >>> stripPrefix "foo" "quux"
+-- Nothing
+--
+-- This is particularly useful with the @ViewPatterns@ extension to
+-- GHC, as follows:
+--
+-- > {-# LANGUAGE ViewPatterns #-}
+-- > import Data.Text as T
+-- >
+-- > fnordLength :: Text -> Int
+-- > fnordLength (stripPrefix "fnord" -> Just suf) = T.length suf
+-- > fnordLength _ = -1
+stripPrefix :: Text -> Text -> Maybe Text
+-- @Just $!@ forces the slice eagerly so no thunk retains @p@.  The
+-- result shares the array of @t@ (zero-copy slice).
+stripPrefix p@(Text _arr _off plen) t@(Text arr off len)
+ | p `isPrefixOf` t = Just $! text arr (off+plen) (len-plen)
+ | otherwise = Nothing
+
+-- | /O(n)/ Find the longest non-empty common prefix of two strings
+-- and return it, along with the suffixes of each string at which they
+-- no longer match.
+--
+-- If the strings do not have a common prefix or either one is empty,
+-- this function returns 'Nothing'.
+--
+-- Examples:
+--
+-- >>> commonPrefixes "foobar" "fooquux"
+-- Just ("foo","bar","quux")
+--
+-- >>> commonPrefixes "veeble" "fetzer"
+-- Nothing
+--
+-- >>> commonPrefixes "" "baz"
+-- Nothing
+commonPrefixes :: Text -> Text -> Maybe (Text,Text,Text)
+-- @i@/@j@ are code-unit offsets into @t0@/@t1@; they are tracked
+-- separately because the same character may occupy different widths.
+-- All three results are zero-copy slices.
+commonPrefixes t0@(Text arr0 off0 len0) t1@(Text arr1 off1 len1) = go 0 0
+ where
+ go !i !j | i < len0 && j < len1 && a == b = go (i+d0) (j+d1)
+ | i > 0 = Just (Text arr0 off0 i,
+ text arr0 (off0+i) (len0-i),
+ text arr1 (off1+j) (len1-j))
+ | otherwise = Nothing
+ where Iter a d0 = iter t0 i
+ Iter b d1 = iter t1 j
+
+-- | /O(n)/ Return the prefix of the second string if its suffix
+-- matches the entire first string.
+--
+-- Examples:
+--
+-- >>> stripSuffix "bar" "foobar"
+-- Just "foo"
+--
+-- >>> stripSuffix "" "baz"
+-- Just "baz"
+--
+-- >>> stripSuffix "foo" "quux"
+-- Nothing
+--
+-- This is particularly useful with the @ViewPatterns@ extension to
+-- GHC, as follows:
+--
+-- > {-# LANGUAGE ViewPatterns #-}
+-- > import Data.Text as T
+-- >
+-- > quuxLength :: Text -> Int
+-- > quuxLength (stripSuffix "quux" -> Just pre) = T.length pre
+-- > quuxLength _ = -1
+stripSuffix :: Text -> Text -> Maybe Text
+-- Mirrors 'stripPrefix': eager zero-copy slice of the haystack.
+stripSuffix p@(Text _arr _off plen) t@(Text arr off len)
+ | p `isSuffixOf` t = Just $! text arr off (len-plen)
+ | otherwise = Nothing
+
+-- | Add a list of non-negative numbers. Errors out on overflow.
+sumP :: String -> [Int] -> Int
+-- Because the inputs are required to be non-negative, a negative
+-- running total can only mean signed 'Int' overflow, which is
+-- reported via 'overflowError' with the caller's name @fun@.
+sumP fun = go 0
+ where go !a (x:xs)
+ | ax >= 0 = go ax xs
+ | otherwise = overflowError fun
+ where ax = a + x
+ go a _ = a
+
+-- | Raise an error naming @fun@ for an invalid empty input.
+emptyError :: String -> a
+emptyError fun = P.error $ "Data.Text." ++ fun ++ ": empty input"
+
+-- | Raise an error naming @fun@ for an 'Int' size overflow.
+overflowError :: String -> a
+overflowError fun = P.error $ "Data.Text." ++ fun ++ ": size overflow"
+
+-- | /O(n)/ Make a distinct copy of the given string, sharing no
+-- storage with the original string.
+--
+-- As an example, suppose you read a large string, of which you need
+-- only a small portion. If you do not use 'copy', the entire original
+-- array will be kept alive in memory by the smaller string. Making a
+-- copy \"breaks the link\" to the original array, allowing it to be
+-- garbage collected if there are no other live references to it.
+copy :: Text -> Text
+-- Allocates a fresh array of exactly @len@ code units, so the result
+-- starts at offset 0 and retains nothing of the source array.
+copy (Text arr off len) = Text (A.run go) 0 len
+ where
+ go :: ST s (A.MArray s)
+ go = do
+ marr <- A.new len
+ A.copyI marr 0 arr off len
+ return marr
+
+
+-------------------------------------------------
+-- NOTE: the named chunk below used by doctest;
+-- verify the doctests via `doctest -fobject-code Data/Text.hs`
+
+-- $setup
+-- >>> :set -XOverloadedStrings
+-- >>> import qualified Data.Text as T
diff --git a/Data/Text/Array.hs b/Data/Text/Array.hs
new file mode 100644
index 0000000..426fef0
--- /dev/null
+++ b/Data/Text/Array.hs
@@ -0,0 +1,329 @@
+{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, MagicHash, Rank2Types,
+ RecordWildCards, UnboxedTuples, UnliftedFFITypes #-}
+{-# OPTIONS_GHC -fno-warn-unused-matches #-}
+-- |
+-- Module : Data.Text.Array
+-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Portability : portable
+--
+-- Packed, unboxed, heap-resident arrays. Suitable for performance
+-- critical use, both in terms of large data quantities and high
+-- speed.
+--
+-- This module is intended to be imported @qualified@, to avoid name
+-- clashes with "Prelude" functions, e.g.
+--
+-- > import qualified Data.Text.Array as A
+--
+-- The names in this module resemble those in the 'Data.Array' family
+-- of modules, but are shorter due to the assumption of qualified
+-- naming.
+module Data.Text.Array
+ (
+ -- * Types
+ Array(Array, aBA)
+ , MArray(MArray, maBA)
+
+ -- * Functions
+ , copyM
+ , copyI
+ , copyToPtr
+ , copyFromPtr
+
+ , empty
+ , equal
+ , cmp
+#if defined(ASSERTS)
+ , length
+#endif
+ , run
+ , run2
+ , toList
+ , unsafeFreeze
+ , unsafeIndex
+ , unsafeIndex32
+ , unsafeIndex64
+ , new
+ , unsafeWrite
+ , unsafeWrite32
+ , unsafeWrite64
+ ) where
+
+#if defined(ASSERTS)
+-- This fugly hack is brought by GHC's apparent reluctance to deal
+-- with MagicHash and UnboxedTuples when inferring types. Eek!
+# define CHECK_BOUNDS(_func_,_len_,_k_) \
+if (_k_) < 0 || (_k_) >= (_len_) then error ("Data.Text.Array." ++ (_func_) ++ ": bounds error, offset " ++ show (_k_) ++ ", length " ++ show (_len_)) else
+#else
+# define CHECK_BOUNDS(_func_,_len_,_k_)
+#endif
+
+#include "MachDeps.h"
+
+#if defined(ASSERTS)
+import Control.Exception (assert)
+#endif
+import Data.Bits ((.&.), xor)
+import Data.Text.Internal.Unsafe.Shift (shiftR)
+import Foreign.Ptr (Ptr)
+#if __GLASGOW_HASKELL__ >= 804
+import GHC.Exts (compareByteArrays#)
+#elif __GLASGOW_HASKELL__ >= 703
+import Data.Text.Internal.Unsafe (inlinePerformIO)
+import Foreign.C.Types (CInt(CInt), CSize(CSize))
+#else
+import Data.Text.Internal.Unsafe (inlinePerformIO)
+import Foreign.C.Types (CInt, CSize)
+#endif
+import GHC.Base (IO(..), ByteArray#, MutableByteArray#, Int(..), (-#),
+ indexWord8Array#, indexWord32Array#, indexWord64Array#, newByteArray#, plusAddr#,
+ unsafeFreezeByteArray#, writeWord8Array#, writeWord32Array#, writeWord64Array#,
+ copyByteArray#, copyMutableByteArray#, copyByteArrayToAddr#,
+ copyAddrToByteArray#)
+import GHC.Exts (Ptr(..))
+import GHC.ST (ST(..), runST)
+import GHC.Word (Word8(..), Word32(..), Word64(..))
+import Prelude hiding (length, read)
+
+-- | Immutable array type.
+--
+-- The 'Array' constructor is exposed since @text-1.1.1.3@
+data Array = Array {
+ aBA :: ByteArray#
+#if defined(ASSERTS)
+ , aLen :: {-# UNPACK #-} !Int -- length in bytes
+#endif
+ }
+
+-- | Mutable array type, for use in the ST monad.
+--
+-- The 'MArray' constructor is exposed since @text-1.1.1.3@
+data MArray s = MArray {
+ maBA :: MutableByteArray# s
+#if defined(ASSERTS)
+ , maLen :: {-# UNPACK #-} !Int -- length in bytes
+#endif
+ }
+
+#if defined(ASSERTS)
+-- | Operations supported by all arrays.
+class IArray a where
+ -- | Return the length of an array.
+ length :: a -> Int
+
+instance IArray Array where
+ length = aLen
+ {-# INLINE length #-}
+
+instance IArray (MArray s) where
+ length = maLen
+ {-# INLINE length #-}
+#endif
+
+-- | Create an uninitialized mutable array.
+new :: forall s. Int -> ST s (MArray s)
+new n
+ | n < 0 || n .&. highBit /= 0 = array_size_error
+ | otherwise = ST $ \s1# ->
+ case newByteArray# len# s1# of
+ (# s2#, marr# #) -> (# s2#, MArray marr#
+#if defined(ASSERTS)
+ n
+#endif
+ #)
+ where !(I# len#) = bytesInArray n
+ highBit = maxBound `xor` (maxBound `shiftR` 1)
+{-# INLINE new #-}
+
+array_size_error :: a
+array_size_error = error "Data.Text.Array.new: size overflow"
+
+-- | Freeze a mutable array. Do not mutate the 'MArray' afterwards!
+unsafeFreeze :: MArray s -> ST s Array
+unsafeFreeze MArray{..} = ST $ \s1# ->
+ case unsafeFreezeByteArray# maBA s1# of
+ (# s2#, ba# #) -> (# s2#, Array ba#
+#if defined(ASSERTS)
+ maLen
+#endif
+ #)
+{-# INLINE unsafeFreeze #-}
+
+-- | Indicate how many bytes would be used for an array of the given
+-- size.
+bytesInArray :: Int -> Int
+bytesInArray n = n
+{-# INLINE bytesInArray #-}
+
+-- | Unchecked read of an immutable array. May return garbage or
+-- crash on an out-of-bounds access.
+unsafeIndex :: Array -> Int -> Word8
+unsafeIndex Array{..} i@(I# i#) =
+ CHECK_BOUNDS("unsafeIndex",aLen,i)
+ case indexWord8Array# aBA i# of r# -> (W8# r#)
+{-# INLINE unsafeIndex #-}
+
+-- | Unchecked read of an immutable array. May return garbage or
+-- crash on an out-of-bounds access.
+unsafeIndex32 :: Array -> Int -> Word32
+unsafeIndex32 Array{..} i@(I# i#) =
+ CHECK_BOUNDS("unsafeIndex32",aLen `quot` 4,i)
+ case indexWord32Array# aBA i# of r# -> (W32# r#)
+{-# INLINE unsafeIndex32 #-}
+
+-- | Unchecked read of an immutable array. May return garbage or
+-- crash on an out-of-bounds access.
+unsafeIndex64 :: Array -> Int -> Word64
+unsafeIndex64 Array{..} i@(I# i#) =
+ CHECK_BOUNDS("unsafeIndex64",aLen `quot` 8,i)
+ case indexWord64Array# aBA i# of r# -> (W64# r#)
+{-# INLINE unsafeIndex64 #-}
+
+-- | Unchecked write of a mutable array. May return garbage or crash
+-- on an out-of-bounds access.
+unsafeWrite :: MArray s -> Int -> Word8 -> ST s ()
+unsafeWrite MArray{..} i@(I# i#) (W8# e#) = ST $ \s1# ->
+ CHECK_BOUNDS("unsafeWrite",maLen,i)
+ case writeWord8Array# maBA i# e# s1# of
+ s2# -> (# s2#, () #)
+{-# INLINE unsafeWrite #-}
+
+-- | Unchecked write of a mutable array. May return garbage or crash
+-- on an out-of-bounds access.
+unsafeWrite32 :: MArray s -> Int -> Word32 -> ST s ()
+unsafeWrite32 MArray{..} i@(I# i#) (W32# e#) = ST $ \s1# ->
+ CHECK_BOUNDS("unsafeWrite32",maLen `quot` 4,i)
+ case writeWord32Array# maBA i# e# s1# of
+ s2# -> (# s2#, () #)
+{-# INLINE unsafeWrite32 #-}
+
+-- | Unchecked write of a mutable array. May return garbage or crash
+-- on an out-of-bounds access.
+unsafeWrite64 :: MArray s -> Int -> Word64 -> ST s ()
+unsafeWrite64 MArray{..} i@(I# i#) (W64# e#) = ST $ \s1# ->
+ CHECK_BOUNDS("unsafeWrite64",maLen `quot` 8,i)
+ case writeWord64Array# maBA i# e# s1# of
+ s2# -> (# s2#, () #)
+{-# INLINE unsafeWrite64 #-}
+
+-- | Convert an immutable array to a list.
+toList :: Array -> Int -> Int -> [Word8]
+toList ary off len = loop 0
+ where loop i | i < len = unsafeIndex ary (off+i) : loop (i+1)
+ | otherwise = []
+
+-- | An empty immutable array.
+empty :: Array
+empty = runST (new 0 >>= unsafeFreeze)
+
+-- | Run an action in the ST monad and return an immutable array of
+-- its result.
+run :: (forall s. ST s (MArray s)) -> Array
+run k = runST (k >>= unsafeFreeze)
+
+-- | Run an action in the ST monad and return an immutable array of
+-- its result paired with whatever else the action returns.
+run2 :: (forall s. ST s (MArray s, a)) -> (Array, a)
+run2 k = runST (do
+ (marr,b) <- k
+ arr <- unsafeFreeze marr
+ return (arr,b))
+{-# INLINE run2 #-}
+
+-- | Copy some elements of a mutable array.
+copyM :: MArray s -- ^ Destination
+ -> Int -- ^ Destination offset
+ -> MArray s -- ^ Source
+ -> Int -- ^ Source offset
+ -> Int -- ^ Count
+ -> ST s ()
+copyM dest didx@(I# didx#) src sidx@(I# sidx#) count@(I# count#)
+ | count <= 0 = return ()
+ | otherwise =
+#if defined(ASSERTS)
+ assert (sidx + count <= length src) .
+ assert (didx + count <= length dest) .
+#endif
+ ST $ \s ->
+ case copyMutableByteArray# (maBA src) sidx# (maBA dest) didx# count# s of
+ s' -> (# s', () #)
+{-# INLINE copyM #-}
+
+-- | Copy some elements of an immutable array.
+copyI :: MArray s -- ^ Destination
+ -> Int -- ^ Destination offset
+ -> Array -- ^ Source
+ -> Int -- ^ Source offset
+ -> Int -- ^ First offset in destination /not/ to
+ -- copy (i.e. /not/ length)
+ -> ST s ()
+copyI dest i0@(I# i0#) src _j0@(I# j0#) top@(I# top#)
+ | i0 >= top = return ()
+ | otherwise = ST $ \s ->
+ case copyByteArray# (aBA src) j0# (maBA dest) i0# (top# -# i0#) s of
+ s' -> (# s', () #)
+{-# INLINE copyI #-}
+
+-- | Compare portions of two arrays for equality. No bounds checking
+-- is performed.
+equal :: Array -- ^ First
+ -> Int -- ^ Offset into first
+ -> Array -- ^ Second
+ -> Int -- ^ Offset into second
+ -> Int -- ^ Count
+ -> Bool
+equal arrA offA arrB offB count = cmp arrA offA arrB offB count == EQ
+{-# INLINE equal #-}
+
+-- | Compare portions of two arrays, returning their ordering. No
+-- bounds checking is performed.
+cmp :: Array -- ^ First
+ -> Int -- ^ Offset into first
+ -> Array -- ^ Second
+ -> Int -- ^ Offset into second
+ -> Int -- ^ Count
+ -> Ordering
+#if __GLASGOW_HASKELL__ >= 804
+cmp arrA (I# offA) arrB (I# offB) (I# count) =
+ compare (I# (compareByteArrays# (aBA arrA) offA (aBA arrB) offB count)) 0
+#else
+cmp arrA offA arrB offB count = inlinePerformIO $ do
+ i <- memcmp (aBA arrA) (fromIntegral offA)
+ (aBA arrB) (fromIntegral offB) (fromIntegral count)
+ return $ compare i 0
+{-# INLINE cmp #-}
+
+foreign import ccall unsafe "_hs_text_utf_8_memcmp" memcmp
+ :: ByteArray# -> CSize -> ByteArray# -> CSize -> CSize -> IO CInt
+#endif
+
+-- | Copy some elements of an immutable array to a pointer
+copyToPtr :: Ptr Word8 -- ^ Destination
+ -> Int -- ^ Destination offset
+ -> Array -- ^ Source
+ -> Int -- ^ Source offset
+ -> Int -- ^ First offset in destination /not/ to
+ -- copy (i.e. /not/ length)
+ -> IO ()
+copyToPtr dest@(Ptr dest#) i0@(I# i0#) src j0@(I# j0#) top@(I# top#)
+ | i0 >= top = return ()
+ | otherwise =
+ IO $ \s -> case copyByteArrayToAddr# (aBA src) j0# (plusAddr# dest# i0#) (top# -# i0#) s of
+ s' -> (# s', () #)
+{-# INLINE copyToPtr #-}
+
+copyFromPtr :: MArray s -- ^ Destination
+ -> Int -- ^ Destination offset
+ -> Ptr Word8 -- ^ Source
+ -> Int -- ^ Source offset
+ -> Int -- ^ Count
+ -> ST s ()
+copyFromPtr dest i0@(I# i0#) src@(Ptr src#) j0@(I# j0#) count@(I# count#)
+ | count <= 0 = return ()
+ | otherwise =
+ ST $ \s -> case copyAddrToByteArray# (plusAddr# src# i0#) (maBA dest) j0# count# s of
+ s' -> (# s', () #)
+{-# INLINE copyFromPtr #-}
diff --git a/Data/Text/Encoding.hs b/Data/Text/Encoding.hs
new file mode 100644
index 0000000..8693f72
--- /dev/null
+++ b/Data/Text/Encoding.hs
@@ -0,0 +1,477 @@
+{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
+ UnliftedFFITypes #-}
+#if __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+-- |
+-- Module : Data.Text.Encoding
+-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan,
+-- (c) 2009 Duncan Coutts,
+-- (c) 2008, 2009 Tom Harper
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Portability : portable
+--
+-- Functions for converting 'Text' values to and from 'ByteString',
+-- using several standard encodings.
+--
+-- To gain access to a much larger family of encodings, use the
+-- <http://hackage.haskell.org/package/text-icu text-icu package>.
+
+module Data.Text.Encoding
+ (
+ -- * Decoding ByteStrings to Text
+ -- $strict
+ decodeASCII
+ , decodeLatin1
+ , decodeUtf8
+ , decodeUtf16LE
+ , decodeUtf16BE
+ , decodeUtf32LE
+ , decodeUtf32BE
+
+ -- ** Catchable failure
+ , decodeUtf8'
+
+ -- ** Controllable error handling
+ , decodeUtf8With
+ , decodeUtf16LEWith
+ , decodeUtf16BEWith
+ , decodeUtf32LEWith
+ , decodeUtf32BEWith
+
+ -- ** Stream oriented decoding
+ -- $stream
+ , streamDecodeUtf8
+ , streamDecodeUtf8With
+ , Decoding(..)
+
+ -- * Encoding Text to ByteStrings
+ , encodeUtf8
+ , encodeUtf16LE
+ , encodeUtf16BE
+ , encodeUtf32LE
+ , encodeUtf32BE
+
+ -- * Encoding Text using ByteString Builders
+ , encodeUtf8Builder
+ , encodeUtf8BuilderEscaped
+ ) where
+
+#if __GLASGOW_HASKELL__ >= 702
+import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
+#else
+import Control.Monad.ST (unsafeIOToST, unsafeSTToIO)
+#endif
+
+import Control.Exception (evaluate, try)
+import Control.Monad.ST (runST)
+import Data.ByteString as B
+import Data.ByteString.Internal as B hiding (c2w)
+import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
+import Data.Text.Internal (Text(..), safe, text)
+import Data.Text.Internal.Unsafe.Char (unsafeWrite)
+import Data.Text.Show ()
+import Data.Text.Unsafe (unsafeDupablePerformIO)
+import Data.Word (Word8, Word32)
+#if __GLASGOW_HASKELL__ >= 703
+import Foreign.C.Types (CSize)
+#else
+import Foreign.C.Types (CSize)
+#endif
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Marshal.Utils (with)
+import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
+import Foreign.Storable (Storable, peek, poke)
+import GHC.Base (MutableByteArray#)
+import qualified Data.ByteString.Builder as B
+import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
+import qualified Data.ByteString.Builder.Prim as BP
+import qualified Data.ByteString.Builder.Prim.Internal as BP
+import qualified Data.Text.Array as A
+import qualified Data.Text.Internal.Encoding.Fusion as E
+import qualified Data.Text.Internal.Fusion as F
+
+#include "text_cbits.h"
+
+-- $strict
+--
+-- All of the single-parameter functions for decoding bytestrings
+-- encoded in one of the Unicode Transformation Formats (UTF) operate
+-- in a /strict/ mode: each will throw an exception if given invalid
+-- input.
+--
+-- Each function has a variant, whose name is suffixed with -'With',
+-- that gives greater control over the handling of decoding errors.
+-- For instance, 'decodeUtf8' will throw an exception, but
+-- 'decodeUtf8With' allows the programmer to determine what to do on a
+-- decoding error.
+
+-- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
+-- encoded text.
+decodeASCII :: ByteString -> Text
+decodeASCII = decodeUtf8
+{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
+
+-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
+--
+-- 'decodeLatin1' is semantically equivalent to
+-- @Data.Text.pack . Data.ByteString.Char8.unpack@
+decodeLatin1 :: ByteString -> Text
+decodeLatin1 s = F.unstream (E.streamASCII s)
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
+decodeUtf8With :: OnDecodeError -> ByteString -> Text
+decodeUtf8With onErr s@(PS fp off len) = runST $ do
+  dest <- A.new len
+  unsafeIOToST $ do
+    withForeignPtr fp $ \ptr ->
+      with (0::CSize) $ \destOffPtr -> do
+        let curPtr = ptr `plusPtr` off
+        let end = ptr `plusPtr` (off + len)
+        curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
+        if curPtr' == end
+          then do  -- fast path: the C decoder consumed all input
+            n <- peek destOffPtr
+            dest' <- unsafeSTToIO (A.unsafeFreeze dest)
+            return (Text dest' 0 (fromIntegral n))
+          else do  -- decoder stopped early: redecode via the error-handling stream
+            return (F.unstream (E.streamUtf8 onErr s))
+{-# INLINE[0] decodeUtf8With #-}
+
+-- $stream
+--
+-- The 'streamDecodeUtf8' and 'streamDecodeUtf8With' functions accept
+-- a 'ByteString' that represents a possibly incomplete input (e.g. a
+-- packet from a network stream) that may not end on a UTF-8 boundary.
+--
+-- 1. The maximal prefix of 'Text' that could be decoded from the
+-- given input.
+--
+-- 2. The suffix of the 'ByteString' that could not be decoded due to
+-- insufficient input.
+--
+-- 3. A function that accepts another 'ByteString'. That string will
+-- be assumed to directly follow the string that was passed as
+-- input to the original function, and it will in turn be decoded.
+--
+-- To help understand the use of these functions, consider the Unicode
+-- string @\"hi &#9731;\"@. If encoded as UTF-8, this becomes @\"hi
+-- \\xe2\\x98\\x83\"@; the final @\'&#9731;\'@ is encoded as 3 bytes.
+--
+-- Now suppose that we receive this encoded string as 3 packets that
+-- are split up on untidy boundaries: @[\"hi \\xe2\", \"\\x98\",
+-- \"\\x83\"]@. We cannot decode the entire Unicode string until we
+-- have received all three packets, but we would like to make progress
+-- as we receive each one.
+--
+-- @
+-- ghci> let s0\@('Some' _ _ f0) = 'streamDecodeUtf8' \"hi \\xe2\"
+-- ghci> s0
+-- 'Some' \"hi \" \"\\xe2\" _
+-- @
+--
+-- We use the continuation @f0@ to decode our second packet.
+--
+-- @
+-- ghci> let s1\@('Some' _ _ f1) = f0 \"\\x98\"
+-- ghci> s1
+-- 'Some' \"\" \"\\xe2\\x98\" _
+-- @
+--
+-- We could not give @f0@ enough input to decode anything, so it
+-- returned an empty string. Once we feed our second continuation @f1@
+-- the last byte of input, it will make progress.
+--
+-- @
+-- ghci> let s2\@('Some' _ _ f2) = f1 \"\\x83\"
+-- ghci> s2
+-- 'Some' \"\\x2603\" \"\" _
+-- @
+--
+-- If given invalid input, an exception will be thrown by the function
+-- or continuation where it is encountered.
+
+-- | A stream oriented decoding result.
+--
+-- @since 1.0.0.0
+data Decoding = Some Text ByteString (ByteString -> Decoding)
+
+instance Show Decoding where
+ showsPrec d (Some t bs _) = showParen (d > prec) $
+ showString "Some " . showsPrec prec' t .
+ showChar ' ' . showsPrec prec' bs .
+ showString " _"
+ where prec = 10; prec' = prec + 1
+
+newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
+newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
+
+-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
+-- encoded text that is known to be valid.
+--
+-- If the input contains any invalid UTF-8 data, an exception will be
+-- thrown (either by this function or a continuation) that cannot be
+-- caught in pure code. For more control over the handling of invalid
+-- data, use 'streamDecodeUtf8With'.
+--
+-- @since 1.0.0.0
+streamDecodeUtf8 :: ByteString -> Decoding
+streamDecodeUtf8 = streamDecodeUtf8With strictDecode
+
+-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
+-- encoded text.
+--
+-- @since 1.0.0.0
+streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
+streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
+ where
+  -- We create a slightly larger than necessary buffer to accommodate
+  -- a code point whose bytes were split across the previous chunk
+ decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
+ -> Decoding
+ decodeChunk undecoded0 codepoint0 state0 bs@(PS fp off len) =
+ runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
+ where
+ decodeChunkToBuffer :: A.MArray s -> IO Decoding
+ decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+ with (0::CSize) $ \destOffPtr ->
+ with codepoint0 $ \codepointPtr ->
+ with state0 $ \statePtr ->
+ with nullPtr $ \curPtrPtr ->
+ let end = ptr `plusPtr` (off + len)
+ loop curPtr = do
+ poke curPtrPtr curPtr
+ curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
+ curPtrPtr end codepointPtr statePtr
+ state <- peek statePtr
+ case state of
+ UTF8_REJECT -> do
+ -- We encountered an encoding error
+ x <- peek curPtr'
+ poke statePtr 0
+ case onErr desc (Just x) of
+ Nothing -> loop $ curPtr' `plusPtr` 1
+ Just c -> do
+ destOff <- peek destOffPtr
+ w <- unsafeSTToIO $
+ unsafeWrite dest (fromIntegral destOff) (safe c)
+ poke destOffPtr (destOff + fromIntegral w)
+ loop $ curPtr' `plusPtr` 1
+
+ _ -> do
+ -- We encountered the end of the buffer while decoding
+ n <- peek destOffPtr
+ codepoint <- peek codepointPtr
+ chunkText <- unsafeSTToIO $ do
+ arr <- A.unsafeFreeze dest
+ return $! text arr 0 (fromIntegral n)
+ lastPtr <- peek curPtrPtr
+ let left = lastPtr `minusPtr` curPtr
+ !undecoded = case state of
+ UTF8_ACCEPT -> B.empty
+ _ -> B.append undecoded0 (B.drop left bs)
+ return $ Some chunkText undecoded
+ (decodeChunk undecoded codepoint state)
+ in loop (ptr `plusPtr` off)
+ desc = "Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
+-- to be valid.
+--
+-- If the input contains any invalid UTF-8 data, an exception will be
+-- thrown that cannot be caught in pure code. For more control over
+-- the handling of invalid data, use 'decodeUtf8'' or
+-- 'decodeUtf8With'.
+decodeUtf8 :: ByteString -> Text
+decodeUtf8 = decodeUtf8With strictDecode
+{-# INLINE[0] decodeUtf8 #-}
+{-# RULES "STREAM stream/decodeUtf8 fusion" [1]
+ forall bs. F.stream (decodeUtf8 bs) = E.streamUtf8 strictDecode bs #-}
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
+--
+-- If the input contains any invalid UTF-8 data, the relevant
+-- exception will be returned, otherwise the decoded text.
+decodeUtf8' :: ByteString -> Either UnicodeException Text
+decodeUtf8' = unsafeDupablePerformIO . try . evaluate . decodeUtf8With strictDecode
+{-# INLINE decodeUtf8' #-}
+
+-- | Encode text to a ByteString 'B.Builder' using UTF-8 encoding.
+--
+-- @since 1.1.0.0
+encodeUtf8Builder :: Text -> B.Builder
+encodeUtf8Builder = \t -> B.builder (textCopyStep t)
+{-# INLINE encodeUtf8Builder #-}
+
+textCopyStep :: Text -> B.BuildStep a -> B.BuildStep a
+textCopyStep !(Text arr off len) k = go 0 len
+ where
+ go !ip !ipe !(B.BufferRange op ope)
+ | inpRemaining <= outRemaining = do
+ A.copyToPtr op 0 arr (off + ip) inpRemaining
+ let !br' = B.BufferRange (op `plusPtr` inpRemaining) ope
+ k br'
+ | otherwise = do
+ A.copyToPtr op 0 arr (off + ip) outRemaining
+ let !ip' = ip + outRemaining
+ return $ B.bufferFull 1 ope (go ip' ipe)
+ where
+ outRemaining = ope `minusPtr` op
+ inpRemaining = ipe - ip
+{-# INLINE textCopyStep #-}
+
+-- | Encode text using UTF-8 encoding and escape the ASCII characters using
+-- a 'BP.BoundedPrim'.
+--
+-- Use this function is to implement efficient encoders for text-based formats
+-- like JSON or HTML.
+--
+-- @since 1.1.0.0
+{-# INLINE encodeUtf8BuilderEscaped #-}
+-- TODO: Extend documentation with references to source code in @blaze-html@
+-- or @aeson@ that uses this function.
+encodeUtf8BuilderEscaped :: BP.BoundedPrim Word8 -> Text -> B.Builder
+encodeUtf8BuilderEscaped be =
+ -- manual eta-expansion to ensure inlining works as expected
+ \txt -> B.builder (mkBuildstep txt)
+ where
+ bound = max 4 $ BP.sizeBound be
+
+ mkBuildstep (Text arr off len) !k =
+ outerLoop off
+ where
+ iend = off + len
+
+ outerLoop !i0 !br@(B.BufferRange op0 ope)
+ | i0 >= iend = k br
+ | outRemaining > 0 = goPartial (i0 + min outRemaining inpRemaining)
+ -- TODO: Use a loop with an integrated bound's check if outRemaining
+ -- is smaller than 8, as this will save on divisions.
+ | otherwise = return $ B.bufferFull bound op0 (outerLoop i0)
+ where
+ outRemaining = (ope `minusPtr` op0) `div` bound
+ inpRemaining = iend - i0
+
+ goPartial !iendTmp = go i0 op0
+ where
+ go !i !op
+ | i < iendTmp = case () of
+ _ | a <= 0x7F ->
+ BP.runB be (fromIntegral a) op >>= go (i + 1)
+ | 0xC2 <= a && a <= 0xDF -> do
+ poke8 0 a
+ poke8 1 b
+ go (i + 2) (op `plusPtr` 2)
+ | 0xE0 <= a && a <= 0xEF -> do
+ poke8 0 a
+ poke8 1 b
+ poke8 2 c
+ go (i + 3) (op `plusPtr` 3)
+ | otherwise -> do
+ poke8 0 a
+ poke8 1 b
+ poke8 2 c
+ poke8 3 d
+ go (i + 4) (op `plusPtr` 4)
+ | otherwise =
+ outerLoop i (B.BufferRange op ope)
+ where
+ poke8 j v = poke (op `plusPtr` j) (fromIntegral v :: Word8)
+ a = A.unsafeIndex arr i
+ b = A.unsafeIndex arr (i+1)
+ c = A.unsafeIndex arr (i+2)
+ d = A.unsafeIndex arr (i+3)
+
+
+-- | Encode text using UTF-8 encoding.
+encodeUtf8 :: Text -> ByteString
+encodeUtf8 (Text arr off len)
+ | len == 0 = B.empty
+ | otherwise = B.unsafeCreate len (\op -> A.copyToPtr op 0 arr off len)
+
+-- | Decode text from little endian UTF-16 encoding.
+decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
+decodeUtf16LEWith onErr bs = F.unstream (E.streamUtf16LE onErr bs)
+{-# INLINE decodeUtf16LEWith #-}
+
+-- | Decode text from little endian UTF-16 encoding.
+--
+-- If the input contains any invalid little endian UTF-16 data, an
+-- exception will be thrown. For more control over the handling of
+-- invalid data, use 'decodeUtf16LEWith'.
+decodeUtf16LE :: ByteString -> Text
+decodeUtf16LE = decodeUtf16LEWith strictDecode
+{-# INLINE decodeUtf16LE #-}
+
+-- | Decode text from big endian UTF-16 encoding.
+decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
+decodeUtf16BEWith onErr bs = F.unstream (E.streamUtf16BE onErr bs)
+{-# INLINE decodeUtf16BEWith #-}
+
+-- | Decode text from big endian UTF-16 encoding.
+--
+-- If the input contains any invalid big endian UTF-16 data, an
+-- exception will be thrown. For more control over the handling of
+-- invalid data, use 'decodeUtf16BEWith'.
+decodeUtf16BE :: ByteString -> Text
+decodeUtf16BE = decodeUtf16BEWith strictDecode
+{-# INLINE decodeUtf16BE #-}
+
+-- | Encode text using little endian UTF-16 encoding.
+encodeUtf16LE :: Text -> ByteString
+encodeUtf16LE txt = E.unstream (E.restreamUtf16LE (F.stream txt))
+{-# INLINE encodeUtf16LE #-}
+
+-- | Encode text using big endian UTF-16 encoding.
+encodeUtf16BE :: Text -> ByteString
+encodeUtf16BE txt = E.unstream (E.restreamUtf16BE (F.stream txt))
+{-# INLINE encodeUtf16BE #-}
+
+-- | Decode text from little endian UTF-32 encoding.
+decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
+decodeUtf32LEWith onErr bs = F.unstream (E.streamUtf32LE onErr bs)
+{-# INLINE decodeUtf32LEWith #-}
+
+-- | Decode text from little endian UTF-32 encoding.
+--
+-- If the input contains any invalid little endian UTF-32 data, an
+-- exception will be thrown. For more control over the handling of
+-- invalid data, use 'decodeUtf32LEWith'.
+decodeUtf32LE :: ByteString -> Text
+decodeUtf32LE = decodeUtf32LEWith strictDecode
+{-# INLINE decodeUtf32LE #-}
+
+-- | Decode text from big endian UTF-32 encoding.
+decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
+decodeUtf32BEWith onErr bs = F.unstream (E.streamUtf32BE onErr bs)
+{-# INLINE decodeUtf32BEWith #-}
+
+-- | Decode text from big endian UTF-32 encoding.
+--
+-- If the input contains any invalid big endian UTF-32 data, an
+-- exception will be thrown. For more control over the handling of
+-- invalid data, use 'decodeUtf32BEWith'.
+decodeUtf32BE :: ByteString -> Text
+decodeUtf32BE = decodeUtf32BEWith strictDecode
+{-# INLINE decodeUtf32BE #-}
+
+-- | Encode text using little endian UTF-32 encoding.
+encodeUtf32LE :: Text -> ByteString
+encodeUtf32LE txt = E.unstream (E.restreamUtf32LE (F.stream txt))
+{-# INLINE encodeUtf32LE #-}
+
+-- | Encode text using big endian UTF-32 encoding.
+encodeUtf32BE :: Text -> ByteString
+encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
+{-# INLINE encodeUtf32BE #-}
+
+foreign import ccall unsafe "_hs_text_utf_8_decode_utf8" c_decode_utf8
+ :: MutableByteArray# s -> Ptr CSize
+ -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
+
+foreign import ccall unsafe "_hs_text_utf_8_decode_utf8_state" c_decode_utf8_with_state
+ :: MutableByteArray# s -> Ptr CSize
+ -> Ptr (Ptr Word8) -> Ptr Word8
+ -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
diff --git a/Data/Text/Encoding/Error.hs b/Data/Text/Encoding/Error.hs
new file mode 100644
index 0000000..e0cd4fc
--- /dev/null
+++ b/Data/Text/Encoding/Error.hs
@@ -0,0 +1,124 @@
+{-# LANGUAGE CPP, DeriveDataTypeable #-}
+#if __GLASGOW_HASKELL__ >= 704
+{-# LANGUAGE Safe #-}
+#elif __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+-- |
+-- Module : Data.Text.Encoding.Error
+-- Copyright : (c) Bryan O'Sullivan 2009
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Portability : GHC
+--
+-- Types and functions for dealing with encoding and decoding errors
+-- in Unicode text.
+--
+-- The standard functions for encoding and decoding text are strict,
+-- which is to say that they throw exceptions on invalid input. This
+-- is often unhelpful on real world input, so alternative functions
+-- exist that accept custom handlers for dealing with invalid inputs.
+-- These 'OnError' handlers are normal Haskell functions. You can use
+-- one of the presupplied functions in this module, or you can write a
+-- custom handler of your own.
+
+module Data.Text.Encoding.Error
+ (
+ -- * Error handling types
+ UnicodeException(..)
+ , OnError
+ , OnDecodeError
+ , OnEncodeError
+ -- * Useful error handling functions
+ , lenientDecode
+ , strictDecode
+ , strictEncode
+ , ignore
+ , replace
+ ) where
+
+import Control.DeepSeq (NFData (..))
+import Control.Exception (Exception, throw)
+import Data.Typeable (Typeable)
+import Data.Word (Word8)
+import Numeric (showHex)
+
+-- | Function type for handling a coding error. It is supplied with
+-- two inputs:
+--
+-- * A 'String' that describes the error.
+--
+-- * The input value that caused the error. If the error arose
+-- because the end of input was reached or could not be identified
+-- precisely, this value will be 'Nothing'.
+--
+-- If the handler returns a value wrapped with 'Just', that value will
+-- be used in the output as the replacement for the invalid input. If
+-- it returns 'Nothing', no value will be used in the output.
+--
+-- Should the handler need to abort processing, it should use 'error'
+-- or 'throw' an exception (preferably a 'UnicodeException'). It may
+-- use the description provided to construct a more helpful error
+-- report.
+type OnError a b = String -> Maybe a -> Maybe b
+
+-- | A handler for a decoding error.
+type OnDecodeError = OnError Word8 Char
+
+-- | A handler for an encoding error.
+{-# DEPRECATED OnEncodeError "This exception is never used in practice, and will be removed." #-}
+type OnEncodeError = OnError Char Word8
+
+-- | An exception type for representing Unicode encoding errors.
+data UnicodeException =
+ DecodeError String (Maybe Word8)
+ -- ^ Could not decode a byte sequence because it was invalid under
+ -- the given encoding, or ran out of input in mid-decode.
+ | EncodeError String (Maybe Char)
+ -- ^ Tried to encode a character that could not be represented
+ -- under the given encoding, or ran out of input in mid-encode.
+ deriving (Eq, Typeable)
+
+{-# DEPRECATED EncodeError "This constructor is never used, and will be removed." #-}
+
+showUnicodeException :: UnicodeException -> String
+showUnicodeException (DecodeError desc (Just w))
+ = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc)
+showUnicodeException (DecodeError desc Nothing)
+ = "Cannot decode input: " ++ desc
+showUnicodeException (EncodeError desc (Just c))
+ = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc)
+showUnicodeException (EncodeError desc Nothing)
+ = "Cannot encode input: " ++ desc
+
+instance Show UnicodeException where
+ show = showUnicodeException
+
+instance Exception UnicodeException
+
+instance NFData UnicodeException where
+ rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` ()
+ rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` ()
+
+-- | Throw a 'UnicodeException' if decoding fails.
+strictDecode :: OnDecodeError
+strictDecode desc c = throw (DecodeError desc c)
+
+-- | Replace an invalid input byte with the Unicode replacement
+-- character U+FFFD.
+lenientDecode :: OnDecodeError
+lenientDecode _ _ = Just '\xfffd'
+
+-- | Throw a 'UnicodeException' if encoding fails.
+{-# DEPRECATED strictEncode "This function always throws an exception, and will be removed." #-}
+strictEncode :: OnEncodeError
+strictEncode desc c = throw (EncodeError desc c)
+
+-- | Ignore an invalid input, substituting nothing in the output.
+ignore :: OnError a b
+ignore _ _ = Nothing
+
+-- | Replace an invalid input with a valid output.
+replace :: b -> OnError a b
+replace c _ _ = Just c
diff --git a/Data/Text/Foreign.hs b/Data/Text/Foreign.hs
new file mode 100644
index 0000000..ca64054
--- /dev/null
+++ b/Data/Text/Foreign.hs
@@ -0,0 +1,158 @@
+{-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving #-}
+-- |
+-- Module : Data.Text.Foreign
+-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Portability : GHC
+--
+-- Support for using 'Text' data with native code via the Haskell
+-- foreign function interface.
+
+module Data.Text.Foreign
+ (
+ -- * Interoperability with native code
+ -- $interop
+ I8
+ -- * Safe conversion functions
+ , fromPtr
+ , useAsPtr
+ , asForeignPtr
+ -- ** Encoding as UTF-8
+ , peekCStringLen
+ , withCStringLen
+ -- * Unsafe conversion code
+ , lengthWord8
+ , unsafeCopyToPtr
+ -- * Low-level manipulation
+ -- $lowlevel
+ , dropWord8
+ , takeWord8
+ ) where
+
+#if defined(ASSERTS)
+import Control.Exception (assert)
+#endif
+import qualified Data.Text.Internal.Encoding.Utf8 as U8
+import Data.ByteString.Unsafe (unsafePackCStringLen, unsafeUseAsCStringLen)
+import Data.Text.Encoding (decodeUtf8, encodeUtf8)
+import Data.Text.Internal (Text(..), empty)
+import Data.Text.Unsafe (lengthWord8)
+import qualified Data.Text.Unsafe as Unsafe
+import Data.Word (Word8)
+import Foreign.C.String (CStringLen)
+import Foreign.ForeignPtr (ForeignPtr, mallocForeignPtrArray, withForeignPtr)
+import Foreign.Marshal.Alloc (allocaBytes)
+import Foreign.Ptr (Ptr, castPtr)
+import qualified Data.Text.Array as A
+
+-- $interop
+--
+-- The 'Text' type is implemented using arrays that are not guaranteed
+-- to have a fixed address in the Haskell heap. All communication with
+-- native code must thus occur by copying data back and forth.
+--
+-- The 'Text' type's internal representation is UTF-8.
+-- To interoperate with native libraries that use different
+-- internal representations, such as UTF-16 or UTF-32, consider using
+-- the functions in the 'Data.Text.Encoding' module.
+
+-- | A type representing a number of UTF-8 code units (bytes).
+newtype I8 = I8 Int
+ deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show)
+
+-- | /O(n)/ Create a new 'Text' from a 'Ptr' 'Word8' by copying the
+-- contents of the array.
+fromPtr :: Ptr Word8 -- ^ source array
+ -> I8 -- ^ length of source array (in 'Word8' units)
+ -> IO Text
+fromPtr _ (I8 0) = return empty
+fromPtr ptr (I8 len) =
+#if defined(ASSERTS)
+ assert (len > 0) $
+#endif
+ return $! Text arr 0 len
+ where
+ arr = A.run (A.new len >>= copy)
+ copy marr = A.copyFromPtr marr 0 ptr 0 len >> return marr
+
+-- $lowlevel
+--
+-- Foreign functions that use UTF-8 internally may return indices in
+-- units of 'Word8' (bytes) instead of characters. These functions
+-- may safely be used with such indices, as they will adjust offsets
+-- if necessary to preserve the validity of a Unicode string.
+
+-- | /O(1)/ Return the prefix of the 'Text' of @n@ 'Word8' units in
+-- length.
+--
+-- If @n@ would cause the 'Text' to end inside a multi-byte UTF-8
+-- sequence, the end of the prefix is advanced past the remaining
+-- continuation bytes to maintain its validity.
+takeWord8 :: I8 -> Text -> Text
+takeWord8 (I8 n) t@(Text arr off len)
+ | n <= 0 = empty
+ | n >= len = t
+ | U8.continuationByte x = takeWord8 (I8 (n + 1)) t
+ | otherwise = Unsafe.takeWord8 n t
+ where
+ x = A.unsafeIndex arr (off + n)
+
+-- | /O(1)/ Return the suffix of the 'Text', with @n@ 'Word8' units
+-- dropped from its beginning.
+--
+-- If @n@ would cause the 'Text' to begin inside a multi-byte UTF-8
+-- sequence, the beginning of the suffix is advanced past the
+-- remaining continuation bytes to maintain its validity.
+dropWord8 :: I8 -> Text -> Text
+dropWord8 (I8 n) t@(Text arr off len)
+ | n <= 0 = t
+ | n >= len = empty
+ | U8.continuationByte x = dropWord8 (I8 (n + 1)) t
+ | otherwise = Unsafe.dropWord8 n t
+ where
+ x = A.unsafeIndex arr (off + n)
+
+-- | /O(n)/ Copy a 'Text' to an array. The array is assumed to be big
+-- enough to hold the contents of the entire 'Text'.
+unsafeCopyToPtr :: Text -> Ptr Word8 -> IO ()
+unsafeCopyToPtr (Text arr off len) ptr =
+ A.copyToPtr ptr 0 arr off len
+
+-- | /O(n)/ Perform an action on a temporary, mutable copy of a
+-- 'Text'. The copy is freed as soon as the action returns.
+useAsPtr :: Text -> (Ptr Word8 -> I8 -> IO a) -> IO a
+useAsPtr t@(Text _arr _off len) action =
+ allocaBytes len $ \buf -> do
+ unsafeCopyToPtr t buf
+ action (castPtr buf) (fromIntegral len)
+
+-- | /O(n)/ Make a mutable copy of a 'Text'.
+asForeignPtr :: Text -> IO (ForeignPtr Word8, I8)
+asForeignPtr t@(Text _arr _off len) = do
+ fp <- mallocForeignPtrArray len
+ withForeignPtr fp $ unsafeCopyToPtr t
+ return (fp, I8 len)
+
+-- | /O(n)/ Decode a C string with explicit length, which is assumed
+-- to have been encoded as UTF-8. If decoding fails, a
+-- 'UnicodeException' is thrown.
+--
+-- @since 1.0.0.0
+peekCStringLen :: CStringLen -> IO Text
+peekCStringLen cs = do
+ bs <- unsafePackCStringLen cs
+ return $! decodeUtf8 bs
+
+-- | Marshal a 'Text' into a C string encoded as UTF-8 in temporary
+-- storage, with explicit length information. The encoded string may
+-- contain NUL bytes, and is not followed by a trailing NUL byte.
+--
+-- The temporary storage is freed when the subcomputation terminates
+-- (either normally or via an exception), so the pointer to the
+-- temporary storage must /not/ be used after this function returns.
+--
+-- @since 1.0.0.0
+withCStringLen :: Text -> (CStringLen -> IO a) -> IO a
+withCStringLen t act = unsafeUseAsCStringLen (encodeUtf8 t) act
diff --git a/Data/Text/IO.hs b/Data/Text/IO.hs
new file mode 100644
index 0000000..29afbd5
--- /dev/null
+++ b/Data/Text/IO.hs
@@ -0,0 +1,341 @@
+{-# LANGUAGE BangPatterns, CPP, RecordWildCards, ScopedTypeVariables #-}
+#if __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+-- |
+-- Module : Data.Text.IO
+-- Copyright : (c) 2009, 2010 Bryan O'Sullivan,
+-- (c) 2009 Simon Marlow
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Portability : GHC
+--
+-- Efficient locale-sensitive support for text I\/O.
+--
+-- Skip past the synopsis for some important notes on performance and
+-- portability across different versions of GHC.
+
+module Data.Text.IO
+ (
+ -- * Performance
+ -- $performance
+
+ -- * Locale support
+ -- $locale
+ -- * File-at-a-time operations
+ readFile
+ , writeFile
+ , appendFile
+ -- * Operations on handles
+ , hGetContents
+ , hGetChunk
+ , hGetLine
+ , hPutStr
+ , hPutStrLn
+ -- * Special cases for standard input and output
+ , interact
+ , getContents
+ , getLine
+ , putStr
+ , putStrLn
+ ) where
+
+import Data.Text (Text)
+import Prelude hiding (appendFile, getContents, getLine, interact,
+ putStr, putStrLn, readFile, writeFile)
+import System.IO (Handle, IOMode(..), hPutChar, openFile, stdin, stdout,
+ withFile)
+import qualified Control.Exception as E
+import Control.Monad (liftM2, when)
+import Data.IORef (readIORef, writeIORef)
+import qualified Data.Text as T
+import Data.Text.Internal.Fusion (stream)
+import Data.Text.Internal.Fusion.Types (Step(..), Stream(..))
+import Data.Text.Internal.IO (hGetLineWith, readChunk)
+import GHC.IO.Buffer (Buffer(..), BufferState(..), CharBufElem, CharBuffer,
+ RawCharBuffer, emptyBuffer, isEmptyBuffer, newCharBuffer,
+ writeCharBuf)
+import GHC.IO.Exception (IOException(ioe_type), IOErrorType(InappropriateType))
+import GHC.IO.Handle.Internals (augmentIOError, hClose_help, wantReadableHandle,
+ wantWritableHandle)
+import GHC.IO.Handle.Text (commitBuffer')
+import GHC.IO.Handle.Types (BufferList(..), BufferMode(..), Handle__(..),
+ HandleType(..), Newline(..))
+import System.IO (hGetBuffering, hFileSize, hSetBuffering, hTell)
+import System.IO.Error (isEOFError)
+
+-- $performance
+-- #performance#
+--
+-- The functions in this module obey the runtime system's locale,
+-- character set encoding, and line ending conversion settings.
+--
+-- If you know in advance that you will be working with data that has
+-- a specific encoding (e.g. UTF-8), and your application is highly
+-- performance sensitive, you may find that it is faster to perform
+-- I\/O with bytestrings and to encode and decode yourself than to use
+-- the functions in this module.
+--
+-- Whether this will hold depends on the version of GHC you are using,
+-- the platform you are working on, the data you are working with, and
+-- the encodings you are using, so be sure to test for yourself.
+
+-- | The 'readFile' function reads a file and returns the contents of
+-- the file as a string. The entire file is read strictly, as with
+-- 'getContents'.
+readFile :: FilePath -> IO Text
+readFile name = openFile name ReadMode >>= hGetContents
+
+-- | Write a string to a file. The file is truncated to zero length
+-- before writing begins.
+writeFile :: FilePath -> Text -> IO ()
+writeFile p = withFile p WriteMode . flip hPutStr
+
+-- | Write a string to the end of a file.
+appendFile :: FilePath -> Text -> IO ()
+appendFile p = withFile p AppendMode . flip hPutStr
+
+catchError :: String -> Handle -> Handle__ -> IOError -> IO Text
+catchError caller h Handle__{..} err
+ | isEOFError err = do
+ buf <- readIORef haCharBuffer
+ return $ if isEmptyBuffer buf
+ then T.empty
+ else T.singleton '\r'
+ | otherwise = E.throwIO (augmentIOError err caller h)
+
+-- | /Experimental./ Read a single chunk of strict text from a
+-- 'Handle'. The size of the chunk depends on the amount of input
+-- currently buffered.
+--
+-- This function blocks only if there is no data available, and EOF
+-- has not yet been reached. Once EOF is reached, this function
+-- returns an empty string instead of throwing an exception.
+hGetChunk :: Handle -> IO Text
+hGetChunk h = wantReadableHandle "hGetChunk" h readSingleChunk
+ where
+ readSingleChunk hh@Handle__{..} = do
+ buf <- readIORef haCharBuffer
+ t <- readChunk hh buf `E.catch` catchError "hGetChunk" h hh
+ return (hh, t)
+
+-- | Read the remaining contents of a 'Handle' as a string. The
+-- 'Handle' is closed once the contents have been read, or if an
+-- exception is thrown.
+--
+-- Internally, this function reads a chunk at a time from the
+-- lower-level buffering abstraction, and concatenates the chunks into
+-- a single string once the entire file has been read.
+--
+-- As a result, it requires approximately twice as much memory as its
+-- result to construct its result. For files more than a half of
+-- available RAM in size, this may result in memory exhaustion.
+hGetContents :: Handle -> IO Text
+hGetContents h = do
+ chooseGoodBuffering h
+ wantReadableHandle "hGetContents" h readAll
+ where
+ readAll hh@Handle__{..} = do
+ let readChunks = do
+ buf <- readIORef haCharBuffer
+ t <- readChunk hh buf `E.catch` catchError "hGetContents" h hh
+ if T.null t
+ then return [t]
+ else (t:) `fmap` readChunks
+ ts <- readChunks
+ (hh', _) <- hClose_help hh
+ return (hh'{haType=ClosedHandle}, T.concat ts)
+
+-- | Use a more efficient buffer size if we're reading in
+-- block-buffered mode with the default buffer size. When we can
+-- determine the size of the handle we're reading, set the buffer size
+-- to that, so that we can read the entire file in one chunk.
+-- Otherwise, use a buffer size of at least 16KB.
+chooseGoodBuffering :: Handle -> IO ()
+chooseGoodBuffering h = do
+ bufMode <- hGetBuffering h
+ case bufMode of
+ BlockBuffering Nothing -> do
+ d <- E.catch (liftM2 (-) (hFileSize h) (hTell h)) $ \(e::IOException) ->
+ if ioe_type e == InappropriateType
+ then return 16384 -- faster than the 2KB default
+ else E.throwIO e
+ when (d > 0) . hSetBuffering h . BlockBuffering . Just . fromIntegral $ d
+ _ -> return ()
+
+-- | Read a single line from a handle.
+hGetLine :: Handle -> IO Text
+hGetLine = hGetLineWith T.concat
+
+-- | Write a string to a handle.
+hPutStr :: Handle -> Text -> IO ()
+-- This function is lifted almost verbatim from GHC.IO.Handle.Text.
+hPutStr h t = do
+ (buffer_mode, nl) <-
+ wantWritableHandle "hPutStr" h $ \h_ -> do
+ bmode <- getSpareBuffer h_
+ return (bmode, haOutputNL h_)
+ let str = stream t
+ case buffer_mode of
+ (NoBuffering, _) -> hPutChars h str
+ (LineBuffering, buf) -> writeLines h nl buf str
+ (BlockBuffering _, buf)
+ | nl == CRLF -> writeBlocksCRLF h buf str
+ | otherwise -> writeBlocksRaw h buf str
+
+hPutChars :: Handle -> Stream Char -> IO ()
+hPutChars h (Stream next0 s0 _len) = loop s0
+ where
+ loop !s = case next0 s of
+ Done -> return ()
+ Skip s' -> loop s'
+ Yield x s' -> hPutChar h x >> loop s'
+
+-- The following functions are largely lifted from GHC.IO.Handle.Text,
+-- but adapted to a coinductive stream of data instead of an inductive
+-- list.
+--
+-- We have several variations of more or less the same code for
+-- performance reasons. Splitting the original buffered write
+-- function into line- and block-oriented versions gave us a 2.1x
+-- performance improvement. Lifting out the raw/cooked newline
+-- handling gave a few more percent on top.
+
+writeLines :: Handle -> Newline -> Buffer CharBufElem -> Stream Char -> IO ()
+writeLines h nl buf0 (Stream next0 s0 _len) = outer s0 buf0
+ where
+ outer s1 Buffer{bufRaw=raw, bufSize=len} = inner s1 (0::Int)
+ where
+ inner !s !n =
+ case next0 s of
+ Done -> commit n False{-no flush-} True{-release-} >> return ()
+ Skip s' -> inner s' n
+ Yield x s'
+ | n + 1 >= len -> commit n True{-needs flush-} False >>= outer s
+ | x == '\n' -> do
+ n' <- if nl == CRLF
+ then do n1 <- writeCharBuf raw n '\r'
+ writeCharBuf raw n1 '\n'
+ else writeCharBuf raw n x
+ commit n' True{-needs flush-} False >>= outer s'
+ | otherwise -> writeCharBuf raw n x >>= inner s'
+ commit = commitBuffer h raw len
+
+writeBlocksCRLF :: Handle -> Buffer CharBufElem -> Stream Char -> IO ()
+writeBlocksCRLF h buf0 (Stream next0 s0 _len) = outer s0 buf0
+ where
+ outer s1 Buffer{bufRaw=raw, bufSize=len} = inner s1 (0::Int)
+ where
+ inner !s !n =
+ case next0 s of
+ Done -> commit n False{-no flush-} True{-release-} >> return ()
+ Skip s' -> inner s' n
+ Yield x s'
+ | n + 1 >= len -> commit n True{-needs flush-} False >>= outer s
+ | x == '\n' -> do n1 <- writeCharBuf raw n '\r'
+ writeCharBuf raw n1 '\n' >>= inner s'
+ | otherwise -> writeCharBuf raw n x >>= inner s'
+ commit = commitBuffer h raw len
+
+writeBlocksRaw :: Handle -> Buffer CharBufElem -> Stream Char -> IO ()
+writeBlocksRaw h buf0 (Stream next0 s0 _len) = outer s0 buf0
+ where
+ outer s1 Buffer{bufRaw=raw, bufSize=len} = inner s1 (0::Int)
+ where
+ inner !s !n =
+ case next0 s of
+ Done -> commit n False{-no flush-} True{-release-} >> return ()
+ Skip s' -> inner s' n
+ Yield x s'
+ | n + 1 >= len -> commit n True{-needs flush-} False >>= outer s
+ | otherwise -> writeCharBuf raw n x >>= inner s'
+ commit = commitBuffer h raw len
+
+-- This function is completely lifted from GHC.IO.Handle.Text.
+getSpareBuffer :: Handle__ -> IO (BufferMode, CharBuffer)
+getSpareBuffer Handle__{haCharBuffer=ref,
+ haBuffers=spare_ref,
+ haBufferMode=mode}
+ = do
+ case mode of
+ NoBuffering -> return (mode, error "no buffer!")
+ _ -> do
+ bufs <- readIORef spare_ref
+ buf <- readIORef ref
+ case bufs of
+ BufferListCons b rest -> do
+ writeIORef spare_ref rest
+ return ( mode, emptyBuffer b (bufSize buf) WriteBuffer)
+ BufferListNil -> do
+ new_buf <- newCharBuffer (bufSize buf) WriteBuffer
+ return (mode, new_buf)
+
+
+-- This function is completely lifted from GHC.IO.Handle.Text.
+commitBuffer :: Handle -> RawCharBuffer -> Int -> Int -> Bool -> Bool
+ -> IO CharBuffer
+commitBuffer hdl !raw !sz !count flush release =
+ wantWritableHandle "commitAndReleaseBuffer" hdl $
+ commitBuffer' raw sz count flush release
+{-# INLINE commitBuffer #-}
+
+-- | Write a string to a handle, followed by a newline.
+hPutStrLn :: Handle -> Text -> IO ()
+hPutStrLn h t = hPutStr h t >> hPutChar h '\n'
+
+-- | The 'interact' function takes a function of type @Text -> Text@
+-- as its argument. The entire input from the standard input device is
+-- passed to this function as its argument, and the resulting string
+-- is output on the standard output device.
+interact :: (Text -> Text) -> IO ()
+interact f = putStr . f =<< getContents
+
+-- | Read all user input on 'stdin' as a single string.
+getContents :: IO Text
+getContents = hGetContents stdin
+
+-- | Read a single line of user input from 'stdin'.
+getLine :: IO Text
+getLine = hGetLine stdin
+
+-- | Write a string to 'stdout'.
+putStr :: Text -> IO ()
+putStr = hPutStr stdout
+
+-- | Write a string to 'stdout', followed by a newline.
+putStrLn :: Text -> IO ()
+putStrLn = hPutStrLn stdout
+
+-- $locale
+--
+-- /Note/: The behaviour of functions in this module depends on the
+-- version of GHC you are using.
+--
+-- Beginning with GHC 6.12, text I\/O is performed using the system or
+-- handle's current locale and line ending conventions.
+--
+-- Under GHC 6.10 and earlier, the system I\/O libraries do not
+-- support locale-sensitive I\/O or line ending conversion. On these
+-- versions of GHC, functions in this library all use UTF-8. What
+-- does this mean in practice?
+--
+-- * All data that is read will be decoded as UTF-8.
+--
+-- * Before data is written, it is first encoded as UTF-8.
+--
+-- * On both reading and writing, the platform's native newline
+-- conversion is performed.
+--
+-- If you must use a non-UTF-8 locale on an older version of GHC, you
+-- will have to perform the transcoding yourself, e.g. as follows:
+--
+-- > import qualified Data.ByteString as B
+-- > import Data.Text (Text)
+-- > import Data.Text.Encoding (encodeUtf16LE)
+-- >
+-- > putStr_Utf16LE :: Text -> IO ()
+-- > putStr_Utf16LE t = B.putStr (encodeUtf16LE t)
+--
+-- On transcoding errors, an 'IOError' exception is thrown. You can
+-- use the API in "Data.Text.Encoding" if you need more control over
+-- error handling or transcoding.
diff --git a/Data/Text/Internal.hs b/Data/Text/Internal.hs
new file mode 100644
index 0000000..004ac04
--- /dev/null
+++ b/Data/Text/Internal.hs
@@ -0,0 +1,186 @@
+{-# LANGUAGE CPP, DeriveDataTypeable, UnboxedTuples #-}
+{-# OPTIONS_HADDOCK not-home #-}
+
+-- |
+-- Module : Data.Text.Internal
+-- Copyright : (c) 2008, 2009 Tom Harper,
+-- (c) 2009, 2010 Bryan O'Sullivan,
+-- (c) 2009 Duncan Coutts
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- A module containing private 'Text' internals. This exposes the
+-- 'Text' representation and low level construction functions.
+-- Modules which extend the 'Text' system may need to use this module.
+--
+-- You should not use this module unless you are determined to monkey
+-- with the internals, as the functions here do just about nothing to
+-- preserve data invariants. You have been warned!
+
+#if defined(__GLASGOW_HASKELL__) && !defined(__HADDOCK__)
+#include "MachDeps.h"
+#endif
+
+module Data.Text.Internal
+ (
+ -- * Types
+ -- $internals
+ Text(..)
+ -- * Construction
+ , text
+ , textP
+ -- * Safety
+ , safe
+ -- * Code that must be here for accessibility
+ , empty
+ , empty_
+ -- * Utilities
+ , firstf
+ -- * Checked multiplication
+ , mul
+ , mul32
+ , mul64
+ -- * Debugging
+ , showText
+ ) where
+
+#if defined(ASSERTS)
+import Control.Exception (assert)
+#endif
+import Data.Bits
+import Data.Int (Int32, Int64)
+import Data.Text.Internal.Unsafe.Char (ord)
+import Data.Typeable (Typeable)
+import qualified Data.Text.Array as A
+
+-- | A space efficient, packed, unboxed Unicode text type.
+data Text = Text
+ {-# UNPACK #-} !A.Array -- payload (Word8 elements)
+ {-# UNPACK #-} !Int -- offset (units of Word8, not Char)
+ {-# UNPACK #-} !Int -- length (units of Word8, not Char)
+ deriving (Typeable)
+
+-- | Smart constructor.
+text_ :: A.Array -> Int -> Int -> Text
+text_ arr off len =
+#if defined(ASSERTS)
+ let alen = A.length arr
+ in assert (len >= 0) .
+ assert (off >= 0) .
+ assert (alen == 0 || len == 0 || off < alen) $
+#endif
+ Text arr off len
+{-# INLINE text_ #-}
+
+-- | /O(1)/ The empty 'Text'.
+empty :: Text
+empty = Text A.empty 0 0
+{-# INLINE [1] empty #-}
+
+-- | A non-inlined version of 'empty'.
+empty_ :: Text
+empty_ = Text A.empty 0 0
+{-# NOINLINE empty_ #-}
+
+-- | Construct a 'Text' without invisibly pinning its byte array in
+-- memory if its length has dwindled to zero.
+text :: A.Array -> Int -> Int -> Text
+text arr off len | len == 0 = empty
+ | otherwise = text_ arr off len
+{-# INLINE text #-}
+
+textP :: A.Array -> Int -> Int -> Text
+{-# DEPRECATED textP "Use text instead" #-}
+textP = text
+
+-- | A useful 'show'-like function for debugging purposes.
+showText :: Text -> String
+showText (Text arr off len) =
+ "Text " ++ show (A.toList arr off len) ++ ' ' :
+ show off ++ ' ' : show len
+
+-- | Map a 'Char' to a 'Text'-safe value.
+--
+-- UTF-16 surrogate code points are not included in the set of Unicode
+-- scalar values, but are unfortunately admitted as valid 'Char'
+-- values by Haskell. They cannot be represented in a 'Text'. This
+-- function remaps those code points to the Unicode replacement
+-- character (U+FFFD, \'&#xfffd;\'), and leaves other code points
+-- unchanged.
+safe :: Char -> Char
+safe c
+ | ord c .&. 0x1ff800 /= 0xd800 = c
+ | otherwise = '\xfffd'
+{-# INLINE [0] safe #-}
+
+-- | Apply a function to the first element of an optional pair.
+firstf :: (a -> c) -> Maybe (a,b) -> Maybe (c,b)
+firstf f (Just (a, b)) = Just (f a, b)
+firstf _ Nothing = Nothing
+
+-- | Checked multiplication. Calls 'error' if the result would
+-- overflow.
+mul :: Int -> Int -> Int
+#if WORD_SIZE_IN_BITS == 64
+mul a b = fromIntegral $ fromIntegral a `mul64` fromIntegral b
+#else
+mul a b = fromIntegral $ fromIntegral a `mul32` fromIntegral b
+#endif
+{-# INLINE mul #-}
+infixl 7 `mul`
+
+-- | Checked multiplication. Calls 'error' if the result would
+-- overflow.
+mul64 :: Int64 -> Int64 -> Int64
+mul64 a b
+ | a >= 0 && b >= 0 = mul64_ a b
+ | a >= 0 = -mul64_ a (-b)
+ | b >= 0 = -mul64_ (-a) b
+ | otherwise = mul64_ (-a) (-b)
+{-# INLINE mul64 #-}
+infixl 7 `mul64`
+
+mul64_ :: Int64 -> Int64 -> Int64
+mul64_ a b
+ | ahi > 0 && bhi > 0 = error "overflow"
+ | top > 0x7fffffff = error "overflow"
+ | total < 0 = error "overflow"
+ | otherwise = total
+ where (# ahi, alo #) = (# a `shiftR` 32, a .&. 0xffffffff #)
+ (# bhi, blo #) = (# b `shiftR` 32, b .&. 0xffffffff #)
+ top = ahi * blo + alo * bhi
+ total = (top `shiftL` 32) + alo * blo
+{-# INLINE mul64_ #-}
+
+-- | Checked multiplication. Calls 'error' if the result would
+-- overflow.
+mul32 :: Int32 -> Int32 -> Int32
+mul32 a b = case fromIntegral a * fromIntegral b of
+ ab | ab < min32 || ab > max32 -> error "overflow"
+ | otherwise -> fromIntegral ab
+ where min32 = -0x80000000 :: Int64
+ max32 = 0x7fffffff
+{-# INLINE mul32 #-}
+infixl 7 `mul32`
+
+-- $internals
+--
+-- Internally, the 'Text' type is represented as an array of 'Word8'
+-- UTF-8 code units. The offset and length fields in the constructor
+-- are in these units, /not/ units of 'Char'.
+--
+-- Invariants that all functions must maintain:
+--
+-- * Since the 'Text' type uses UTF-8 internally, it cannot represent
+-- characters in the reserved surrogate code point range U+D800 to
+-- U+DFFF. To maintain this invariant, the 'safe' function maps
+-- 'Char' values in this range to the replacement character (U+FFFD,
+-- \'&#xfffd;\').
+--
+-- * The byte sequence in the array must be well-formed UTF-8: a
+-- leading byte that begins a multi-byte sequence must be followed
+-- by the correct number of continuation bytes (0x80-0xBF), and a
+-- continuation byte must never appear outside such a sequence.
diff --git a/Data/Text/Internal/Builder.hs b/Data/Text/Internal/Builder.hs
new file mode 100644
index 0000000..81c23f4
--- /dev/null
+++ b/Data/Text/Internal/Builder.hs
@@ -0,0 +1,329 @@
+{-# LANGUAGE BangPatterns, CPP, Rank2Types #-}
+{-# OPTIONS_HADDOCK not-home #-}
+
+-----------------------------------------------------------------------------
+-- |
+-- Module : Data.Text.Internal.Builder
+-- Copyright : (c) 2013 Bryan O'Sullivan
+-- (c) 2010 Johan Tibell
+-- License : BSD-style (see LICENSE)
+--
+-- Maintainer : Johan Tibell <johan.tibell@gmail.com>
+-- Stability : experimental
+-- Portability : portable to Hugs and GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Efficient construction of lazy @Text@ values. The principal
+-- operations on a @Builder@ are @singleton@, @fromText@, and
+-- @fromLazyText@, which construct new builders, and 'mappend', which
+-- concatenates two builders.
+--
+-- To get maximum performance when building lazy @Text@ values using a
+-- builder, associate @mappend@ calls to the right. For example,
+-- prefer
+--
+-- > singleton 'a' `mappend` (singleton 'b' `mappend` singleton 'c')
+--
+-- to
+--
+-- > singleton 'a' `mappend` singleton 'b' `mappend` singleton 'c'
+--
+-- as the latter associates @mappend@ to the left.
+--
+-----------------------------------------------------------------------------
+
+module Data.Text.Internal.Builder
+ ( -- * Public API
+ -- ** The Builder type
+ Builder
+ , toLazyText
+ , toLazyTextWith
+
+ -- ** Constructing Builders
+ , singleton
+ , fromText
+ , fromLazyText
+ , fromString
+
+ -- ** Flushing the buffer state
+ , flush
+
+ -- * Internal functions
+ , append'
+ , ensureFree
+ , writeN
+ ) where
+
+import Control.Monad.ST (ST, runST)
+import Data.Monoid (Monoid(..))
+#if MIN_VERSION_base(4,9,0)
+import Data.Semigroup (Semigroup(..))
+#endif
+import Data.Text.Internal (Text(..))
+import Data.Text.Internal.Lazy (smallChunkSize)
+import Data.Text.Unsafe (inlineInterleaveST)
+import Data.Text.Internal.Unsafe.Char (unsafeWrite)
+import Prelude hiding (map, putChar)
+
+import qualified Data.String as String
+import qualified Data.Text as S
+import qualified Data.Text.Array as A
+import qualified Data.Text.Lazy as L
+
+------------------------------------------------------------------------
+
+-- | A @Builder@ is an efficient way to build lazy @Text@ values.
+-- There are several functions for constructing builders, but only one
+-- to inspect them: to extract any data, you have to turn them into
+-- lazy @Text@ values using @toLazyText@.
+--
+-- Internally, a builder constructs a lazy @Text@ by filling arrays
+-- piece by piece. As each buffer is filled, it is \'popped\' off, to
+-- become a new chunk of the resulting lazy @Text@. All this is
+-- hidden from the user of the @Builder@.
+newtype Builder = Builder {
+ -- Invariant (from Data.Text.Lazy):
+ -- The lists include no null Texts.
+ runBuilder :: forall s. (Buffer s -> ST s [S.Text])
+ -> Buffer s
+ -> ST s [S.Text]
+ }
+
+#if MIN_VERSION_base(4,9,0)
+instance Semigroup Builder where
+ (<>) = append
+ {-# INLINE (<>) #-}
+#endif
+
+instance Monoid Builder where
+ mempty = empty
+ {-# INLINE mempty #-}
+#if MIN_VERSION_base(4,9,0)
+ mappend = (<>) -- future-proof definition
+#else
+ mappend = append
+#endif
+ {-# INLINE mappend #-}
+ mconcat = foldr mappend Data.Monoid.mempty
+ {-# INLINE mconcat #-}
+
+instance String.IsString Builder where
+ fromString = fromString
+ {-# INLINE fromString #-}
+
+instance Show Builder where
+ show = show . toLazyText
+
+instance Eq Builder where
+ a == b = toLazyText a == toLazyText b
+
+instance Ord Builder where
+ a <= b = toLazyText a <= toLazyText b
+
+------------------------------------------------------------------------
+
+-- | /O(1)./ The empty @Builder@, satisfying
+--
+-- * @'toLazyText' 'empty' = 'L.empty'@
+--
+empty :: Builder
+empty = Builder (\ k buf -> k buf)
+{-# INLINE empty #-}
+
+-- | /O(1)./ A @Builder@ taking a single character, satisfying
+--
+-- * @'toLazyText' ('singleton' c) = 'L.singleton' c@
+--
+singleton :: Char -> Builder
+singleton c = writeAtMost 4 $ \ marr o -> unsafeWrite marr o c
+{-# INLINE singleton #-}
+
+------------------------------------------------------------------------
+
+-- | /O(1)./ The concatenation of two builders, an associative
+-- operation with identity 'empty', satisfying
+--
+-- * @'toLazyText' ('append' x y) = 'L.append' ('toLazyText' x) ('toLazyText' y)@
+--
+append :: Builder -> Builder -> Builder
+append (Builder f) (Builder g) = Builder (f . g)
+{-# INLINE [0] append #-}
+
+-- TODO: Experiment to find the right threshold.
+copyLimit :: Int
+copyLimit = 128
+
+-- This function attempts to merge small @Text@ values instead of
+-- treating each value as its own chunk. We may not always want this.
+
+-- | /O(1)./ A @Builder@ taking a 'S.Text', satisfying
+--
+-- * @'toLazyText' ('fromText' t) = 'L.fromChunks' [t]@
+--
+fromText :: S.Text -> Builder
+fromText t@(Text arr off l)
+ | S.null t = empty
+ | l <= copyLimit = writeN l $ \marr o -> A.copyI marr o arr off (l+o)
+ | otherwise = flush `append` mapBuilder (t :)
+{-# INLINE [1] fromText #-}
+
+{-# RULES
+"fromText/pack" forall s .
+ fromText (S.pack s) = fromString s
+ #-}
+
+-- | /O(1)./ A Builder taking a @String@, satisfying
+--
+-- * @'toLazyText' ('fromString' s) = 'L.fromChunks' [S.pack s]@
+--
+fromString :: String -> Builder
+fromString str = Builder $ \k (Buffer p0 o0 u0 l0) ->
+ let loop !marr !o !u !l [] = k (Buffer marr o u l)
+ loop marr o u l s@(c:cs)
+ | l <= 1 = do
+ arr <- A.unsafeFreeze marr
+ let !t = Text arr o u
+ marr' <- A.new chunkSize
+ ts <- inlineInterleaveST (loop marr' 0 0 chunkSize s)
+ return $ t : ts
+ | otherwise = do
+ n <- unsafeWrite marr (o+u) c
+ loop marr o (u+n) (l-n) cs
+ in loop p0 o0 u0 l0 str
+ where
+ chunkSize = smallChunkSize
+{-# INLINE fromString #-}
+
+-- | /O(1)./ A @Builder@ taking a lazy @Text@, satisfying
+--
+-- * @'toLazyText' ('fromLazyText' t) = t@
+--
+fromLazyText :: L.Text -> Builder
+fromLazyText ts = flush `append` mapBuilder (L.toChunks ts ++)
+{-# INLINE fromLazyText #-}
+
+------------------------------------------------------------------------
+
+-- Our internal buffer type
+data Buffer s = Buffer {-# UNPACK #-} !(A.MArray s)
+ {-# UNPACK #-} !Int -- offset
+ {-# UNPACK #-} !Int -- used units
+ {-# UNPACK #-} !Int -- length left
+
+------------------------------------------------------------------------
+
+-- | /O(n)./ Extract a lazy @Text@ from a @Builder@ with a default
+-- buffer size. The construction work takes place if and when the
+-- relevant part of the lazy @Text@ is demanded.
+toLazyText :: Builder -> L.Text
+toLazyText = toLazyTextWith smallChunkSize
+
+-- | /O(n)./ Extract a lazy @Text@ from a @Builder@, using the given
+-- size for the initial buffer. The construction work takes place if
+-- and when the relevant part of the lazy @Text@ is demanded.
+--
+-- If the initial buffer is too small to hold all data, subsequent
+-- buffers will be the default buffer size.
+toLazyTextWith :: Int -> Builder -> L.Text
+toLazyTextWith chunkSize m = L.fromChunks (runST $
+ newBuffer chunkSize >>= runBuilder (m `append` flush) (const (return [])))
+
+-- | /O(1)./ Pop the strict @Text@ we have constructed so far, if any,
+-- yielding a new chunk in the result lazy @Text@.
+flush :: Builder
+flush = Builder $ \ k buf@(Buffer p o u l) ->
+ if u == 0
+ then k buf
+ else do arr <- A.unsafeFreeze p
+ let !b = Buffer p (o+u) 0 l
+ !t = Text arr o u
+ ts <- inlineInterleaveST (k b)
+ return $! t : ts
+{-# INLINE [1] flush #-}
+-- defer inlining so that flush/flush rule may fire.
+
+------------------------------------------------------------------------
+
+-- | Sequence an ST operation on the buffer
+withBuffer :: (forall s. Buffer s -> ST s (Buffer s)) -> Builder
+withBuffer f = Builder $ \k buf -> f buf >>= k
+{-# INLINE withBuffer #-}
+
+-- | Get the size of the buffer
+withSize :: (Int -> Builder) -> Builder
+withSize f = Builder $ \ k buf@(Buffer _ _ _ l) ->
+ runBuilder (f l) k buf
+{-# INLINE withSize #-}
+
+-- | Map the resulting list of texts.
+mapBuilder :: ([S.Text] -> [S.Text]) -> Builder
+mapBuilder f = Builder (fmap f .)
+
+------------------------------------------------------------------------
+
+-- | Ensure that there are at least @n@ many elements available.
+ensureFree :: Int -> Builder
+ensureFree !n = withSize $ \ l ->
+ if n <= l
+ then empty
+ else flush `append'` withBuffer (const (newBuffer (max n smallChunkSize)))
+{-# INLINE [0] ensureFree #-}
+
+writeAtMost :: Int -> (forall s. A.MArray s -> Int -> ST s Int) -> Builder
+writeAtMost n f = ensureFree n `append'` withBuffer (writeBuffer f)
+{-# INLINE [0] writeAtMost #-}
+
+-- | Ensure that @n@ many elements are available, and then use @f@ to
+-- write some elements into the memory.
+writeN :: Int -> (forall s. A.MArray s -> Int -> ST s ()) -> Builder
+writeN n f = writeAtMost n (\ p o -> f p o >> return n)
+{-# INLINE writeN #-}
+
+writeBuffer :: (A.MArray s -> Int -> ST s Int) -> Buffer s -> ST s (Buffer s)
+writeBuffer f (Buffer p o u l) = do
+ n <- f p (o+u)
+ return $! Buffer p o (u+n) (l-n)
+{-# INLINE writeBuffer #-}
+
+newBuffer :: Int -> ST s (Buffer s)
+newBuffer size = do
+ arr <- A.new size
+ return $! Buffer arr 0 0 size
+{-# INLINE newBuffer #-}
+
+------------------------------------------------------------------------
+-- Some nice rules for Builder
+
+-- This function makes GHC understand that 'writeN' and 'ensureFree'
+-- are *not* recursive in the presence of the rewrite rules below.
+-- This is not needed with GHC 7+.
+append' :: Builder -> Builder -> Builder
+append' (Builder f) (Builder g) = Builder (f . g)
+{-# INLINE append' #-}
+
+{-# RULES
+
+"append/writeAtMost" forall a b (f::forall s. A.MArray s -> Int -> ST s Int)
+ (g::forall s. A.MArray s -> Int -> ST s Int) ws.
+ append (writeAtMost a f) (append (writeAtMost b g) ws) =
+ append (writeAtMost (a+b) (\marr o -> f marr o >>= \ n ->
+ g marr (o+n) >>= \ m ->
+ let s = n+m in s `seq` return s)) ws
+
+"writeAtMost/writeAtMost" forall a b (f::forall s. A.MArray s -> Int -> ST s Int)
+ (g::forall s. A.MArray s -> Int -> ST s Int).
+ append (writeAtMost a f) (writeAtMost b g) =
+ writeAtMost (a+b) (\marr o -> f marr o >>= \ n ->
+ g marr (o+n) >>= \ m ->
+ let s = n+m in s `seq` return s)
+
+"ensureFree/ensureFree" forall a b .
+ append (ensureFree a) (ensureFree b) = ensureFree (max a b)
+
+"flush/flush"
+ append flush flush = flush
+
+ #-}
diff --git a/Data/Text/Internal/Builder/Functions.hs b/Data/Text/Internal/Builder/Functions.hs
new file mode 100644
index 0000000..25f1db0
--- /dev/null
+++ b/Data/Text/Internal/Builder/Functions.hs
@@ -0,0 +1,40 @@
+{-# LANGUAGE MagicHash #-}
+
+-- |
+-- Module : Data.Text.Internal.Builder.Functions
+-- Copyright : (c) 2011 MailRank, Inc.
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Useful functions and combinators.
+
+module Data.Text.Internal.Builder.Functions
+ (
+ (<>)
+ , i2d
+ ) where
+
+import Data.Monoid (mappend)
+import Data.Text.Lazy.Builder (Builder)
+import GHC.Base (chr#,ord#,(+#),Int(I#),Char(C#))
+import Prelude ()
+
+-- | Unsafe conversion for decimal digits.
+{-# INLINE i2d #-}
+i2d :: Int -> Char
+i2d (I# i#) = C# (chr# (ord# '0'# +# i#))
+
+-- | The normal 'mappend' function with right associativity instead of
+-- left.
+(<>) :: Builder -> Builder -> Builder
+(<>) = mappend
+{-# INLINE (<>) #-}
+
+infixr 4 <>
diff --git a/Data/Text/Internal/Builder/Int/Digits.hs b/Data/Text/Internal/Builder/Int/Digits.hs
new file mode 100644
index 0000000..e1b3e49
--- /dev/null
+++ b/Data/Text/Internal/Builder/Int/Digits.hs
@@ -0,0 +1,26 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+-- Module: Data.Text.Internal.Builder.Int.Digits
+-- Copyright: (c) 2013 Bryan O'Sullivan
+-- License: BSD-style
+-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
+-- Stability: experimental
+-- Portability: portable
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- This module exists because the C preprocessor does things that we
+-- shall not speak of when confronted with Haskell multiline strings.
+
+module Data.Text.Internal.Builder.Int.Digits (digits) where
+
+import Data.ByteString.Char8 (ByteString)
+
+digits :: ByteString
+digits = "0001020304050607080910111213141516171819\
+ \2021222324252627282930313233343536373839\
+ \4041424344454647484950515253545556575859\
+ \6061626364656667686970717273747576777879\
+ \8081828384858687888990919293949596979899"
diff --git a/Data/Text/Internal/Builder/RealFloat/Functions.hs b/Data/Text/Internal/Builder/RealFloat/Functions.hs
new file mode 100644
index 0000000..8a76ff0
--- /dev/null
+++ b/Data/Text/Internal/Builder/RealFloat/Functions.hs
@@ -0,0 +1,57 @@
+{-# LANGUAGE CPP #-}
+
+-- |
+-- Module: Data.Text.Internal.Builder.RealFloat.Functions
+-- Copyright: (c) The University of Glasgow 1994-2002
+-- License: see libraries/base/LICENSE
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+
+module Data.Text.Internal.Builder.RealFloat.Functions
+ (
+ roundTo
+ ) where
+
+roundTo :: Int -> [Int] -> (Int,[Int])
+
+#if MIN_VERSION_base(4,6,0)
+
+roundTo d is =
+ case f d True is of
+ x@(0,_) -> x
+ (1,xs) -> (1, 1:xs)
+ _ -> error "roundTo: bad Value"
+ where
+ b2 = base `quot` 2
+
+ f n _ [] = (0, replicate n 0)
+ f 0 e (x:xs) | x == b2 && e && all (== 0) xs = (0, []) -- Round to even when at exactly half the base
+ | otherwise = (if x >= b2 then 1 else 0, [])
+ f n _ (i:xs)
+ | i' == base = (1,0:ds)
+ | otherwise = (0,i':ds)
+ where
+ (c,ds) = f (n-1) (even i) xs
+ i' = c + i
+ base = 10
+
+#else
+
+roundTo d is =
+ case f d is of
+ x@(0,_) -> x
+ (1,xs) -> (1, 1:xs)
+ _ -> error "roundTo: bad Value"
+ where
+ f n [] = (0, replicate n 0)
+ f 0 (x:_) = (if x >= 5 then 1 else 0, [])
+ f n (i:xs)
+ | i' == 10 = (1,0:ds)
+ | otherwise = (0,i':ds)
+ where
+ (c,ds) = f (n-1) xs
+ i' = c + i
+
+#endif
diff --git a/Data/Text/Internal/Encoding/Fusion.hs b/Data/Text/Internal/Encoding/Fusion.hs
new file mode 100644
index 0000000..8c6aa94
--- /dev/null
+++ b/Data/Text/Internal/Encoding/Fusion.hs
@@ -0,0 +1,209 @@
+{-# LANGUAGE BangPatterns, CPP, Rank2Types #-}
+
+-- |
+-- Module : Data.Text.Internal.Encoding.Fusion
+-- Copyright : (c) Tom Harper 2008-2009,
+-- (c) Bryan O'Sullivan 2009,
+-- (c) Duncan Coutts 2009
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Fusible 'Stream'-oriented functions for converting between 'Text'
+-- and several common encodings.
+
+module Data.Text.Internal.Encoding.Fusion
+ (
+ -- * Streaming
+ streamASCII
+ , streamUtf8
+ , streamUtf16LE
+ , streamUtf16BE
+ , streamUtf32LE
+ , streamUtf32BE
+
+ -- * Unstreaming
+ , unstream
+
+ , module Data.Text.Internal.Encoding.Fusion.Common
+ ) where
+
+#if defined(ASSERTS)
+import Control.Exception (assert)
+#endif
+import Data.ByteString.Internal (ByteString(..), mallocByteString, memcpy)
+import Data.Text.Internal.Fusion (Step(..), Stream(..))
+import Data.Text.Internal.Fusion.Size
+import Data.Text.Encoding.Error
+import Data.Text.Internal (safe)
+import Data.Text.Internal.Encoding.Fusion.Common
+import Data.Text.Internal.Unsafe.Char (unsafeChr, unsafeChr8, unsafeChr32)
+import Data.Text.Internal.Unsafe.Shift (shiftL, shiftR)
+import Data.Word (Word8, Word16, Word32)
+import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
+import Foreign.Storable (pokeByteOff)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Unsafe as B
+import qualified Data.Text.Internal.Encoding.Utf8 as U8
+import qualified Data.Text.Internal.Encoding.Utf16 as U16
+import qualified Data.Text.Internal.Encoding.Utf32 as U32
+import Data.Text.Unsafe (unsafeDupablePerformIO)
+
+streamASCII :: ByteString -> Stream Char
+streamASCII bs = Stream next 0 (maxSize l)
+ where
+ l = B.length bs
+ {-# INLINE next #-}
+ next i
+ | i >= l = Done
+ | otherwise = Yield (unsafeChr8 x1) (i+1)
+ where
+ x1 = B.unsafeIndex bs i
+{-# DEPRECATED streamASCII "Do not use this function" #-}
+{-# INLINE [0] streamASCII #-}
+
+-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using UTF-8
+-- encoding.
+streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
+streamUtf8 onErr bs = Stream next 0 (maxSize l)
+ where
+ l = B.length bs
+ next i
+ | i >= l = Done
+ | U8.validate1 x1 = Yield (unsafeChr8 x1) (i+1)
+ | i+1 < l && U8.validate2 x1 x2 = Yield (U8.chr2 x1 x2) (i+2)
+ | i+2 < l && U8.validate3 x1 x2 x3 = Yield (U8.chr3 x1 x2 x3) (i+3)
+ | i+3 < l && U8.validate4 x1 x2 x3 x4 = Yield (U8.chr4 x1 x2 x3 x4) (i+4)
+ | otherwise = decodeError "streamUtf8" "UTF-8" onErr (Just x1) (i+1)
+ where
+ x1 = idx i
+ x2 = idx (i + 1)
+ x3 = idx (i + 2)
+ x4 = idx (i + 3)
+ idx = B.unsafeIndex bs
+{-# INLINE [0] streamUtf8 #-}
+
+-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
+-- endian UTF-16 encoding.
+streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf16LE onErr bs = Stream next 0 (maxSize (l `shiftR` 1))
+ where
+ l = B.length bs
+ {-# INLINE next #-}
+ next i
+ | i >= l = Done
+ | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2)
+ | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4)
+ | otherwise = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing (i+1)
+ where
+ x1 = idx i + (idx (i + 1) `shiftL` 8)
+ x2 = idx (i + 2) + (idx (i + 3) `shiftL` 8)
+ idx = fromIntegral . B.unsafeIndex bs :: Int -> Word16
+{-# INLINE [0] streamUtf16LE #-}
+
+-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
+-- endian UTF-16 encoding.
+streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf16BE onErr bs = Stream next 0 (maxSize (l `shiftR` 1))
+ where
+ l = B.length bs
+ {-# INLINE next #-}
+ next i
+ | i >= l = Done
+ | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2)
+ | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4)
+ | otherwise = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing (i+1)
+ where
+ x1 = (idx i `shiftL` 8) + idx (i + 1)
+ x2 = (idx (i + 2) `shiftL` 8) + idx (i + 3)
+ idx = fromIntegral . B.unsafeIndex bs :: Int -> Word16
+{-# INLINE [0] streamUtf16BE #-}
+
+-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
+-- endian UTF-32 encoding.
+streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf32BE onErr bs = Stream next 0 (maxSize (l `shiftR` 2))
+ where
+ l = B.length bs
+ {-# INLINE next #-}
+ next i
+ | i >= l = Done
+ | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4)
+ | otherwise = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing (i+1)
+ where
+ x = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
+ x1 = idx i
+ x2 = idx (i+1)
+ x3 = idx (i+2)
+ x4 = idx (i+3)
+ idx = fromIntegral . B.unsafeIndex bs :: Int -> Word32
+{-# INLINE [0] streamUtf32BE #-}
+
+-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
+-- endian UTF-32 encoding.
+streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf32LE onErr bs = Stream next 0 (maxSize (l `shiftR` 2))
+ where
+ l = B.length bs
+ {-# INLINE next #-}
+ next i
+ | i >= l = Done
+ | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4)
+ | otherwise = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing (i+1)
+ where
+ x = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
+ x1 = idx i
+ x2 = idx $ i+1
+ x3 = idx $ i+2
+ x4 = idx $ i+3
+ idx = fromIntegral . B.unsafeIndex bs :: Int -> Word32
+{-# INLINE [0] streamUtf32LE #-}
+
+-- | /O(n)/ Convert a 'Stream' 'Word8' to a 'ByteString'.
+unstream :: Stream Word8 -> ByteString
+unstream (Stream next s0 len) = unsafeDupablePerformIO $ do
+ let mlen = upperBound 4 len
+ mallocByteString mlen >>= loop mlen 0 s0
+ where
+ loop !n !off !s fp = case next s of
+ Done -> trimUp fp n off
+ Skip s' -> loop n off s' fp
+ Yield x s'
+ | off == n -> realloc fp n off s' x
+ | otherwise -> do
+ withForeignPtr fp $ \p -> pokeByteOff p off x
+ loop n (off+1) s' fp
+ {-# NOINLINE realloc #-}
+ realloc fp n off s x = do
+ let n' = n+n
+ fp' <- copy0 fp n n'
+ withForeignPtr fp' $ \p -> pokeByteOff p off x
+ loop n' (off+1) s fp'
+ {-# NOINLINE trimUp #-}
+ trimUp fp _ off = return $! PS fp 0 off
+ copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
+ copy0 !src !srcLen !destLen =
+#if defined(ASSERTS)
+ assert (srcLen <= destLen) $
+#endif
+ do
+ dest <- mallocByteString destLen
+ withForeignPtr src $ \src' ->
+ withForeignPtr dest $ \dest' ->
+ memcpy dest' src' (fromIntegral srcLen)
+ return dest
+
+decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
+ -> s -> Step s Char
+decodeError func kind onErr mb i =
+ case onErr desc mb of
+ Nothing -> Skip i
+ Just c -> Yield (safe c) i
+ where desc = "Data.Text.Internal.Encoding.Fusion." ++ func ++ ": Invalid " ++
+ kind ++ " stream"
diff --git a/Data/Text/Internal/Encoding/Fusion/Common.hs b/Data/Text/Internal/Encoding/Fusion/Common.hs
new file mode 100644
index 0000000..06bd657
--- /dev/null
+++ b/Data/Text/Internal/Encoding/Fusion/Common.hs
@@ -0,0 +1,123 @@
+{-# LANGUAGE BangPatterns #-}
+
+-- |
+-- Module : Data.Text.Internal.Encoding.Fusion.Common
+-- Copyright : (c) Tom Harper 2008-2009,
+-- (c) Bryan O'Sullivan 2009,
+-- (c) Duncan Coutts 2009,
+-- (c) Jasper Van der Jeugt 2011
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Use at your own risk!
+--
+-- Fusible 'Stream'-oriented functions for converting between 'Text'
+-- and several common encodings.
+
+module Data.Text.Internal.Encoding.Fusion.Common
+ (
+ -- * Restreaming
+ -- Restreaming is the act of converting from one 'Stream'
+ -- representation to another.
+ restreamUtf16LE
+ , restreamUtf16BE
+ , restreamUtf32LE
+ , restreamUtf32BE
+ ) where
+
+import Data.Bits ((.&.))
+import Data.Text.Internal.Fusion (Step(..), Stream(..))
+import Data.Text.Internal.Fusion.Types (RS(..))
+import Data.Text.Internal.Unsafe.Char (ord)
+import Data.Text.Internal.Unsafe.Shift (shiftR)
+import Data.Word (Word8)
+
+restreamUtf16BE :: Stream Char -> Stream Word8
+restreamUtf16BE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2)
+ where
+ next (RS0 s) = case next0 s of
+ Done -> Done
+ Skip s' -> Skip (RS0 s')
+ Yield x s'
+ | n < 0x10000 -> Yield (fromIntegral $ n `shiftR` 8) $
+ RS1 s' (fromIntegral n)
+ | otherwise -> Yield c1 $ RS3 s' c2 c3 c4
+ where
+ n = ord x
+ n1 = n - 0x10000
+ c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
+ c2 = fromIntegral (n1 `shiftR` 10)
+ n2 = n1 .&. 0x3FF
+ c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
+ c4 = fromIntegral n2
+ next (RS1 s x2) = Yield x2 (RS0 s)
+ next (RS2 s x2 x3) = Yield x2 (RS1 s x3)
+ next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4)
+ {-# INLINE next #-}
+{-# INLINE restreamUtf16BE #-}
+
+restreamUtf16LE :: Stream Char -> Stream Word8
+restreamUtf16LE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2)
+ where
+ next (RS0 s) = case next0 s of
+ Done -> Done
+ Skip s' -> Skip (RS0 s')
+ Yield x s'
+ | n < 0x10000 -> Yield (fromIntegral n) $
+ RS1 s' (fromIntegral $ shiftR n 8)
+ | otherwise -> Yield c1 $ RS3 s' c2 c3 c4
+ where
+ n = ord x
+ n1 = n - 0x10000
+ c2 = fromIntegral (shiftR n1 18 + 0xD8)
+ c1 = fromIntegral (shiftR n1 10)
+ n2 = n1 .&. 0x3FF
+ c4 = fromIntegral (shiftR n2 8 + 0xDC)
+ c3 = fromIntegral n2
+ next (RS1 s x2) = Yield x2 (RS0 s)
+ next (RS2 s x2 x3) = Yield x2 (RS1 s x3)
+ next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4)
+ {-# INLINE next #-}
+{-# INLINE restreamUtf16LE #-}
+
+restreamUtf32BE :: Stream Char -> Stream Word8
+restreamUtf32BE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2)
+ where
+ next (RS0 s) = case next0 s of
+ Done -> Done
+ Skip s' -> Skip (RS0 s')
+ Yield x s' -> Yield c1 (RS3 s' c2 c3 c4)
+ where
+ n = ord x
+ c1 = fromIntegral $ shiftR n 24
+ c2 = fromIntegral $ shiftR n 16
+ c3 = fromIntegral $ shiftR n 8
+ c4 = fromIntegral n
+ next (RS1 s x2) = Yield x2 (RS0 s)
+ next (RS2 s x2 x3) = Yield x2 (RS1 s x3)
+ next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4)
+ {-# INLINE next #-}
+{-# INLINE restreamUtf32BE #-}
+
+restreamUtf32LE :: Stream Char -> Stream Word8
+restreamUtf32LE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2)
+ where
+ next (RS0 s) = case next0 s of
+ Done -> Done
+ Skip s' -> Skip (RS0 s')
+ Yield x s' -> Yield c1 (RS3 s' c2 c3 c4)
+ where
+ n = ord x
+ c4 = fromIntegral $ shiftR n 24
+ c3 = fromIntegral $ shiftR n 16
+ c2 = fromIntegral $ shiftR n 8
+ c1 = fromIntegral n
+ next (RS1 s x2) = Yield x2 (RS0 s)
+ next (RS2 s x2 x3) = Yield x2 (RS1 s x3)
+ next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4)
+ {-# INLINE next #-}
+{-# INLINE restreamUtf32LE #-}
diff --git a/Data/Text/Internal/Encoding/Utf16.hs b/Data/Text/Internal/Encoding/Utf16.hs
new file mode 100644
index 0000000..e5e3c49
--- /dev/null
+++ b/Data/Text/Internal/Encoding/Utf16.hs
@@ -0,0 +1,45 @@
+{-# LANGUAGE MagicHash, BangPatterns #-}
+
+-- |
+-- Module : Data.Text.Internal.Encoding.Utf16
+-- Copyright : (c) 2008, 2009 Tom Harper,
+-- (c) 2009 Bryan O'Sullivan,
+-- (c) 2009 Duncan Coutts
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Basic UTF-16 validation and character manipulation.
+module Data.Text.Internal.Encoding.Utf16
+ (
+ chr2
+ , validate1
+ , validate2
+ ) where
+
+import GHC.Exts
+import GHC.Word (Word16(..))
+
+chr2 :: Word16 -> Word16 -> Char
+chr2 (W16# a#) (W16# b#) = C# (chr# (upper# +# lower# +# 0x10000#))
+ where
+ !x# = word2Int# a#
+ !y# = word2Int# b#
+ !upper# = uncheckedIShiftL# (x# -# 0xD800#) 10#
+ !lower# = y# -# 0xDC00#
+{-# INLINE chr2 #-}
+
+validate1 :: Word16 -> Bool
+validate1 x1 = x1 < 0xD800 || x1 > 0xDFFF
+{-# INLINE validate1 #-}
+
+validate2 :: Word16 -> Word16 -> Bool
+validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF &&
+ x2 >= 0xDC00 && x2 <= 0xDFFF
+{-# INLINE validate2 #-}
diff --git a/Data/Text/Internal/Encoding/Utf32.hs b/Data/Text/Internal/Encoding/Utf32.hs
new file mode 100644
index 0000000..4e8e9b4
--- /dev/null
+++ b/Data/Text/Internal/Encoding/Utf32.hs
@@ -0,0 +1,26 @@
+-- |
+-- Module : Data.Text.Internal.Encoding.Utf32
+-- Copyright : (c) 2008, 2009 Tom Harper,
+-- (c) 2009, 2010 Bryan O'Sullivan,
+-- (c) 2009 Duncan Coutts
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Basic UTF-32 validation.
+module Data.Text.Internal.Encoding.Utf32
+ (
+ validate
+ ) where
+
+import Data.Word (Word32)
+
+validate :: Word32 -> Bool
+validate x1 = x1 < 0xD800 || (x1 > 0xDFFF && x1 <= 0x10FFFF)
+{-# INLINE validate #-}
diff --git a/Data/Text/Internal/Encoding/Utf8.hs b/Data/Text/Internal/Encoding/Utf8.hs
new file mode 100644
index 0000000..ad19797
--- /dev/null
+++ b/Data/Text/Internal/Encoding/Utf8.hs
@@ -0,0 +1,255 @@
+{-# LANGUAGE CPP, MagicHash, BangPatterns #-}
+
+-- |
+-- Module : Data.Text.Internal.Encoding.Utf8
+-- Copyright : (c) 2008, 2009 Tom Harper,
+-- (c) 2009, 2010 Bryan O'Sullivan,
+-- (c) 2009 Duncan Coutts
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Basic UTF-8 validation and character manipulation.
+module Data.Text.Internal.Encoding.Utf8
+ (
+ -- Decomposition
+ ord2
+ , ord3
+ , ord4
+ -- Construction
+ , chr2
+ , chr3
+ , chr4
+ -- * Validation
+ , continuationByte
+ , validate1
+ , validate2
+ , validate3
+ , validate4
+
+ , decodeChar
+ , decodeCharIndex
+ , reverseDecodeCharIndex
+ , encodeChar
+ , charTailBytes
+ ) where
+
+#if defined(TEST_SUITE)
+# undef ASSERTS
+#endif
+
+#if defined(ASSERTS)
+import Control.Exception (assert)
+#endif
+import Data.Bits ((.&.))
+import Data.Text.Internal.Unsafe.Char (ord, unsafeChr8)
+import Data.Text.Internal.Unsafe.Shift (shiftR)
+import GHC.Exts
+import GHC.Word (Word8(..))
+
+default(Int)
+
+between :: Word8 -- ^ byte to check
+ -> Word8 -- ^ lower bound
+ -> Word8 -- ^ upper bound
+ -> Bool
+between x y z = x >= y && x <= z
+{-# INLINE between #-}
+
+ord2 :: Char -> (Word8,Word8)
+ord2 c =
+#if defined(ASSERTS)
+ assert (n >= 0x80 && n <= 0x07ff)
+#endif
+ (x1,x2)
+ where
+ n = ord c
+ x1 = fromIntegral $ (n `shiftR` 6) + 0xC0
+ x2 = fromIntegral $ (n .&. 0x3F) + 0x80
+
+ord3 :: Char -> (Word8,Word8,Word8)
+ord3 c =
+#if defined(ASSERTS)
+ assert (n >= 0x0800 && n <= 0xffff)
+#endif
+ (x1,x2,x3)
+ where
+ n = ord c
+ x1 = fromIntegral $ (n `shiftR` 12) + 0xE0
+ x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
+ x3 = fromIntegral $ (n .&. 0x3F) + 0x80
+
+ord4 :: Char -> (Word8,Word8,Word8,Word8)
+ord4 c =
+#if defined(ASSERTS)
+ assert (n >= 0x10000)
+#endif
+ (x1,x2,x3,x4)
+ where
+ n = ord c
+ x1 = fromIntegral $ (n `shiftR` 18) + 0xF0
+ x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80
+ x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
+ x4 = fromIntegral $ (n .&. 0x3F) + 0x80
+
+chr2 :: Word8 -> Word8 -> Char
+chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#))
+ where
+ !y1# = word2Int# x1#
+ !y2# = word2Int# x2#
+ !z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6#
+ !z2# = y2# -# 0x80#
+{-# INLINE chr2 #-}
+
+chr3 :: Word8 -> Word8 -> Word8 -> Char
+chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#))
+ where
+ !y1# = word2Int# x1#
+ !y2# = word2Int# x2#
+ !y3# = word2Int# x3#
+ !z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12#
+ !z2# = uncheckedIShiftL# (y2# -# 0x80#) 6#
+ !z3# = y3# -# 0x80#
+{-# INLINE chr3 #-}
+
+chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
+chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
+ C# (chr# (z1# +# z2# +# z3# +# z4#))
+ where
+ !y1# = word2Int# x1#
+ !y2# = word2Int# x2#
+ !y3# = word2Int# x3#
+ !y4# = word2Int# x4#
+ !z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18#
+ !z2# = uncheckedIShiftL# (y2# -# 0x80#) 12#
+ !z3# = uncheckedIShiftL# (y3# -# 0x80#) 6#
+ !z4# = y4# -# 0x80#
+{-# INLINE chr4 #-}
+
+validate1 :: Word8 -> Bool
+validate1 x1 = x1 <= 0x7F
+{-# INLINE validate1 #-}
+
+validate2 :: Word8 -> Word8 -> Bool
+validate2 x1 x2 = between x1 0xC2 0xDF && between x2 0x80 0xBF
+{-# INLINE validate2 #-}
+
+validate3 :: Word8 -> Word8 -> Word8 -> Bool
+{-# INLINE validate3 #-}
+validate3 x1 x2 x3 = validate3_1 || validate3_2 || validate3_3 || validate3_4
+ where
+ validate3_1 = (x1 == 0xE0) &&
+ between x2 0xA0 0xBF &&
+ between x3 0x80 0xBF
+ validate3_2 = between x1 0xE1 0xEC &&
+ between x2 0x80 0xBF &&
+ between x3 0x80 0xBF
+ validate3_3 = x1 == 0xED &&
+ between x2 0x80 0x9F &&
+ between x3 0x80 0xBF
+ validate3_4 = between x1 0xEE 0xEF &&
+ between x2 0x80 0xBF &&
+ between x3 0x80 0xBF
+
+validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
+{-# INLINE validate4 #-}
+validate4 x1 x2 x3 x4 = validate4_1 || validate4_2 || validate4_3
+ where
+ validate4_1 = x1 == 0xF0 &&
+ between x2 0x90 0xBF &&
+ between x3 0x80 0xBF &&
+ between x4 0x80 0xBF
+ validate4_2 = between x1 0xF1 0xF3 &&
+ between x2 0x80 0xBF &&
+ between x3 0x80 0xBF &&
+ between x4 0x80 0xBF
+ validate4_3 = x1 == 0xF4 &&
+ between x2 0x80 0x8F &&
+ between x3 0x80 0xBF &&
+ between x4 0x80 0xBF
+
+-- | Utility function: check if a word is an UTF-8 continuation byte
+continuationByte :: Word8 -> Bool
+continuationByte x = x .&. 0xC0 == 0x80
+{-# INLINE [0] continuationByte #-}
+
+-- | Inverse of 'continuationByte'
+notContinuationByte :: Word8 -> Bool
+notContinuationByte x = x .&. 0xC0 /= 0x80
+{-# INLINE [0] notContinuationByte #-}
+
+-- | Hybrid combination of 'unsafeChr8', 'chr2', 'chr3' and 'chr4'. This
+-- function will not touch the bytes it doesn't need.
+decodeChar :: (Char -> Int -> a) -> Word8 -> Word8 -> Word8 -> Word8 -> a
+decodeChar f !n1 n2 n3 n4
+ | n1 < 0xC0 = f (unsafeChr8 n1) 1
+ | n1 < 0xE0 = f (chr2 n1 n2) 2
+ | n1 < 0xF0 = f (chr3 n1 n2 n3) 3
+ | otherwise = f (chr4 n1 n2 n3 n4) 4
+{-# INLINE [0] decodeChar #-}
+
+-- | Version of 'decodeChar' which works with an indexing function.
+decodeCharIndex :: (Char -> Int -> a) -> (Int -> Word8) -> Int -> a
+decodeCharIndex f idx n =
+ decodeChar f (idx n) (idx (n + 1)) (idx (n + 2)) (idx (n + 3))
+{-# INLINE [0] decodeCharIndex #-}
+
+-- | Version of 'decodeCharIndex' that takes the rightmost index and tracks
+-- back to the left. Note that this function requires that the input is
+-- valid Unicode.
+reverseDecodeCharIndex :: (Char -> Int -> a) -> (Int -> Word8) -> Int -> a
+reverseDecodeCharIndex f idx !r =
+ let !x1 = idx r in
+ if notContinuationByte x1 then f (unsafeChr8 x1) 1
+ else let !x2 = idx (r - 1) in
+ if notContinuationByte x2 then f (chr2 x2 x1) 2
+ else let !x3 = idx (r - 2) in
+ if notContinuationByte x3 then f (chr3 x3 x2 x1) 3
+ else let !x4 = idx (r - 3) in
+ f (chr4 x4 x3 x2 x1) 4
+{-# INLINE [0] reverseDecodeCharIndex #-}
+
+-- | This function provides fast UTF-8 encoding of characters because the user
+-- can supply custom functions for the different code paths, which should be
+-- inlined properly.
+encodeChar :: (Word8 -> a)
+ -> (Word8 -> Word8 -> a)
+ -> (Word8 -> Word8 -> Word8 -> a)
+ -> (Word8 -> Word8 -> Word8 -> Word8 -> a)
+ -> Char
+ -> a
+encodeChar f1 f2 f3 f4 c
+ -- One-byte character
+ | n < 0x80 = f1 (fromIntegral n)
+ -- Two-byte character
+ | n < 0x0800 = f2 (fromIntegral $ (n `shiftR` 6) + 0xC0)
+ (fromIntegral $ (n .&. 0x3F) + 0x80)
+ -- Three-byte character
+ | n < 0x10000 = f3 (fromIntegral $ (n `shiftR` 12) + 0xE0)
+ (fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80)
+ (fromIntegral $ (n .&. 0x3F) + 0x80)
+ -- Four-byte character
+ | otherwise = f4 (fromIntegral $ (n `shiftR` 18) + 0xF0)
+ (fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80)
+ (fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80)
+ (fromIntegral $ (n .&. 0x3F) + 0x80)
+ where
+ n = ord c
+{-# INLINE [0] encodeChar #-}
+
+-- | Count the number of UTF-8 tail bytes needed to encode a character
+charTailBytes :: Char -> Int
+charTailBytes x
+ | n < 0x00080 = 0
+ | n < 0x00800 = 1
+ | n < 0x10000 = 2
+ | otherwise = 3
+ where
+ n = ord x
+{-# INLINE [0] charTailBytes #-}
diff --git a/Data/Text/Internal/Functions.hs b/Data/Text/Internal/Functions.hs
new file mode 100644
index 0000000..f002ccc
--- /dev/null
+++ b/Data/Text/Internal/Functions.hs
@@ -0,0 +1,31 @@
+{-# LANGUAGE CPP, DeriveDataTypeable #-}
+
+-- |
+-- Module : Data.Text.Internal.Functions
+-- Copyright : 2010 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Useful functions.
+
+module Data.Text.Internal.Functions
+ (
+ intersperse
+ ) where
+
+-- | A lazier version of Data.List.intersperse. The other version
+-- causes space leaks!
+intersperse :: a -> [a] -> [a]
+intersperse _ [] = []
+intersperse sep (x:xs) = x : go xs
+ where
+ go [] = []
+ go (y:ys) = sep : y: go ys
+{-# INLINE intersperse #-}
diff --git a/Data/Text/Internal/Fusion.hs b/Data/Text/Internal/Fusion.hs
new file mode 100644
index 0000000..380964e
--- /dev/null
+++ b/Data/Text/Internal/Fusion.hs
@@ -0,0 +1,231 @@
+{-# LANGUAGE BangPatterns, MagicHash #-}
+
+-- |
+-- Module : Data.Text.Internal.Fusion
+-- Copyright : (c) Tom Harper 2008-2009,
+-- (c) Bryan O'Sullivan 2009-2010,
+-- (c) Duncan Coutts 2009
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Text manipulation functions represented as fusible operations over
+-- streams.
+module Data.Text.Internal.Fusion
+ (
+ -- * Types
+ Stream(..)
+ , Step(..)
+
+ -- * Creation and elimination
+ , stream
+ , unstream
+ , reverseStream
+
+ , length
+
+ -- * Transformations
+ , reverse
+
+ -- * Construction
+ -- ** Scans
+ , reverseScanr
+
+ -- ** Accumulating maps
+ , mapAccumL
+
+ -- ** Generation and unfolding
+ , unfoldrN
+
+ -- * Indexing
+ , index
+ , findIndex
+ , countChar
+ ) where
+
+import Prelude (Bool(..), Char, Maybe(..), Monad(..), Int,
+ Num(..), Ord(..), ($),
+ otherwise)
+import Data.Text.Internal (Text(..))
+import Data.Text.Internal.Private (runText)
+import Data.Text.Internal.Unsafe.Char (unsafeWrite)
+import Data.Text.Internal.Unsafe.Shift (shiftL, shiftR)
+import qualified Data.Text.Array as A
+import qualified Data.Text.Internal.Fusion.Common as S
+import Data.Text.Internal.Fusion.Types
+import Data.Text.Internal.Fusion.Size
+import qualified Data.Text.Internal as I
+import qualified Data.Text.Internal.Encoding.Utf8 as U8
+
+
+default(Int)
+
+-- | /O(n)/ Convert a 'Text' into a 'Stream Char'.
+stream :: Text -> Stream Char
+stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 2) len)
+ where
+ !end = off+len
+ next !i
+ | i >= end = Done
+ | otherwise = U8.decodeCharIndex (\c s -> Yield c (i + s)) idx i
+ where
+ idx = A.unsafeIndex arr
+{-# INLINE [0] stream #-}
+
+-- | /O(n)/ Convert a 'Text' into a 'Stream Char', but iterate
+-- backwards.
+reverseStream :: Text -> Stream Char
+reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `shiftR` 2) len)
+ where
+ {-# INLINE next #-}
+ next !i
+ | i < off = Done
+ | otherwise = U8.reverseDecodeCharIndex (\c w -> Yield c (i - w)) idx i
+ where
+ idx = A.unsafeIndex arr
+{-# INLINE [0] reverseStream #-}
+
+-- | /O(n)/ Convert a 'Stream Char' into a 'Text'.
+unstream :: Stream Char -> Text
+unstream (Stream next0 s0 len) = runText $ \done -> do
+  -- Before encoding each char we perform a buffer realloc check using the
+  -- char's worst-case encoded size (via 'U8.charTailBytes'). Just add an
+  -- extra space to the buffer so that we do not end up reallocating even when
+  -- all the chars are encoded as a single unit.
+ let mlen = upperBound 4 len + 1
+ arr0 <- A.new mlen
+ let outer !arr !maxi = encode
+ where
+ -- keep the common case loop as small as possible
+ encode !si !di =
+ case next0 si of
+ Done -> done arr di
+ Skip si' -> encode si' di
+ Yield c si'
+ -- simply check for the worst case
+ | maxi < di + U8.charTailBytes c -> realloc si di
+ | otherwise -> do
+ n <- unsafeWrite arr di c
+ encode si' (di + n)
+
+ -- keep uncommon case separate from the common case code
+ {-# NOINLINE realloc #-}
+ realloc !si !di = do
+ let newlen = (maxi + 1) * 2
+ arr' <- A.new newlen
+ A.copyM arr' 0 arr 0 di
+ outer arr' (newlen - 1) si di
+
+ outer arr0 (mlen - 1) s0 0
+{-# INLINE [0] unstream #-}
+{-# RULES "STREAM stream/unstream fusion" forall s. stream (unstream s) = s #-}
+
+
+-- ----------------------------------------------------------------------------
+-- * Basic stream functions
+
+length :: Stream Char -> Int
+length = S.lengthI
+{-# INLINE[0] length #-}
+
+-- | /O(n)/ Reverse the characters of a string.
+reverse :: Stream Char -> Text
+reverse (Stream next s len0)
+ | isEmpty len0 = I.empty
+ | otherwise = I.text arr off' len'
+ where
+ len0' = upperBound 4 (larger len0 4)
+ (arr, (off', len')) = A.run2 (A.new len0' >>= loop s (len0'-1) len0')
+ loop !s0 !i !len marr =
+ case next s0 of
+ Done -> return (marr, (j, len-j))
+ where j = i + 1
+ Skip s1 -> loop s1 i len marr
+ Yield x s1 | i < least -> {-# SCC "reverse/resize" #-} do
+ let newLen = len `shiftL` 1
+ marr' <- A.new newLen
+ A.copyM marr' (newLen-len) marr 0 len
+ write s1 (len+i) newLen marr'
+ | otherwise -> write s1 i len marr
+ where
+ least = U8.charTailBytes x
+ write t j l mar = do
+ _ <- unsafeWrite mar (j-least) x
+ loop t (j-least-1) l mar
+{-# INLINE [0] reverse #-}
+
+-- | /O(n)/ Perform the equivalent of 'scanr' over a list, only with
+-- the input and result reversed.
+reverseScanr :: (Char -> Char -> Char) -> Char -> Stream Char -> Stream Char
+reverseScanr f z0 (Stream next0 s0 len) = Stream next (Scan1 z0 s0) (len+1) -- HINT maybe too low
+ where
+ {-# INLINE next #-}
+ next (Scan1 z s) = Yield z (Scan2 z s)
+ next (Scan2 z s) = case next0 s of
+ Yield x s' -> let !x' = f x z
+ in Yield x' (Scan2 x' s')
+ Skip s' -> Skip (Scan2 z s')
+ Done -> Done
+{-# INLINE reverseScanr #-}
+
+-- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a stream from a seed
+-- value. However, the length of the result is limited by the
+-- first argument to 'unfoldrN'. This function is more efficient than
+-- 'unfoldr' when the length of the result is known.
+unfoldrN :: Int -> (a -> Maybe (Char,a)) -> a -> Stream Char
+unfoldrN n = S.unfoldrNI n
+{-# INLINE [0] unfoldrN #-}
+
+-------------------------------------------------------------------------------
+-- ** Indexing streams
+
+-- | /O(n)/ stream index (subscript) operator, starting from 0.
+index :: Stream Char -> Int -> Char
+index = S.indexI
+{-# INLINE [0] index #-}
+
+-- | The 'findIndex' function takes a predicate and a stream and
+-- returns the index of the first element in the stream
+-- satisfying the predicate.
+findIndex :: (Char -> Bool) -> Stream Char -> Maybe Int
+findIndex = S.findIndexI
+{-# INLINE [0] findIndex #-}
+
+-- | /O(n)/ The 'countChar' function returns the number of times the query
+-- element appears in the given stream.
+countChar :: Char -> Stream Char -> Int
+countChar = S.countCharI
+{-# INLINE [0] countChar #-}
+
+-- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a
+-- function to each element of a 'Text', passing an accumulating
+-- parameter from left to right, and returns a final 'Text'.
+mapAccumL :: (a -> Char -> (a,Char)) -> a -> Stream Char -> (a, Text)
+mapAccumL f z0 (Stream next0 s0 len) = (nz, I.text na 0 nl)
+ where
+ (na,(nz,nl)) = A.run2 (A.new mlen >>= \arr -> outer arr mlen z0 s0 0)
+ where mlen = upperBound 4 len
+ outer arr top = loop
+ where
+ loop !z !s !i =
+ case next0 s of
+ Done -> return (arr, (z,i))
+ Skip s' -> loop z s' i
+ Yield x s'
+ | j >= top -> {-# SCC "mapAccumL/resize" #-} do
+ let top' = (top + 1) `shiftL` 1
+ arr' <- A.new top'
+ A.copyM arr' 0 arr 0 top
+ outer arr' top' z s i
+ | otherwise -> do d <- unsafeWrite arr i c
+ loop z' s' (i+d)
+ where
+ (z',c) = f z x
+ j = i + U8.charTailBytes c
+{-# INLINE [0] mapAccumL #-}
diff --git a/Data/Text/Internal/Fusion/CaseMapping.hs b/Data/Text/Internal/Fusion/CaseMapping.hs
new file mode 100644
index 0000000..0ea4f1a
--- /dev/null
+++ b/Data/Text/Internal/Fusion/CaseMapping.hs
@@ -0,0 +1,1002 @@
+{-# LANGUAGE Rank2Types #-}
+-- AUTOMATICALLY GENERATED - DO NOT EDIT
+-- Generated by scripts/CaseMapping.hs
+-- CaseFolding-9.0.0.txt
+-- Date: 2016-03-02, 18:54:54 GMT
+-- SpecialCasing-9.0.0.txt
+-- Date: 2016-03-02, 18:55:13 GMT
+
+module Data.Text.Internal.Fusion.CaseMapping where
+import Data.Char
+import Data.Text.Internal.Fusion.Types
+
+-- | Full (one-to-many) uppercase mapping for the special cases drawn
+-- from SpecialCasing-9.0.0.txt.  Each special clause yields the first
+-- character of the expansion immediately and carries up to two
+-- further characters in the 'CC' state ('\0' marks an unused slot;
+-- presumably the consuming stream transformer emits them next —
+-- confirm in "Data.Text.Internal.Fusion").  Characters with no
+-- special mapping fall through to 'Data.Char.toUpper'.  NOINLINE
+-- keeps this large dispatch from being duplicated at call sites.
+upperMapping :: forall s. Char -> s -> Step (CC s) Char
+{-# NOINLINE upperMapping #-}
+-- LATIN SMALL LETTER SHARP S
+upperMapping '\x00df' s = Yield '\x0053' (CC s '\x0053' '\x0000')
+-- LATIN SMALL LIGATURE FF
+upperMapping '\xfb00' s = Yield '\x0046' (CC s '\x0046' '\x0000')
+-- LATIN SMALL LIGATURE FI
+upperMapping '\xfb01' s = Yield '\x0046' (CC s '\x0049' '\x0000')
+-- LATIN SMALL LIGATURE FL
+upperMapping '\xfb02' s = Yield '\x0046' (CC s '\x004c' '\x0000')
+-- LATIN SMALL LIGATURE FFI
+upperMapping '\xfb03' s = Yield '\x0046' (CC s '\x0046' '\x0049')
+-- LATIN SMALL LIGATURE FFL
+upperMapping '\xfb04' s = Yield '\x0046' (CC s '\x0046' '\x004c')
+-- LATIN SMALL LIGATURE LONG S T
+upperMapping '\xfb05' s = Yield '\x0053' (CC s '\x0054' '\x0000')
+-- LATIN SMALL LIGATURE ST
+upperMapping '\xfb06' s = Yield '\x0053' (CC s '\x0054' '\x0000')
+-- ARMENIAN SMALL LIGATURE ECH YIWN
+upperMapping '\x0587' s = Yield '\x0535' (CC s '\x0552' '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN NOW
+upperMapping '\xfb13' s = Yield '\x0544' (CC s '\x0546' '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN ECH
+upperMapping '\xfb14' s = Yield '\x0544' (CC s '\x0535' '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN INI
+upperMapping '\xfb15' s = Yield '\x0544' (CC s '\x053b' '\x0000')
+-- ARMENIAN SMALL LIGATURE VEW NOW
+upperMapping '\xfb16' s = Yield '\x054e' (CC s '\x0546' '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN XEH
+upperMapping '\xfb17' s = Yield '\x0544' (CC s '\x053d' '\x0000')
+-- LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+upperMapping '\x0149' s = Yield '\x02bc' (CC s '\x004e' '\x0000')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+upperMapping '\x0390' s = Yield '\x0399' (CC s '\x0308' '\x0301')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+upperMapping '\x03b0' s = Yield '\x03a5' (CC s '\x0308' '\x0301')
+-- LATIN SMALL LETTER J WITH CARON
+upperMapping '\x01f0' s = Yield '\x004a' (CC s '\x030c' '\x0000')
+-- LATIN SMALL LETTER H WITH LINE BELOW
+upperMapping '\x1e96' s = Yield '\x0048' (CC s '\x0331' '\x0000')
+-- LATIN SMALL LETTER T WITH DIAERESIS
+upperMapping '\x1e97' s = Yield '\x0054' (CC s '\x0308' '\x0000')
+-- LATIN SMALL LETTER W WITH RING ABOVE
+upperMapping '\x1e98' s = Yield '\x0057' (CC s '\x030a' '\x0000')
+-- LATIN SMALL LETTER Y WITH RING ABOVE
+upperMapping '\x1e99' s = Yield '\x0059' (CC s '\x030a' '\x0000')
+-- LATIN SMALL LETTER A WITH RIGHT HALF RING
+upperMapping '\x1e9a' s = Yield '\x0041' (CC s '\x02be' '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH PSILI
+upperMapping '\x1f50' s = Yield '\x03a5' (CC s '\x0313' '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
+upperMapping '\x1f52' s = Yield '\x03a5' (CC s '\x0313' '\x0300')
+-- GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
+upperMapping '\x1f54' s = Yield '\x03a5' (CC s '\x0313' '\x0301')
+-- GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
+upperMapping '\x1f56' s = Yield '\x03a5' (CC s '\x0313' '\x0342')
+-- GREEK SMALL LETTER ALPHA WITH PERISPOMENI
+upperMapping '\x1fb6' s = Yield '\x0391' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER ETA WITH PERISPOMENI
+upperMapping '\x1fc6' s = Yield '\x0397' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
+upperMapping '\x1fd2' s = Yield '\x0399' (CC s '\x0308' '\x0300')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+upperMapping '\x1fd3' s = Yield '\x0399' (CC s '\x0308' '\x0301')
+-- GREEK SMALL LETTER IOTA WITH PERISPOMENI
+upperMapping '\x1fd6' s = Yield '\x0399' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
+upperMapping '\x1fd7' s = Yield '\x0399' (CC s '\x0308' '\x0342')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
+upperMapping '\x1fe2' s = Yield '\x03a5' (CC s '\x0308' '\x0300')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+upperMapping '\x1fe3' s = Yield '\x03a5' (CC s '\x0308' '\x0301')
+-- GREEK SMALL LETTER RHO WITH PSILI
+upperMapping '\x1fe4' s = Yield '\x03a1' (CC s '\x0313' '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH PERISPOMENI
+upperMapping '\x1fe6' s = Yield '\x03a5' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
+upperMapping '\x1fe7' s = Yield '\x03a5' (CC s '\x0308' '\x0342')
+-- GREEK SMALL LETTER OMEGA WITH PERISPOMENI
+upperMapping '\x1ff6' s = Yield '\x03a9' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
+upperMapping '\x1f80' s = Yield '\x1f08' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
+upperMapping '\x1f81' s = Yield '\x1f09' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+upperMapping '\x1f82' s = Yield '\x1f0a' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+upperMapping '\x1f83' s = Yield '\x1f0b' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+upperMapping '\x1f84' s = Yield '\x1f0c' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+upperMapping '\x1f85' s = Yield '\x1f0d' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+upperMapping '\x1f86' s = Yield '\x1f0e' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+upperMapping '\x1f87' s = Yield '\x1f0f' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
+upperMapping '\x1f88' s = Yield '\x1f08' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
+upperMapping '\x1f89' s = Yield '\x1f09' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+upperMapping '\x1f8a' s = Yield '\x1f0a' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+upperMapping '\x1f8b' s = Yield '\x1f0b' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+upperMapping '\x1f8c' s = Yield '\x1f0c' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+upperMapping '\x1f8d' s = Yield '\x1f0d' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+upperMapping '\x1f8e' s = Yield '\x1f0e' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+upperMapping '\x1f8f' s = Yield '\x1f0f' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
+upperMapping '\x1f90' s = Yield '\x1f28' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
+upperMapping '\x1f91' s = Yield '\x1f29' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+upperMapping '\x1f92' s = Yield '\x1f2a' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+upperMapping '\x1f93' s = Yield '\x1f2b' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+upperMapping '\x1f94' s = Yield '\x1f2c' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+upperMapping '\x1f95' s = Yield '\x1f2d' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+upperMapping '\x1f96' s = Yield '\x1f2e' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+upperMapping '\x1f97' s = Yield '\x1f2f' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
+upperMapping '\x1f98' s = Yield '\x1f28' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
+upperMapping '\x1f99' s = Yield '\x1f29' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+upperMapping '\x1f9a' s = Yield '\x1f2a' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+upperMapping '\x1f9b' s = Yield '\x1f2b' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+upperMapping '\x1f9c' s = Yield '\x1f2c' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+upperMapping '\x1f9d' s = Yield '\x1f2d' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+upperMapping '\x1f9e' s = Yield '\x1f2e' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+upperMapping '\x1f9f' s = Yield '\x1f2f' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
+upperMapping '\x1fa0' s = Yield '\x1f68' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
+upperMapping '\x1fa1' s = Yield '\x1f69' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+upperMapping '\x1fa2' s = Yield '\x1f6a' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+upperMapping '\x1fa3' s = Yield '\x1f6b' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+upperMapping '\x1fa4' s = Yield '\x1f6c' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+upperMapping '\x1fa5' s = Yield '\x1f6d' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+upperMapping '\x1fa6' s = Yield '\x1f6e' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+upperMapping '\x1fa7' s = Yield '\x1f6f' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
+upperMapping '\x1fa8' s = Yield '\x1f68' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
+upperMapping '\x1fa9' s = Yield '\x1f69' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+upperMapping '\x1faa' s = Yield '\x1f6a' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+upperMapping '\x1fab' s = Yield '\x1f6b' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+upperMapping '\x1fac' s = Yield '\x1f6c' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+upperMapping '\x1fad' s = Yield '\x1f6d' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+upperMapping '\x1fae' s = Yield '\x1f6e' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+upperMapping '\x1faf' s = Yield '\x1f6f' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
+upperMapping '\x1fb3' s = Yield '\x0391' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+upperMapping '\x1fbc' s = Yield '\x0391' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
+upperMapping '\x1fc3' s = Yield '\x0397' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+upperMapping '\x1fcc' s = Yield '\x0397' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
+upperMapping '\x1ff3' s = Yield '\x03a9' (CC s '\x0399' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+upperMapping '\x1ffc' s = Yield '\x03a9' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
+upperMapping '\x1fb2' s = Yield '\x1fba' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
+upperMapping '\x1fb4' s = Yield '\x0386' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
+upperMapping '\x1fc2' s = Yield '\x1fca' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
+upperMapping '\x1fc4' s = Yield '\x0389' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
+upperMapping '\x1ff2' s = Yield '\x1ffa' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
+upperMapping '\x1ff4' s = Yield '\x038f' (CC s '\x0399' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+upperMapping '\x1fb7' s = Yield '\x0391' (CC s '\x0342' '\x0399')
+-- GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+upperMapping '\x1fc7' s = Yield '\x0397' (CC s '\x0342' '\x0399')
+-- GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+upperMapping '\x1ff7' s = Yield '\x03a9' (CC s '\x0342' '\x0399')
+-- Default: simple one-to-one mapping via 'toUpper'.
+upperMapping c s = Yield (toUpper c) (CC s '\0' '\0')
+-- | Full lowercase mapping.  Only one special case exists
+-- (U+0130 expands to @i@ + COMBINING DOT ABOVE per SpecialCasing);
+-- everything else falls through to 'Data.Char.toLower'.  Extra
+-- expansion characters travel in the 'CC' state; '\0' marks an
+-- unused slot.
+lowerMapping :: forall s. Char -> s -> Step (CC s) Char
+{-# NOINLINE lowerMapping #-}
+-- LATIN CAPITAL LETTER I WITH DOT ABOVE
+lowerMapping '\x0130' s = Yield '\x0069' (CC s '\x0307' '\x0000')
+lowerMapping c s = Yield (toLower c) (CC s '\0' '\0')
+-- | Full titlecase mapping for the special cases from
+-- SpecialCasing-9.0.0.txt.  Unlike 'upperMapping', ligature tails are
+-- kept lowercase (e.g. U+FB00 -> @Ff@) and YPOGEGRAMMENI stays as
+-- combining U+0345 rather than capital iota.  Extra expansion
+-- characters travel in the 'CC' state ('\0' marks an unused slot);
+-- everything else falls through to 'Data.Char.toTitle'.
+titleMapping :: forall s. Char -> s -> Step (CC s) Char
+{-# NOINLINE titleMapping #-}
+-- LATIN SMALL LETTER SHARP S
+titleMapping '\x00df' s = Yield '\x0053' (CC s '\x0073' '\x0000')
+-- LATIN SMALL LIGATURE FF
+titleMapping '\xfb00' s = Yield '\x0046' (CC s '\x0066' '\x0000')
+-- LATIN SMALL LIGATURE FI
+titleMapping '\xfb01' s = Yield '\x0046' (CC s '\x0069' '\x0000')
+-- LATIN SMALL LIGATURE FL
+titleMapping '\xfb02' s = Yield '\x0046' (CC s '\x006c' '\x0000')
+-- LATIN SMALL LIGATURE FFI
+titleMapping '\xfb03' s = Yield '\x0046' (CC s '\x0066' '\x0069')
+-- LATIN SMALL LIGATURE FFL
+titleMapping '\xfb04' s = Yield '\x0046' (CC s '\x0066' '\x006c')
+-- LATIN SMALL LIGATURE LONG S T
+titleMapping '\xfb05' s = Yield '\x0053' (CC s '\x0074' '\x0000')
+-- LATIN SMALL LIGATURE ST
+titleMapping '\xfb06' s = Yield '\x0053' (CC s '\x0074' '\x0000')
+-- ARMENIAN SMALL LIGATURE ECH YIWN
+titleMapping '\x0587' s = Yield '\x0535' (CC s '\x0582' '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN NOW
+titleMapping '\xfb13' s = Yield '\x0544' (CC s '\x0576' '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN ECH
+titleMapping '\xfb14' s = Yield '\x0544' (CC s '\x0565' '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN INI
+titleMapping '\xfb15' s = Yield '\x0544' (CC s '\x056b' '\x0000')
+-- ARMENIAN SMALL LIGATURE VEW NOW
+titleMapping '\xfb16' s = Yield '\x054e' (CC s '\x0576' '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN XEH
+titleMapping '\xfb17' s = Yield '\x0544' (CC s '\x056d' '\x0000')
+-- LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+titleMapping '\x0149' s = Yield '\x02bc' (CC s '\x004e' '\x0000')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+titleMapping '\x0390' s = Yield '\x0399' (CC s '\x0308' '\x0301')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+titleMapping '\x03b0' s = Yield '\x03a5' (CC s '\x0308' '\x0301')
+-- LATIN SMALL LETTER J WITH CARON
+titleMapping '\x01f0' s = Yield '\x004a' (CC s '\x030c' '\x0000')
+-- LATIN SMALL LETTER H WITH LINE BELOW
+titleMapping '\x1e96' s = Yield '\x0048' (CC s '\x0331' '\x0000')
+-- LATIN SMALL LETTER T WITH DIAERESIS
+titleMapping '\x1e97' s = Yield '\x0054' (CC s '\x0308' '\x0000')
+-- LATIN SMALL LETTER W WITH RING ABOVE
+titleMapping '\x1e98' s = Yield '\x0057' (CC s '\x030a' '\x0000')
+-- LATIN SMALL LETTER Y WITH RING ABOVE
+titleMapping '\x1e99' s = Yield '\x0059' (CC s '\x030a' '\x0000')
+-- LATIN SMALL LETTER A WITH RIGHT HALF RING
+titleMapping '\x1e9a' s = Yield '\x0041' (CC s '\x02be' '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH PSILI
+titleMapping '\x1f50' s = Yield '\x03a5' (CC s '\x0313' '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
+titleMapping '\x1f52' s = Yield '\x03a5' (CC s '\x0313' '\x0300')
+-- GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
+titleMapping '\x1f54' s = Yield '\x03a5' (CC s '\x0313' '\x0301')
+-- GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
+titleMapping '\x1f56' s = Yield '\x03a5' (CC s '\x0313' '\x0342')
+-- GREEK SMALL LETTER ALPHA WITH PERISPOMENI
+titleMapping '\x1fb6' s = Yield '\x0391' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER ETA WITH PERISPOMENI
+titleMapping '\x1fc6' s = Yield '\x0397' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
+titleMapping '\x1fd2' s = Yield '\x0399' (CC s '\x0308' '\x0300')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+titleMapping '\x1fd3' s = Yield '\x0399' (CC s '\x0308' '\x0301')
+-- GREEK SMALL LETTER IOTA WITH PERISPOMENI
+titleMapping '\x1fd6' s = Yield '\x0399' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
+titleMapping '\x1fd7' s = Yield '\x0399' (CC s '\x0308' '\x0342')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
+titleMapping '\x1fe2' s = Yield '\x03a5' (CC s '\x0308' '\x0300')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+titleMapping '\x1fe3' s = Yield '\x03a5' (CC s '\x0308' '\x0301')
+-- GREEK SMALL LETTER RHO WITH PSILI
+titleMapping '\x1fe4' s = Yield '\x03a1' (CC s '\x0313' '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH PERISPOMENI
+titleMapping '\x1fe6' s = Yield '\x03a5' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
+titleMapping '\x1fe7' s = Yield '\x03a5' (CC s '\x0308' '\x0342')
+-- GREEK SMALL LETTER OMEGA WITH PERISPOMENI
+titleMapping '\x1ff6' s = Yield '\x03a9' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
+titleMapping '\x1fb2' s = Yield '\x1fba' (CC s '\x0345' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
+titleMapping '\x1fb4' s = Yield '\x0386' (CC s '\x0345' '\x0000')
+-- GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
+titleMapping '\x1fc2' s = Yield '\x1fca' (CC s '\x0345' '\x0000')
+-- GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
+titleMapping '\x1fc4' s = Yield '\x0389' (CC s '\x0345' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
+titleMapping '\x1ff2' s = Yield '\x1ffa' (CC s '\x0345' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
+titleMapping '\x1ff4' s = Yield '\x038f' (CC s '\x0345' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+titleMapping '\x1fb7' s = Yield '\x0391' (CC s '\x0342' '\x0345')
+-- GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+titleMapping '\x1fc7' s = Yield '\x0397' (CC s '\x0342' '\x0345')
+-- GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+titleMapping '\x1ff7' s = Yield '\x03a9' (CC s '\x0342' '\x0345')
+-- Default: simple one-to-one mapping via 'toTitle'.
+titleMapping c s = Yield (toTitle c) (CC s '\0' '\0')
+foldMapping :: forall s. Char -> s -> Step (CC s) Char
+{-# NOINLINE foldMapping #-}
+-- MICRO SIGN
+foldMapping '\x00b5' s = Yield '\x03bc' (CC s '\x0000' '\x0000')
+-- LATIN SMALL LETTER SHARP S
+foldMapping '\x00df' s = Yield '\x0073' (CC s '\x0073' '\x0000')
+-- LATIN CAPITAL LETTER I WITH DOT ABOVE
+foldMapping '\x0130' s = Yield '\x0069' (CC s '\x0307' '\x0000')
+-- LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+foldMapping '\x0149' s = Yield '\x02bc' (CC s '\x006e' '\x0000')
+-- LATIN SMALL LETTER LONG S
+foldMapping '\x017f' s = Yield '\x0073' (CC s '\x0000' '\x0000')
+-- LATIN SMALL LETTER J WITH CARON
+foldMapping '\x01f0' s = Yield '\x006a' (CC s '\x030c' '\x0000')
+-- COMBINING GREEK YPOGEGRAMMENI
+foldMapping '\x0345' s = Yield '\x03b9' (CC s '\x0000' '\x0000')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+foldMapping '\x0390' s = Yield '\x03b9' (CC s '\x0308' '\x0301')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+foldMapping '\x03b0' s = Yield '\x03c5' (CC s '\x0308' '\x0301')
+-- GREEK SMALL LETTER FINAL SIGMA
+foldMapping '\x03c2' s = Yield '\x03c3' (CC s '\x0000' '\x0000')
+-- GREEK BETA SYMBOL
+foldMapping '\x03d0' s = Yield '\x03b2' (CC s '\x0000' '\x0000')
+-- GREEK THETA SYMBOL
+foldMapping '\x03d1' s = Yield '\x03b8' (CC s '\x0000' '\x0000')
+-- GREEK PHI SYMBOL
+foldMapping '\x03d5' s = Yield '\x03c6' (CC s '\x0000' '\x0000')
+-- GREEK PI SYMBOL
+foldMapping '\x03d6' s = Yield '\x03c0' (CC s '\x0000' '\x0000')
+-- GREEK KAPPA SYMBOL
+foldMapping '\x03f0' s = Yield '\x03ba' (CC s '\x0000' '\x0000')
+-- GREEK RHO SYMBOL
+foldMapping '\x03f1' s = Yield '\x03c1' (CC s '\x0000' '\x0000')
+-- GREEK LUNATE EPSILON SYMBOL
+foldMapping '\x03f5' s = Yield '\x03b5' (CC s '\x0000' '\x0000')
+-- ARMENIAN SMALL LIGATURE ECH YIWN
+foldMapping '\x0587' s = Yield '\x0565' (CC s '\x0582' '\x0000')
+-- CHEROKEE SMALL LETTER YE
+foldMapping '\x13f8' s = Yield '\x13f0' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER YI
+foldMapping '\x13f9' s = Yield '\x13f1' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER YO
+foldMapping '\x13fa' s = Yield '\x13f2' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER YU
+foldMapping '\x13fb' s = Yield '\x13f3' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER YV
+foldMapping '\x13fc' s = Yield '\x13f4' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER MV
+foldMapping '\x13fd' s = Yield '\x13f5' (CC s '\x0000' '\x0000')
+-- CYRILLIC SMALL LETTER ROUNDED VE
+foldMapping '\x1c80' s = Yield '\x0432' (CC s '\x0000' '\x0000')
+-- CYRILLIC SMALL LETTER LONG-LEGGED DE
+foldMapping '\x1c81' s = Yield '\x0434' (CC s '\x0000' '\x0000')
+-- CYRILLIC SMALL LETTER NARROW O
+foldMapping '\x1c82' s = Yield '\x043e' (CC s '\x0000' '\x0000')
+-- CYRILLIC SMALL LETTER WIDE ES
+foldMapping '\x1c83' s = Yield '\x0441' (CC s '\x0000' '\x0000')
+-- CYRILLIC SMALL LETTER TALL TE
+foldMapping '\x1c84' s = Yield '\x0442' (CC s '\x0000' '\x0000')
+-- CYRILLIC SMALL LETTER THREE-LEGGED TE
+foldMapping '\x1c85' s = Yield '\x0442' (CC s '\x0000' '\x0000')
+-- CYRILLIC SMALL LETTER TALL HARD SIGN
+foldMapping '\x1c86' s = Yield '\x044a' (CC s '\x0000' '\x0000')
+-- CYRILLIC SMALL LETTER TALL YAT
+foldMapping '\x1c87' s = Yield '\x0463' (CC s '\x0000' '\x0000')
+-- CYRILLIC SMALL LETTER UNBLENDED UK
+foldMapping '\x1c88' s = Yield '\xa64b' (CC s '\x0000' '\x0000')
+-- LATIN SMALL LETTER H WITH LINE BELOW
+foldMapping '\x1e96' s = Yield '\x0068' (CC s '\x0331' '\x0000')
+-- LATIN SMALL LETTER T WITH DIAERESIS
+foldMapping '\x1e97' s = Yield '\x0074' (CC s '\x0308' '\x0000')
+-- LATIN SMALL LETTER W WITH RING ABOVE
+foldMapping '\x1e98' s = Yield '\x0077' (CC s '\x030a' '\x0000')
+-- LATIN SMALL LETTER Y WITH RING ABOVE
+foldMapping '\x1e99' s = Yield '\x0079' (CC s '\x030a' '\x0000')
+-- LATIN SMALL LETTER A WITH RIGHT HALF RING
+foldMapping '\x1e9a' s = Yield '\x0061' (CC s '\x02be' '\x0000')
+-- LATIN SMALL LETTER LONG S WITH DOT ABOVE
+foldMapping '\x1e9b' s = Yield '\x1e61' (CC s '\x0000' '\x0000')
+-- LATIN CAPITAL LETTER SHARP S
+foldMapping '\x1e9e' s = Yield '\x0073' (CC s '\x0073' '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH PSILI
+foldMapping '\x1f50' s = Yield '\x03c5' (CC s '\x0313' '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
+foldMapping '\x1f52' s = Yield '\x03c5' (CC s '\x0313' '\x0300')
+-- GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
+foldMapping '\x1f54' s = Yield '\x03c5' (CC s '\x0313' '\x0301')
+-- GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
+foldMapping '\x1f56' s = Yield '\x03c5' (CC s '\x0313' '\x0342')
+-- GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
+foldMapping '\x1f80' s = Yield '\x1f00' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
+foldMapping '\x1f81' s = Yield '\x1f01' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+foldMapping '\x1f82' s = Yield '\x1f02' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+foldMapping '\x1f83' s = Yield '\x1f03' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+foldMapping '\x1f84' s = Yield '\x1f04' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+foldMapping '\x1f85' s = Yield '\x1f05' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1f86' s = Yield '\x1f06' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1f87' s = Yield '\x1f07' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
+foldMapping '\x1f88' s = Yield '\x1f00' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
+foldMapping '\x1f89' s = Yield '\x1f01' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+foldMapping '\x1f8a' s = Yield '\x1f02' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+foldMapping '\x1f8b' s = Yield '\x1f03' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+foldMapping '\x1f8c' s = Yield '\x1f04' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+foldMapping '\x1f8d' s = Yield '\x1f05' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+foldMapping '\x1f8e' s = Yield '\x1f06' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+foldMapping '\x1f8f' s = Yield '\x1f07' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
+foldMapping '\x1f90' s = Yield '\x1f20' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
+foldMapping '\x1f91' s = Yield '\x1f21' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+foldMapping '\x1f92' s = Yield '\x1f22' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+foldMapping '\x1f93' s = Yield '\x1f23' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+foldMapping '\x1f94' s = Yield '\x1f24' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+foldMapping '\x1f95' s = Yield '\x1f25' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1f96' s = Yield '\x1f26' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1f97' s = Yield '\x1f27' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
+foldMapping '\x1f98' s = Yield '\x1f20' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
+foldMapping '\x1f99' s = Yield '\x1f21' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+foldMapping '\x1f9a' s = Yield '\x1f22' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+foldMapping '\x1f9b' s = Yield '\x1f23' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+foldMapping '\x1f9c' s = Yield '\x1f24' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+foldMapping '\x1f9d' s = Yield '\x1f25' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+foldMapping '\x1f9e' s = Yield '\x1f26' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+foldMapping '\x1f9f' s = Yield '\x1f27' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
+foldMapping '\x1fa0' s = Yield '\x1f60' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
+foldMapping '\x1fa1' s = Yield '\x1f61' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+foldMapping '\x1fa2' s = Yield '\x1f62' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+foldMapping '\x1fa3' s = Yield '\x1f63' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+foldMapping '\x1fa4' s = Yield '\x1f64' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+foldMapping '\x1fa5' s = Yield '\x1f65' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1fa6' s = Yield '\x1f66' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1fa7' s = Yield '\x1f67' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
+foldMapping '\x1fa8' s = Yield '\x1f60' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
+foldMapping '\x1fa9' s = Yield '\x1f61' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+foldMapping '\x1faa' s = Yield '\x1f62' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+foldMapping '\x1fab' s = Yield '\x1f63' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+foldMapping '\x1fac' s = Yield '\x1f64' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+foldMapping '\x1fad' s = Yield '\x1f65' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+foldMapping '\x1fae' s = Yield '\x1f66' (CC s '\x03b9' '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+foldMapping '\x1faf' s = Yield '\x1f67' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
+foldMapping '\x1fb2' s = Yield '\x1f70' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
+foldMapping '\x1fb3' s = Yield '\x03b1' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
+foldMapping '\x1fb4' s = Yield '\x03ac' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PERISPOMENI
+foldMapping '\x1fb6' s = Yield '\x03b1' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1fb7' s = Yield '\x03b1' (CC s '\x0342' '\x03b9')
+-- GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+foldMapping '\x1fbc' s = Yield '\x03b1' (CC s '\x03b9' '\x0000')
+-- GREEK PROSGEGRAMMENI
+foldMapping '\x1fbe' s = Yield '\x03b9' (CC s '\x0000' '\x0000')
+-- GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
+foldMapping '\x1fc2' s = Yield '\x1f74' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
+foldMapping '\x1fc3' s = Yield '\x03b7' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
+foldMapping '\x1fc4' s = Yield '\x03ae' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER ETA WITH PERISPOMENI
+foldMapping '\x1fc6' s = Yield '\x03b7' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1fc7' s = Yield '\x03b7' (CC s '\x0342' '\x03b9')
+-- GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+foldMapping '\x1fcc' s = Yield '\x03b7' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
+foldMapping '\x1fd2' s = Yield '\x03b9' (CC s '\x0308' '\x0300')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+foldMapping '\x1fd3' s = Yield '\x03b9' (CC s '\x0308' '\x0301')
+-- GREEK SMALL LETTER IOTA WITH PERISPOMENI
+foldMapping '\x1fd6' s = Yield '\x03b9' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
+foldMapping '\x1fd7' s = Yield '\x03b9' (CC s '\x0308' '\x0342')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
+foldMapping '\x1fe2' s = Yield '\x03c5' (CC s '\x0308' '\x0300')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+foldMapping '\x1fe3' s = Yield '\x03c5' (CC s '\x0308' '\x0301')
+-- GREEK SMALL LETTER RHO WITH PSILI
+foldMapping '\x1fe4' s = Yield '\x03c1' (CC s '\x0313' '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH PERISPOMENI
+foldMapping '\x1fe6' s = Yield '\x03c5' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
+foldMapping '\x1fe7' s = Yield '\x03c5' (CC s '\x0308' '\x0342')
+-- GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
+foldMapping '\x1ff2' s = Yield '\x1f7c' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
+foldMapping '\x1ff3' s = Yield '\x03c9' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
+foldMapping '\x1ff4' s = Yield '\x03ce' (CC s '\x03b9' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PERISPOMENI
+foldMapping '\x1ff6' s = Yield '\x03c9' (CC s '\x0342' '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1ff7' s = Yield '\x03c9' (CC s '\x0342' '\x03b9')
+-- GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+foldMapping '\x1ffc' s = Yield '\x03c9' (CC s '\x03b9' '\x0000')
+-- LATIN CAPITAL LETTER SMALL CAPITAL I
+foldMapping '\xa7ae' s = Yield '\x026a' (CC s '\x0000' '\x0000')
+-- LATIN CAPITAL LETTER J WITH CROSSED-TAIL
+foldMapping '\xa7b2' s = Yield '\x029d' (CC s '\x0000' '\x0000')
+-- LATIN CAPITAL LETTER CHI
+foldMapping '\xa7b3' s = Yield '\xab53' (CC s '\x0000' '\x0000')
+-- LATIN CAPITAL LETTER BETA
+foldMapping '\xa7b4' s = Yield '\xa7b5' (CC s '\x0000' '\x0000')
+-- LATIN CAPITAL LETTER OMEGA
+foldMapping '\xa7b6' s = Yield '\xa7b7' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER A
+foldMapping '\xab70' s = Yield '\x13a0' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER E
+foldMapping '\xab71' s = Yield '\x13a1' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER I
+foldMapping '\xab72' s = Yield '\x13a2' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER O
+foldMapping '\xab73' s = Yield '\x13a3' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER U
+foldMapping '\xab74' s = Yield '\x13a4' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER V
+foldMapping '\xab75' s = Yield '\x13a5' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER GA
+foldMapping '\xab76' s = Yield '\x13a6' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER KA
+foldMapping '\xab77' s = Yield '\x13a7' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER GE
+foldMapping '\xab78' s = Yield '\x13a8' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER GI
+foldMapping '\xab79' s = Yield '\x13a9' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER GO
+foldMapping '\xab7a' s = Yield '\x13aa' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER GU
+foldMapping '\xab7b' s = Yield '\x13ab' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER GV
+foldMapping '\xab7c' s = Yield '\x13ac' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER HA
+foldMapping '\xab7d' s = Yield '\x13ad' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER HE
+foldMapping '\xab7e' s = Yield '\x13ae' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER HI
+foldMapping '\xab7f' s = Yield '\x13af' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER HO
+foldMapping '\xab80' s = Yield '\x13b0' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER HU
+foldMapping '\xab81' s = Yield '\x13b1' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER HV
+foldMapping '\xab82' s = Yield '\x13b2' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER LA
+foldMapping '\xab83' s = Yield '\x13b3' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER LE
+foldMapping '\xab84' s = Yield '\x13b4' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER LI
+foldMapping '\xab85' s = Yield '\x13b5' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER LO
+foldMapping '\xab86' s = Yield '\x13b6' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER LU
+foldMapping '\xab87' s = Yield '\x13b7' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER LV
+foldMapping '\xab88' s = Yield '\x13b8' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER MA
+foldMapping '\xab89' s = Yield '\x13b9' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER ME
+foldMapping '\xab8a' s = Yield '\x13ba' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER MI
+foldMapping '\xab8b' s = Yield '\x13bb' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER MO
+foldMapping '\xab8c' s = Yield '\x13bc' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER MU
+foldMapping '\xab8d' s = Yield '\x13bd' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER NA
+foldMapping '\xab8e' s = Yield '\x13be' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER HNA
+foldMapping '\xab8f' s = Yield '\x13bf' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER NAH
+foldMapping '\xab90' s = Yield '\x13c0' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER NE
+foldMapping '\xab91' s = Yield '\x13c1' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER NI
+foldMapping '\xab92' s = Yield '\x13c2' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER NO
+foldMapping '\xab93' s = Yield '\x13c3' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER NU
+foldMapping '\xab94' s = Yield '\x13c4' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER NV
+foldMapping '\xab95' s = Yield '\x13c5' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER QUA
+foldMapping '\xab96' s = Yield '\x13c6' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER QUE
+foldMapping '\xab97' s = Yield '\x13c7' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER QUI
+foldMapping '\xab98' s = Yield '\x13c8' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER QUO
+foldMapping '\xab99' s = Yield '\x13c9' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER QUU
+foldMapping '\xab9a' s = Yield '\x13ca' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER QUV
+foldMapping '\xab9b' s = Yield '\x13cb' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER SA
+foldMapping '\xab9c' s = Yield '\x13cc' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER S
+foldMapping '\xab9d' s = Yield '\x13cd' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER SE
+foldMapping '\xab9e' s = Yield '\x13ce' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER SI
+foldMapping '\xab9f' s = Yield '\x13cf' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER SO
+foldMapping '\xaba0' s = Yield '\x13d0' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER SU
+foldMapping '\xaba1' s = Yield '\x13d1' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER SV
+foldMapping '\xaba2' s = Yield '\x13d2' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER DA
+foldMapping '\xaba3' s = Yield '\x13d3' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TA
+foldMapping '\xaba4' s = Yield '\x13d4' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER DE
+foldMapping '\xaba5' s = Yield '\x13d5' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TE
+foldMapping '\xaba6' s = Yield '\x13d6' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER DI
+foldMapping '\xaba7' s = Yield '\x13d7' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TI
+foldMapping '\xaba8' s = Yield '\x13d8' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER DO
+foldMapping '\xaba9' s = Yield '\x13d9' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER DU
+foldMapping '\xabaa' s = Yield '\x13da' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER DV
+foldMapping '\xabab' s = Yield '\x13db' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER DLA
+foldMapping '\xabac' s = Yield '\x13dc' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TLA
+foldMapping '\xabad' s = Yield '\x13dd' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TLE
+foldMapping '\xabae' s = Yield '\x13de' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TLI
+foldMapping '\xabaf' s = Yield '\x13df' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TLO
+foldMapping '\xabb0' s = Yield '\x13e0' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TLU
+foldMapping '\xabb1' s = Yield '\x13e1' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TLV
+foldMapping '\xabb2' s = Yield '\x13e2' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TSA
+foldMapping '\xabb3' s = Yield '\x13e3' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TSE
+foldMapping '\xabb4' s = Yield '\x13e4' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TSI
+foldMapping '\xabb5' s = Yield '\x13e5' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TSO
+foldMapping '\xabb6' s = Yield '\x13e6' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TSU
+foldMapping '\xabb7' s = Yield '\x13e7' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER TSV
+foldMapping '\xabb8' s = Yield '\x13e8' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER WA
+foldMapping '\xabb9' s = Yield '\x13e9' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER WE
+foldMapping '\xabba' s = Yield '\x13ea' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER WI
+foldMapping '\xabbb' s = Yield '\x13eb' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER WO
+foldMapping '\xabbc' s = Yield '\x13ec' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER WU
+foldMapping '\xabbd' s = Yield '\x13ed' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER WV
+foldMapping '\xabbe' s = Yield '\x13ee' (CC s '\x0000' '\x0000')
+-- CHEROKEE SMALL LETTER YA
+foldMapping '\xabbf' s = Yield '\x13ef' (CC s '\x0000' '\x0000')
+-- LATIN SMALL LIGATURE FF
+foldMapping '\xfb00' s = Yield '\x0066' (CC s '\x0066' '\x0000')
+-- LATIN SMALL LIGATURE FI
+foldMapping '\xfb01' s = Yield '\x0066' (CC s '\x0069' '\x0000')
+-- LATIN SMALL LIGATURE FL
+foldMapping '\xfb02' s = Yield '\x0066' (CC s '\x006c' '\x0000')
+-- LATIN SMALL LIGATURE FFI
+foldMapping '\xfb03' s = Yield '\x0066' (CC s '\x0066' '\x0069')
+-- LATIN SMALL LIGATURE FFL
+foldMapping '\xfb04' s = Yield '\x0066' (CC s '\x0066' '\x006c')
+-- LATIN SMALL LIGATURE LONG S T
+foldMapping '\xfb05' s = Yield '\x0073' (CC s '\x0074' '\x0000')
+-- LATIN SMALL LIGATURE ST
+foldMapping '\xfb06' s = Yield '\x0073' (CC s '\x0074' '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN NOW
+foldMapping '\xfb13' s = Yield '\x0574' (CC s '\x0576' '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN ECH
+foldMapping '\xfb14' s = Yield '\x0574' (CC s '\x0565' '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN INI
+foldMapping '\xfb15' s = Yield '\x0574' (CC s '\x056b' '\x0000')
+-- ARMENIAN SMALL LIGATURE VEW NOW
+foldMapping '\xfb16' s = Yield '\x057e' (CC s '\x0576' '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN XEH
+foldMapping '\xfb17' s = Yield '\x0574' (CC s '\x056d' '\x0000')
+-- OSAGE CAPITAL LETTER A
+foldMapping '\x104b0' s = Yield '\x104d8' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER AI
+foldMapping '\x104b1' s = Yield '\x104d9' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER AIN
+foldMapping '\x104b2' s = Yield '\x104da' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER AH
+foldMapping '\x104b3' s = Yield '\x104db' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER BRA
+foldMapping '\x104b4' s = Yield '\x104dc' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER CHA
+foldMapping '\x104b5' s = Yield '\x104dd' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER EHCHA
+foldMapping '\x104b6' s = Yield '\x104de' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER E
+foldMapping '\x104b7' s = Yield '\x104df' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER EIN
+foldMapping '\x104b8' s = Yield '\x104e0' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER HA
+foldMapping '\x104b9' s = Yield '\x104e1' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER HYA
+foldMapping '\x104ba' s = Yield '\x104e2' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER I
+foldMapping '\x104bb' s = Yield '\x104e3' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER KA
+foldMapping '\x104bc' s = Yield '\x104e4' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER EHKA
+foldMapping '\x104bd' s = Yield '\x104e5' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER KYA
+foldMapping '\x104be' s = Yield '\x104e6' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER LA
+foldMapping '\x104bf' s = Yield '\x104e7' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER MA
+foldMapping '\x104c0' s = Yield '\x104e8' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER NA
+foldMapping '\x104c1' s = Yield '\x104e9' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER O
+foldMapping '\x104c2' s = Yield '\x104ea' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER OIN
+foldMapping '\x104c3' s = Yield '\x104eb' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER PA
+foldMapping '\x104c4' s = Yield '\x104ec' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER EHPA
+foldMapping '\x104c5' s = Yield '\x104ed' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER SA
+foldMapping '\x104c6' s = Yield '\x104ee' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER SHA
+foldMapping '\x104c7' s = Yield '\x104ef' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER TA
+foldMapping '\x104c8' s = Yield '\x104f0' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER EHTA
+foldMapping '\x104c9' s = Yield '\x104f1' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER TSA
+foldMapping '\x104ca' s = Yield '\x104f2' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER EHTSA
+foldMapping '\x104cb' s = Yield '\x104f3' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER TSHA
+foldMapping '\x104cc' s = Yield '\x104f4' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER DHA
+foldMapping '\x104cd' s = Yield '\x104f5' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER U
+foldMapping '\x104ce' s = Yield '\x104f6' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER WA
+foldMapping '\x104cf' s = Yield '\x104f7' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER KHA
+foldMapping '\x104d0' s = Yield '\x104f8' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER GHA
+foldMapping '\x104d1' s = Yield '\x104f9' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER ZA
+foldMapping '\x104d2' s = Yield '\x104fa' (CC s '\x0000' '\x0000')
+-- OSAGE CAPITAL LETTER ZHA
+foldMapping '\x104d3' s = Yield '\x104fb' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER A
+foldMapping '\x10c80' s = Yield '\x10cc0' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER AA
+foldMapping '\x10c81' s = Yield '\x10cc1' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EB
+foldMapping '\x10c82' s = Yield '\x10cc2' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER AMB
+foldMapping '\x10c83' s = Yield '\x10cc3' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EC
+foldMapping '\x10c84' s = Yield '\x10cc4' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ENC
+foldMapping '\x10c85' s = Yield '\x10cc5' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ECS
+foldMapping '\x10c86' s = Yield '\x10cc6' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ED
+foldMapping '\x10c87' s = Yield '\x10cc7' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER AND
+foldMapping '\x10c88' s = Yield '\x10cc8' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER E
+foldMapping '\x10c89' s = Yield '\x10cc9' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER CLOSE E
+foldMapping '\x10c8a' s = Yield '\x10cca' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EE
+foldMapping '\x10c8b' s = Yield '\x10ccb' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EF
+foldMapping '\x10c8c' s = Yield '\x10ccc' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EG
+foldMapping '\x10c8d' s = Yield '\x10ccd' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EGY
+foldMapping '\x10c8e' s = Yield '\x10cce' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EH
+foldMapping '\x10c8f' s = Yield '\x10ccf' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER I
+foldMapping '\x10c90' s = Yield '\x10cd0' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER II
+foldMapping '\x10c91' s = Yield '\x10cd1' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EJ
+foldMapping '\x10c92' s = Yield '\x10cd2' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EK
+foldMapping '\x10c93' s = Yield '\x10cd3' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER AK
+foldMapping '\x10c94' s = Yield '\x10cd4' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER UNK
+foldMapping '\x10c95' s = Yield '\x10cd5' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EL
+foldMapping '\x10c96' s = Yield '\x10cd6' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ELY
+foldMapping '\x10c97' s = Yield '\x10cd7' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EM
+foldMapping '\x10c98' s = Yield '\x10cd8' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EN
+foldMapping '\x10c99' s = Yield '\x10cd9' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ENY
+foldMapping '\x10c9a' s = Yield '\x10cda' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER O
+foldMapping '\x10c9b' s = Yield '\x10cdb' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER OO
+foldMapping '\x10c9c' s = Yield '\x10cdc' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG OE
+foldMapping '\x10c9d' s = Yield '\x10cdd' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER RUDIMENTA OE
+foldMapping '\x10c9e' s = Yield '\x10cde' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER OEE
+foldMapping '\x10c9f' s = Yield '\x10cdf' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EP
+foldMapping '\x10ca0' s = Yield '\x10ce0' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EMP
+foldMapping '\x10ca1' s = Yield '\x10ce1' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ER
+foldMapping '\x10ca2' s = Yield '\x10ce2' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER SHORT ER
+foldMapping '\x10ca3' s = Yield '\x10ce3' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ES
+foldMapping '\x10ca4' s = Yield '\x10ce4' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ESZ
+foldMapping '\x10ca5' s = Yield '\x10ce5' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ET
+foldMapping '\x10ca6' s = Yield '\x10ce6' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ENT
+foldMapping '\x10ca7' s = Yield '\x10ce7' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ETY
+foldMapping '\x10ca8' s = Yield '\x10ce8' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ECH
+foldMapping '\x10ca9' s = Yield '\x10ce9' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER U
+foldMapping '\x10caa' s = Yield '\x10cea' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER UU
+foldMapping '\x10cab' s = Yield '\x10ceb' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG UE
+foldMapping '\x10cac' s = Yield '\x10cec' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER RUDIMENTA UE
+foldMapping '\x10cad' s = Yield '\x10ced' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EV
+foldMapping '\x10cae' s = Yield '\x10cee' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EZ
+foldMapping '\x10caf' s = Yield '\x10cef' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER EZS
+foldMapping '\x10cb0' s = Yield '\x10cf0' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER ENT-SHAPED SIGN
+foldMapping '\x10cb1' s = Yield '\x10cf1' (CC s '\x0000' '\x0000')
+-- OLD HUNGARIAN CAPITAL LETTER US
+foldMapping '\x10cb2' s = Yield '\x10cf2' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER ALIF
+foldMapping '\x1e900' s = Yield '\x1e922' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER DAALI
+foldMapping '\x1e901' s = Yield '\x1e923' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER LAAM
+foldMapping '\x1e902' s = Yield '\x1e924' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER MIIM
+foldMapping '\x1e903' s = Yield '\x1e925' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER BA
+foldMapping '\x1e904' s = Yield '\x1e926' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER SINNYIIYHE
+foldMapping '\x1e905' s = Yield '\x1e927' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER PE
+foldMapping '\x1e906' s = Yield '\x1e928' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER BHE
+foldMapping '\x1e907' s = Yield '\x1e929' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER RA
+foldMapping '\x1e908' s = Yield '\x1e92a' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER E
+foldMapping '\x1e909' s = Yield '\x1e92b' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER FA
+foldMapping '\x1e90a' s = Yield '\x1e92c' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER I
+foldMapping '\x1e90b' s = Yield '\x1e92d' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER O
+foldMapping '\x1e90c' s = Yield '\x1e92e' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER DHA
+foldMapping '\x1e90d' s = Yield '\x1e92f' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER YHE
+foldMapping '\x1e90e' s = Yield '\x1e930' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER WAW
+foldMapping '\x1e90f' s = Yield '\x1e931' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER NUN
+foldMapping '\x1e910' s = Yield '\x1e932' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER KAF
+foldMapping '\x1e911' s = Yield '\x1e933' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER YA
+foldMapping '\x1e912' s = Yield '\x1e934' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER U
+foldMapping '\x1e913' s = Yield '\x1e935' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER JIIM
+foldMapping '\x1e914' s = Yield '\x1e936' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER CHI
+foldMapping '\x1e915' s = Yield '\x1e937' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER HA
+foldMapping '\x1e916' s = Yield '\x1e938' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER QAAF
+foldMapping '\x1e917' s = Yield '\x1e939' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER GA
+foldMapping '\x1e918' s = Yield '\x1e93a' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER NYA
+foldMapping '\x1e919' s = Yield '\x1e93b' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER TU
+foldMapping '\x1e91a' s = Yield '\x1e93c' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER NHA
+foldMapping '\x1e91b' s = Yield '\x1e93d' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER VA
+foldMapping '\x1e91c' s = Yield '\x1e93e' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER KHA
+foldMapping '\x1e91d' s = Yield '\x1e93f' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER GBE
+foldMapping '\x1e91e' s = Yield '\x1e940' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER ZAL
+foldMapping '\x1e91f' s = Yield '\x1e941' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER KPO
+foldMapping '\x1e920' s = Yield '\x1e942' (CC s '\x0000' '\x0000')
+-- ADLAM CAPITAL LETTER SHA
+foldMapping '\x1e921' s = Yield '\x1e943' (CC s '\x0000' '\x0000')
+foldMapping c s = Yield (toLower c) (CC s '\0' '\0')
diff --git a/Data/Text/Internal/Fusion/Common.hs b/Data/Text/Internal/Fusion/Common.hs
new file mode 100644
index 0000000..15fbab0
--- /dev/null
+++ b/Data/Text/Internal/Fusion/Common.hs
@@ -0,0 +1,945 @@
+{-# LANGUAGE PatternGuards, BangPatterns, MagicHash, Rank2Types #-}
+-- |
+-- Module : Data.Text.Internal.Fusion.Common
+-- Copyright : (c) Bryan O'Sullivan 2009, 2012
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Common stream fusion functionality for text.
+
+module Data.Text.Internal.Fusion.Common
+ (
+ -- * Creation and elimination
+ singleton
+ , streamList
+ , unstreamList
+ , streamCString#
+
+ -- * Basic interface
+ , cons
+ , snoc
+ , append
+ , head
+ , uncons
+ , last
+ , tail
+ , init
+ , null
+ , lengthI
+ , compareLengthI
+ , isSingleton
+
+ -- * Transformations
+ , map
+ , intercalate
+ , intersperse
+
+ -- ** Case conversion
+ -- $case
+ , toCaseFold
+ , toLower
+ , toTitle
+ , toUpper
+
+ -- ** Justification
+ , justifyLeftI
+
+ -- * Folds
+ , foldl
+ , foldl'
+ , foldl1
+ , foldl1'
+ , foldr
+ , foldr1
+
+ -- ** Special folds
+ , concat
+ , concatMap
+ , any
+ , all
+ , maximum
+ , minimum
+
+ -- * Construction
+ -- ** Scans
+ , scanl
+
+ -- ** Generation and unfolding
+ , replicateCharI
+ , replicateI
+ , unfoldr
+ , unfoldrNI
+
+ -- * Substrings
+ -- ** Breaking strings
+ , take
+ , drop
+ , takeWhile
+ , dropWhile
+
+ -- * Predicates
+ , isPrefixOf
+
+ -- * Searching
+ , elem
+ , filter
+
+ -- * Indexing
+ , findBy
+ , indexI
+ , findIndexI
+ , countCharI
+
+ -- * Zipping and unzipping
+ , zipWith
+ ) where
+
+import Prelude (Bool(..), Char, Eq(..), Int, Integral, Maybe(..),
+ Ord(..), Ordering(..), String, (.), ($), (+), (-), (*), (++),
+ (&&), fromIntegral, otherwise)
+import qualified Data.List as L
+import qualified Prelude as P
+import Data.Bits (shiftL)
+import Data.Char (isLetter, isSpace)
+import Data.Int (Int64)
+import Data.Text.Internal.Fusion.Types
+import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, titleMapping,
+ upperMapping)
+import Data.Text.Internal.Fusion.Size
+import GHC.Prim (Addr#, chr#, indexCharOffAddr#, ord#)
+import GHC.Types (Char(..), Int(..))
+
+singleton :: Char -> Stream Char
+singleton c = Stream next False (codePointsSize 1)
+ where next False = Yield c True
+ next True = Done
+{-# INLINE [0] singleton #-}
+
+streamList :: [a] -> Stream a
+{-# INLINE [0] streamList #-}
+streamList s = Stream next s unknownSize
+ where next [] = Done
+ next (x:xs) = Yield x xs
+
+unstreamList :: Stream a -> [a]
+unstreamList (Stream next s0 _len) = unfold s0
+ where unfold !s = case next s of
+ Done -> []
+ Skip s' -> unfold s'
+ Yield x s' -> x : unfold s'
+{-# INLINE [0] unstreamList #-}
+
+{-# RULES "STREAM streamList/unstreamList fusion" forall s. streamList (unstreamList s) = s #-}
+
+-- | Stream the UTF-8-like packed encoding used by GHC to represent
+-- constant strings in generated code.
+--
+-- This encoding uses the byte sequence "\xc0\x80" to represent NUL,
+-- and the string is NUL-terminated.
+streamCString# :: Addr# -> Stream Char
+streamCString# addr = Stream step 0 unknownSize
+ where
+ step !i
+ | b == 0 = Done
+ | b <= 0x7f = Yield (C# b#) (i+1)
+ | b <= 0xdf = let !c = chr $ ((b-0xc0) `shiftL` 6) + next 1
+ in Yield c (i+2)
+ | b <= 0xef = let !c = chr $ ((b-0xe0) `shiftL` 12) +
+ (next 1 `shiftL` 6) +
+ next 2
+ in Yield c (i+3)
+ | otherwise = let !c = chr $ ((b-0xf0) `shiftL` 18) +
+ (next 1 `shiftL` 12) +
+ (next 2 `shiftL` 6) +
+ next 3
+ in Yield c (i+4)
+ where b = I# (ord# b#)
+ next n = I# (ord# (at# (i+n))) - 0x80
+ !b# = at# i
+ at# (I# i#) = indexCharOffAddr# addr i#
+ chr (I# i#) = C# (chr# i#)
+{-# INLINE [0] streamCString# #-}
+
+-- ----------------------------------------------------------------------------
+-- * Basic stream functions
+
+data C s = C0 !s
+ | C1 !s
+
+-- | /O(n)/ Adds a character to the front of a Stream Char.
+cons :: Char -> Stream Char -> Stream Char
+cons !w (Stream next0 s0 len) = Stream next (C1 s0) (len + codePointsSize 1)
+ where
+ next (C1 s) = Yield w (C0 s)
+ next (C0 s) = case next0 s of
+ Done -> Done
+ Skip s' -> Skip (C0 s')
+ Yield x s' -> Yield x (C0 s')
+{-# INLINE [0] cons #-}
+
+data Snoc a = N
+ | J !a
+
+-- | /O(n)/ Adds a character to the end of a stream.
+snoc :: Stream Char -> Char -> Stream Char
+snoc (Stream next0 xs0 len) w = Stream next (J xs0) (len + codePointsSize 1)
+ where
+ next (J xs) = case next0 xs of
+ Done -> Yield w N
+ Skip xs' -> Skip (J xs')
+ Yield x xs' -> Yield x (J xs')
+ next N = Done
+{-# INLINE [0] snoc #-}
+
+data E l r = L !l
+ | R !r
+
+-- | /O(n)/ Appends one Stream to the other.
+append :: Stream Char -> Stream Char -> Stream Char
+append (Stream next0 s01 len1) (Stream next1 s02 len2) =
+ Stream next (L s01) (len1 + len2)
+ where
+ next (L s1) = case next0 s1 of
+ Done -> Skip (R s02)
+ Skip s1' -> Skip (L s1')
+ Yield x s1' -> Yield x (L s1')
+ next (R s2) = case next1 s2 of
+ Done -> Done
+ Skip s2' -> Skip (R s2')
+ Yield x s2' -> Yield x (R s2')
+{-# INLINE [0] append #-}
+
+-- | /O(1)/ Returns the first character of a Text, which must be non-empty.
+-- Subject to array fusion.
+head :: Stream Char -> Char
+head (Stream next s0 _len) = loop_head s0
+ where
+ loop_head !s = case next s of
+ Yield x _ -> x
+ Skip s' -> loop_head s'
+ Done -> head_empty
+{-# INLINE [0] head #-}
+
+head_empty :: a
+head_empty = streamError "head" "Empty stream"
+{-# NOINLINE head_empty #-}
+
+-- | /O(1)/ Returns the first character and remainder of a 'Stream
+-- Char', or 'Nothing' if empty. Subject to array fusion.
+uncons :: Stream Char -> Maybe (Char, Stream Char)
+uncons (Stream next s0 len) = loop_uncons s0
+ where
+ loop_uncons !s = case next s of
+ Yield x s1 -> Just (x, Stream next s1 (len - codePointsSize 1))
+ Skip s' -> loop_uncons s'
+ Done -> Nothing
+{-# INLINE [0] uncons #-}
+
+-- | /O(n)/ Returns the last character of a 'Stream Char', which must
+-- be non-empty.
+last :: Stream Char -> Char
+last (Stream next s0 _len) = loop0_last s0
+ where
+ loop0_last !s = case next s of
+ Done -> emptyError "last"
+ Skip s' -> loop0_last s'
+ Yield x s' -> loop_last x s'
+ loop_last !x !s = case next s of
+ Done -> x
+ Skip s' -> loop_last x s'
+ Yield x' s' -> loop_last x' s'
+{-# INLINE[0] last #-}
+
+-- | /O(1)/ Returns all characters after the head of a Stream Char, which must
+-- be non-empty.
+tail :: Stream Char -> Stream Char
+tail (Stream next0 s0 len) = Stream next (C0 s0) (len - codePointsSize 1)
+ where
+ next (C0 s) = case next0 s of
+ Done -> emptyError "tail"
+ Skip s' -> Skip (C0 s')
+ Yield _ s' -> Skip (C1 s')
+ next (C1 s) = case next0 s of
+ Done -> Done
+ Skip s' -> Skip (C1 s')
+ Yield x s' -> Yield x (C1 s')
+{-# INLINE [0] tail #-}
+
+data Init s = Init0 !s
+ | Init1 {-# UNPACK #-} !Char !s
+
+-- | /O(1)/ Returns all but the last character of a Stream Char, which
+-- must be non-empty.
+init :: Stream Char -> Stream Char
+init (Stream next0 s0 len) = Stream next (Init0 s0) (len - codePointsSize 1)
+ where
+ next (Init0 s) = case next0 s of
+ Done -> emptyError "init"
+ Skip s' -> Skip (Init0 s')
+ Yield x s' -> Skip (Init1 x s')
+ next (Init1 x s) = case next0 s of
+ Done -> Done
+ Skip s' -> Skip (Init1 x s')
+ Yield x' s' -> Yield x (Init1 x' s')
+{-# INLINE [0] init #-}
+
+-- | /O(1)/ Tests whether a Stream Char is empty or not.
+null :: Stream Char -> Bool
+null (Stream next s0 _len) = loop_null s0
+ where
+ loop_null !s = case next s of
+ Done -> True
+ Yield _ _ -> False
+ Skip s' -> loop_null s'
+{-# INLINE[0] null #-}
+
+-- | /O(n)/ Returns the number of characters in a string.
+lengthI :: Integral a => Stream Char -> a
+lengthI (Stream next s0 _len) = loop_length 0 s0
+ where
+ loop_length !z s = case next s of
+ Done -> z
+ Skip s' -> loop_length z s'
+ Yield _ s' -> loop_length (z + 1) s'
+{-# INLINE[0] lengthI #-}
+
+-- | /O(n)/ Compares the count of characters in a string to a number.
+-- Subject to fusion.
+--
+-- This function gives the same answer as comparing against the result
+-- of 'lengthI', but can short circuit if the count of characters is
+-- greater than the number or if the stream can't possibly be as long
+-- as the number supplied, and hence be more efficient.
+compareLengthI :: Integral a => Stream Char -> a -> Ordering
+compareLengthI (Stream next s0 len) n
+ -- Note that @len@ tracks code units whereas we want to compare the length
+ -- in code points. Specifically, a stream with hint @len@ may consist of
+ -- anywhere from @len/2@ to @len@ code points.
+ | Just r <- compareSize len n' = r
+ | otherwise = loop_cmp 0 s0
+ where
+ n' = codePointsSize $ fromIntegral n
+ loop_cmp !z s = case next s of
+ Done -> compare z n
+ Skip s' -> loop_cmp z s'
+ Yield _ s' | z > n -> GT
+ | otherwise -> loop_cmp (z + 1) s'
+{-# INLINE[0] compareLengthI #-}
+
+-- | /O(n)/ Indicate whether a string contains exactly one element.
+-- Short circuits as soon as a second character is seen.
+isSingleton :: Stream Char -> Bool
+isSingleton (Stream next s0 _len) = loop 0 s0
+    where
+      loop !z s = case next s of
+        Done          -> z == (1::Int)
+        Skip s'       -> loop z s'
+        Yield _ s'
+          | z >= 1    -> False
+          | otherwise -> loop (z+1) s'
+{-# INLINE[0] isSingleton #-}
+
+-- ----------------------------------------------------------------------------
+-- * Stream transformations
+
+-- | /O(n)/ 'map' @f xs@ is the Stream Char obtained by applying @f@
+-- to each element of @xs@.
+map :: (Char -> Char) -> Stream Char -> Stream Char
+map f (Stream next0 s0 len) = Stream next s0 len
+    where
+      next !s = case next0 s of
+        Done       -> Done
+        Skip s'    -> Skip s'
+        Yield x s' -> Yield (f x) s'
+{-# INLINE [0] map #-}
+
+-- Collapse two adjacent maps into a single traversal.
+{-#
+  RULES "STREAM map/map fusion" forall f g s.
+     map f (map g s) = map (\x -> f (g x)) s
+ #-}
+
+-- | Internal state for 'intersperse': before the first character
+-- ('I1'), holding a buffered character to emit ('I2'), or between
+-- characters ('I3').
+data I s = I1 !s
+         | I2 !s {-# UNPACK #-} !Char
+         | I3 !s
+
+-- | /O(n)/ Take a character and place it between each of the
+-- characters of a 'Stream Char'.
+intersperse :: Char -> Stream Char -> Stream Char
+intersperse c (Stream next0 s0 len) = Stream next (I1 s0) (len + unknownSize)
+    where
+      next (I1 s) = case next0 s of
+        Done       -> Done
+        Skip s'    -> Skip (I1 s')
+        Yield x s' -> Skip (I2 s' x)    -- buffer first char; no separator before it
+      next (I2 s x) = Yield x (I3 s)
+      next (I3 s) = case next0 s of
+        Done       -> Done
+        Skip s'    -> Skip (I3 s')
+        Yield x s' -> Yield c (I2 s' x) -- emit separator, buffer the next char
+{-# INLINE [0] intersperse #-}
+
+-- ----------------------------------------------------------------------------
+-- ** Case conversions (folds)
+
+-- $case
+--
+-- With Unicode text, it is incorrect to use combinators like @map
+-- toUpper@ to case convert each character of a string individually.
+-- Instead, use the whole-string case conversion functions from this
+-- module. For correctness in different writing systems, these
+-- functions may map one input character to two or three output
+-- characters.
+
+-- | Map a 'Stream' through the given case-mapping function.
+--
+-- The 'CC' state buffers up to two extra output characters, because a
+-- single input character may case-map to several output characters
+-- (hence the @3*len@ upper bound on the size hint).  A buffered slot
+-- holding @'\0'@ means "empty".
+caseConvert :: (forall s. Char -> s -> Step (CC s) Char)
+            -> Stream Char -> Stream Char
+caseConvert remap (Stream next0 s0 len) =
+    Stream next (CC s0 '\0' '\0') (len `unionSize` 3*len)
+  where
+    -- No buffered characters: pull from the underlying stream.
+    next (CC s '\0' _) =
+        case next0 s of
+          Done       -> Done
+          Skip s'    -> Skip (CC s' '\0' '\0')
+          Yield c s' -> remap c s'
+    -- Flush one buffered character, shifting the remaining one down.
+    next (CC s a b) = Yield a (CC s b '\0')
+
+-- | /O(n)/ Convert a string to folded case.  This function is mainly
+-- useful for performing caseless (or case insensitive) string
+-- comparisons.
+--
+-- A string @x@ is a caseless match for a string @y@ if and only if:
+--
+-- @toCaseFold x == toCaseFold y@
+--
+-- The result string may be longer than the input string, and may
+-- differ from applying 'toLower' to the input string.  For instance,
+-- the Armenian small ligature men now (U+FB13) is case folded to the
+-- bigram men now (U+0574 U+0576), while the micro sign (U+00B5) is
+-- case folded to the Greek small letter mu (U+03BC) instead of
+-- itself.
+toCaseFold :: Stream Char -> Stream Char
+toCaseFold = caseConvert foldMapping
+{-# INLINE [0] toCaseFold #-}
+
+-- | /O(n)/ Convert a string to upper case, using simple case
+-- conversion.  The result string may be longer than the input string.
+-- For instance, the German eszett (U+00DF) maps to the two-letter
+-- sequence SS.
+toUpper :: Stream Char -> Stream Char
+toUpper = caseConvert upperMapping
+{-# INLINE [0] toUpper #-}
+
+-- | /O(n)/ Convert a string to lower case, using simple case
+-- conversion.  The result string may be longer than the input string.
+-- For instance, the Latin capital letter I with dot above (U+0130)
+-- maps to the sequence Latin small letter i (U+0069) followed by
+-- combining dot above (U+0307).
+toLower :: Stream Char -> Stream Char
+toLower = caseConvert lowerMapping
+{-# INLINE [0] toLower #-}
+
+-- | /O(n)/ Convert a string to title case, using simple case
+-- conversion.
+--
+-- The first letter of the input is converted to title case, as is
+-- every subsequent letter that immediately follows a non-letter.
+-- Every letter that immediately follows another letter is converted
+-- to lower case.
+--
+-- The result string may be longer than the input string.  For example,
+-- the Latin small ligature fl (U+FB02) is converted to the
+-- sequence Latin capital letter F (U+0046) followed by Latin small
+-- letter l (U+006C).
+--
+-- /Note/: this function does not take language or culture specific
+-- rules into account.  For instance, in English, different style
+-- guides disagree on whether the book name \"The Hill of the Red
+-- Fox\" is correctly title cased -- but this function will
+-- capitalize /every/ word.
+toTitle :: Stream Char -> Stream Char
+toTitle (Stream next0 s0 len) = Stream next (CC (False :*: s0) '\0' '\0') (len + unknownSize)
+  where
+    -- The Bool flag records whether the preceding character counted as
+    -- part of a word (see the guards below), which decides between
+    -- title-casing and lower-casing the current character.
+    next (CC (letter :*: s) '\0' _) =
+      case next0 s of
+        Done          -> Done
+        Skip s'       -> Skip (CC (letter :*: s') '\0' '\0')
+        Yield c s'
+          | nonSpace  -> if letter
+                         then lowerMapping c (nonSpace :*: s')
+                         else titleMapping c (letter' :*: s')
+          | otherwise -> Yield c (CC (letter' :*: s') '\0' '\0')
+          where nonSpace = P.not (isSpace c)
+                letter'  = isLetter c
+    -- Flush a character buffered by a multi-character case mapping.
+    next (CC s a b) = Yield a (CC s b '\0')
+{-# INLINE [0] toTitle #-}
+
+-- | Internal state for 'justifyLeftI': 'Just1' counts characters
+-- copied from the input, 'Just2' counts total width while padding.
+data Justify i s = Just1 !i !s
+                 | Just2 !i !s
+
+-- | Left-justify the stream to width @k@, padding on the right with
+-- @c@.  The size hint is the larger of the padded and unpadded sizes.
+justifyLeftI :: Integral a => a -> Char -> Stream Char -> Stream Char
+justifyLeftI k c (Stream next0 s0 len) =
+    Stream next (Just1 0 s0) (larger (fromIntegral k * charSize c + len) len)
+    where
+      next (Just1 n s) =
+          case next0 s of
+            Done       -> next (Just2 n s)
+            Skip s'    -> Skip (Just1 n s')
+            Yield x s' -> Yield x (Just1 (n+1) s')
+      next (Just2 n s)
+          | n < k     = Yield c (Just2 (n+1) s)
+          | otherwise = Done
+      {-# INLINE next #-}
+{-# INLINE [0] justifyLeftI #-}
+
+-- ----------------------------------------------------------------------------
+-- * Reducing Streams (folds)
+
+-- | foldl, applied to a binary operator, a starting value (typically the
+-- left-identity of the operator), and a Stream, reduces the Stream using the
+-- binary operator, from left to right.  The accumulator is /not/
+-- forced at each step, so thunks can accumulate; see 'foldl''.
+foldl :: (b -> Char -> b) -> b -> Stream Char -> b
+foldl f z0 (Stream next s0 _len) = loop_foldl z0 s0
+    where
+      loop_foldl z !s = case next s of
+        Done       -> z
+        Skip s'    -> loop_foldl z s'
+        Yield x s' -> loop_foldl (f z x) s'
+{-# INLINE [0] foldl #-}
+
+-- | A strict version of foldl: the accumulator is forced at each step.
+foldl' :: (b -> Char -> b) -> b -> Stream Char -> b
+foldl' f z0 (Stream next s0 _len) = loop_foldl' z0 s0
+    where
+      loop_foldl' !z !s = case next s of
+        Done       -> z
+        Skip s'    -> loop_foldl' z s'
+        Yield x s' -> loop_foldl' (f z x) s'
+{-# INLINE [0] foldl' #-}
+
+-- | foldl1 is a variant of foldl that has no starting value argument,
+-- and thus must be applied to non-empty Streams.  Calls 'emptyError'
+-- on an empty input.
+foldl1 :: (Char -> Char -> Char) -> Stream Char -> Char
+foldl1 f (Stream next s0 _len) = loop0_foldl1 s0
+    where
+      -- Scan for the first character, which seeds the accumulator.
+      loop0_foldl1 !s = case next s of
+        Skip s'    -> loop0_foldl1 s'
+        Yield x s' -> loop_foldl1 x s'
+        Done       -> emptyError "foldl1"
+      loop_foldl1 z !s = case next s of
+        Done       -> z
+        Skip s'    -> loop_foldl1 z s'
+        Yield x s' -> loop_foldl1 (f z x) s'
+{-# INLINE [0] foldl1 #-}
+
+-- | A strict version of foldl1: the accumulator is forced at each
+-- step.  Must be applied to a non-empty Stream; calls 'emptyError'
+-- otherwise.
+foldl1' :: (Char -> Char -> Char) -> Stream Char -> Char
+foldl1' f (Stream next s0 _len) = loop0_foldl1' s0
+    where
+      -- Scan for the first character, which seeds the accumulator.
+      loop0_foldl1' !s = case next s of
+        Skip s'    -> loop0_foldl1' s'
+        Yield x s' -> loop_foldl1' x s'
+        -- Report the actual function name (was "foldl1", which
+        -- misattributed the error to the lazy variant).
+        Done       -> emptyError "foldl1'"
+      loop_foldl1' !z !s = case next s of
+        Done       -> z
+        Skip s'    -> loop_foldl1' z s'
+        Yield x s' -> loop_foldl1' (f z x) s'
+{-# INLINE [0] foldl1' #-}
+
+-- | 'foldr', applied to a binary operator, a starting value (typically the
+-- right-identity of the operator), and a stream, reduces the stream using the
+-- binary operator, from right to left.  Lazy in the tail, so it can
+-- short circuit when @f@ does not force its second argument.
+foldr :: (Char -> b -> b) -> b -> Stream Char -> b
+foldr f z (Stream next s0 _len) = loop_foldr s0
+    where
+      loop_foldr !s = case next s of
+        Done       -> z
+        Skip s'    -> loop_foldr s'
+        Yield x s' -> f x (loop_foldr s')
+{-# INLINE [0] foldr #-}
+
+-- | foldr1 is a variant of 'foldr' that has no starting value argument,
+-- and thus must be applied to non-empty streams.
+-- Subject to array fusion.  Calls 'emptyError' on an empty input.
+foldr1 :: (Char -> Char -> Char) -> Stream Char -> Char
+foldr1 f (Stream next s0 _len) = loop0_foldr1 s0
+  where
+    -- Scan for the first character before starting the fold proper.
+    loop0_foldr1 !s = case next s of
+      Done       -> emptyError "foldr1"
+      Skip    s' -> loop0_foldr1 s'
+      Yield x s' -> loop_foldr1 x s'
+
+    loop_foldr1 x !s = case next s of
+      Done        -> x
+      Skip     s' -> loop_foldr1 x s'
+      Yield x' s' -> f x (loop_foldr1 x' s')
+{-# INLINE [0] foldr1 #-}
+
+-- | /O(n)/ Insert the given stream between each stream in the list,
+-- then concatenate the result.
+intercalate :: Stream Char -> [Stream Char] -> Stream Char
+intercalate s = concat . (L.intersperse s)
+{-# INLINE [0] intercalate #-}
+
+-- ----------------------------------------------------------------------------
+-- ** Special folds
+
+-- | /O(n)/ Concatenate a list of streams. Subject to array fusion.
+concat :: [Stream Char] -> Stream Char
+concat = L.foldr append empty
+{-# INLINE [0] concat #-}
+
+-- | Map a function over a stream that results in a stream and concatenate the
+-- results.
+concatMap :: (Char -> Stream Char) -> Stream Char -> Stream Char
+concatMap f = foldr (append . f) empty
+{-# INLINE [0] concatMap #-}
+
+-- | /O(n)/ any @p xs@ determines if any character in the stream
+-- @xs@ satisfies the predicate @p@.  Short circuits on the first
+-- match.
+any :: (Char -> Bool) -> Stream Char -> Bool
+any p (Stream next0 s0 _len) = loop_any s0
+    where
+      loop_any !s = case next0 s of
+        Done                   -> False
+        Skip s'                -> loop_any s'
+        Yield x s' | p x       -> True
+                   | otherwise -> loop_any s'
+{-# INLINE [0] any #-}
+
+-- | /O(n)/ all @p xs@ determines if all characters in the 'Text'
+-- @xs@ satisfy the predicate @p@.  Short circuits on the first
+-- counter-example.
+all :: (Char -> Bool) -> Stream Char -> Bool
+all p (Stream next0 s0 _len) = loop_all s0
+    where
+      loop_all !s = case next0 s of
+        Done                   -> True
+        Skip s'                -> loop_all s'
+        Yield x s' | p x       -> loop_all s'
+                   | otherwise -> False
+{-# INLINE [0] all #-}
+
+-- | /O(n)/ maximum returns the maximum value from a stream, which must be
+-- non-empty.  Calls 'emptyError' otherwise.
+maximum :: Stream Char -> Char
+maximum (Stream next0 s0 _len) = loop0_maximum s0
+    where
+      -- Find the first character, which seeds the running maximum.
+      loop0_maximum !s   = case next0 s of
+        Done       -> emptyError "maximum"
+        Skip s'    -> loop0_maximum s'
+        Yield x s' -> loop_maximum x s'
+      loop_maximum !z !s = case next0 s of
+        Done            -> z
+        Skip s'         -> loop_maximum z s'
+        Yield x s'
+          | x > z       -> loop_maximum x s'
+          | otherwise   -> loop_maximum z s'
+{-# INLINE [0] maximum #-}
+
+-- | /O(n)/ minimum returns the minimum value from a 'Text', which must be
+-- non-empty.  Calls 'emptyError' otherwise.
+minimum :: Stream Char -> Char
+minimum (Stream next0 s0 _len) = loop0_minimum s0
+    where
+      -- Find the first character, which seeds the running minimum.
+      loop0_minimum !s   = case next0 s of
+        Done       -> emptyError "minimum"
+        Skip s'    -> loop0_minimum s'
+        Yield x s' -> loop_minimum x s'
+      loop_minimum !z !s = case next0 s of
+        Done            -> z
+        Skip s'         -> loop_minimum z s'
+        Yield x s'
+          | x < z       -> loop_minimum x s'
+          | otherwise   -> loop_minimum z s'
+{-# INLINE [0] minimum #-}
+
+-- -----------------------------------------------------------------------------
+-- * Building streams
+
+-- | /O(n)/ Like 'Prelude.scanl': yields the successive reduced values
+-- of a left fold, starting with @z0@.  The result is one character
+-- longer than the input.
+scanl :: (Char -> Char -> Char) -> Char -> Stream Char -> Stream Char
+scanl f z0 (Stream next0 s0 len) = Stream next (Scan1 z0 s0) (len+1) -- HINT maybe too low
+  where
+    {-# INLINE next #-}
+    -- Scan1 emits the seed; Scan2 carries the running accumulator.
+    next (Scan1 z s) = Yield z (Scan2 z s)
+    next (Scan2 z s) = case next0 s of
+                         Yield x s' -> let !x' = f z x
+                                       in Yield x' (Scan2 x' s')
+                         Skip s'    -> Skip (Scan2 z s')
+                         Done       -> Done
+{-# INLINE [0] scanl #-}
+
+-- -----------------------------------------------------------------------------
+-- ** Generating and unfolding streams
+
+-- | /O(n)/ Yield a stream of @n@ copies of the character @c@.
+-- Yields the empty stream for negative @n@.
+replicateCharI :: Integral a => a -> Char -> Stream Char
+replicateCharI !n !c
+    | n < 0     = empty
+    | otherwise = Stream next 0 (fromIntegral n) -- HINT maybe too low
+  where
+    next !i | i >= n    = Done
+            | otherwise = Yield c (i + 1)
+{-# INLINE [0] replicateCharI #-}
+
+-- | Internal state for 'replicateI': the underlying stream state plus
+-- the number of complete copies already produced.
+data RI s = RI !s {-# UNPACK #-} !Int64
+
+-- | Yield the given stream repeated @n@ times, end to end.
+replicateI :: Int64 -> Stream Char -> Stream Char
+replicateI n (Stream next0 s0 len) =
+    Stream next (RI s0 0) (fromIntegral (max 0 n) * len)
+  where
+    next (RI s k)
+        | k >= n    = Done
+        | otherwise = case next0 s of
+                        Done       -> Skip (RI s0 (k+1)) -- restart for the next copy
+                        Skip s'    -> Skip (RI s' k)
+                        Yield x s' -> Yield x (RI s' k)
+{-# INLINE [0] replicateI #-}
+
+-- | /O(n)/, where @n@ is the length of the result. The unfoldr function
+-- is analogous to the List 'unfoldr'. unfoldr builds a stream
+-- from a seed value. The function takes the element and returns
+-- Nothing if it is done producing the stream or returns Just
+-- (a,b), in which case, a is the next Char in the string, and b is
+-- the seed value for further production.
+unfoldr :: (a -> Maybe (Char,a)) -> a -> Stream Char
+unfoldr f s0 = Stream next s0 unknownSize
+    where
+      {-# INLINE next #-}
+      next !s = case f s of
+                  Nothing      -> Done
+                  Just (w, s') -> Yield w s'
+{-# INLINE [0] unfoldr #-}
+
+-- | /O(n)/ Like 'unfoldr', 'unfoldrNI' builds a stream from a seed
+-- value. However, the length of the result is limited by the
+-- first argument to 'unfoldrNI'. This function is more efficient than
+-- 'unfoldr' when the length of the result is known.
+unfoldrNI :: Integral a => a -> (b -> Maybe (Char,b)) -> b -> Stream Char
+unfoldrNI n f s0 | n <  0    = empty
+                 | otherwise = Stream next (0 :*: s0) (maxSize $ fromIntegral (n*2))
+    where
+      {-# INLINE next #-}
+      -- The counter in the strict pair enforces the length limit.
+      next (z :*: s) = case f s of
+          Nothing                  -> Done
+          Just (w, s') | z >= n    -> Done
+                       | otherwise -> Yield w ((z + 1) :*: s')
+{-# INLINE unfoldrNI #-}
+
+-------------------------------------------------------------------------------
+-- * Substreams
+
+-- | /O(n)/ @'take' n@, applied to a stream, returns the prefix of the
+-- stream of length @n@, or the stream itself if @n@ is greater than the
+-- length of the stream.
+take :: Integral a => a -> Stream Char -> Stream Char
+take n0 (Stream next0 s0 len) =
+    Stream next (n0 :*: s0) (smaller len (codePointsSize $ fromIntegral n0))
+    where
+      {-# INLINE next #-}
+      next (n :*: s) | n <= 0    = Done
+                     | otherwise = case next0 s of
+                         Done       -> Done
+                         Skip s'    -> Skip (n :*: s')
+                         Yield x s' -> Yield x ((n-1) :*: s')
+{-# INLINE [0] take #-}
+
+-- | Internal state for 'drop': 'JS' while still counting characters to
+-- skip, 'NS' once the remainder is passed through untouched.
+data Drop a s = NS !s
+              | JS !a !s
+
+-- | /O(n)/ @'drop' n@, applied to a stream, returns the suffix of the
+-- stream after the first @n@ characters, or the empty stream if @n@
+-- is greater than the length of the stream.
+drop :: Integral a => a -> Stream Char -> Stream Char
+drop n0 (Stream next0 s0 len) =
+    Stream next (JS n0 s0) (len - codePointsSize (fromIntegral n0))
+  where
+    {-# INLINE next #-}
+    next (JS n s)
+      | n <= 0    = Skip (NS s)
+      | otherwise = case next0 s of
+          Done       -> Done
+          Skip    s' -> Skip (JS n s')
+          Yield _ s' -> Skip (JS (n-1) s')
+    next (NS s) = case next0 s of
+      Done       -> Done
+      Skip    s' -> Skip (NS s')
+      Yield x s' -> Yield x (NS s')
+{-# INLINE [0] drop #-}
+
+-- | 'takeWhile', applied to a predicate @p@ and a stream, returns the
+-- longest prefix (possibly empty) of elements that satisfy @p@.
+takeWhile :: (Char -> Bool) -> Stream Char -> Stream Char
+takeWhile p (Stream next0 s0 len) = Stream next s0 (len - unknownSize)
+    where
+      {-# INLINE next #-}
+      next !s = case next0 s of
+                  Done    -> Done
+                  Skip s' -> Skip s'
+                  Yield x s' | p x       -> Yield x s'
+                             | otherwise -> Done
+{-# INLINE [0] takeWhile #-}
+
+-- | @'dropWhile' p xs@ returns the suffix remaining after @'takeWhile' p xs@.
+dropWhile :: (Char -> Bool) -> Stream Char -> Stream Char
+dropWhile p (Stream next0 s0 len) = Stream next (L s0) (len - unknownSize)
+    where
+    {-# INLINE next #-}
+    -- L: still dropping matching characters; R: pass-through phase.
+    next (L s)  = case next0 s of
+      Done                   -> Done
+      Skip s'                -> Skip (L s')
+      Yield x s' | p x       -> Skip (L s')
+                 | otherwise -> Yield x (R s')
+    next (R s) = case next0 s of
+      Done       -> Done
+      Skip s'    -> Skip (R s')
+      Yield x s' -> Yield x (R s')
+{-# INLINE [0] dropWhile #-}
+
+-- | /O(n)/ The 'isPrefixOf' function takes two 'Stream's and returns
+-- 'True' iff the first is a prefix of the second.  Both streams are
+-- advanced in lock step, skipping 'Skip's on either side.
+isPrefixOf :: (Eq a) => Stream a -> Stream a -> Bool
+isPrefixOf (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2)
+    where
+      loop Done      _                 = True
+      loop _    Done                   = False
+      loop (Skip s1')     (Skip s2')   = loop (next1 s1') (next2 s2')
+      loop (Skip s1')     x2           = loop (next1 s1') x2
+      loop x1             (Skip s2')   = loop x1          (next2 s2')
+      loop (Yield x1 s1') (Yield x2 s2') = x1 == x2 &&
+                                           loop (next1 s1') (next2 s2')
+{-# INLINE [0] isPrefixOf #-}
+
+-- ----------------------------------------------------------------------------
+-- * Searching
+
+-------------------------------------------------------------------------------
+-- ** Searching by equality
+
+-- | /O(n)/ 'elem' is the stream membership predicate.  Short circuits
+-- on the first occurrence.
+elem :: Char -> Stream Char -> Bool
+elem w (Stream next s0 _len) = loop_elem s0
+    where
+      loop_elem !s = case next s of
+                       Done -> False
+                       Skip s' -> loop_elem s'
+                       Yield x s' | x == w -> True
+                                  | otherwise -> loop_elem s'
+{-# INLINE [0] elem #-}
+
+-------------------------------------------------------------------------------
+-- ** Searching with a predicate
+
+-- | /O(n)/ The 'findBy' function takes a predicate and a stream,
+-- and returns the first element matching the predicate, or 'Nothing'
+-- if there is no such element.
+
+findBy :: (Char -> Bool) -> Stream Char -> Maybe Char
+findBy p (Stream next s0 _len) = loop_find s0
+    where
+      loop_find !s = case next s of
+                       Done -> Nothing
+                       Skip s' -> loop_find s'
+                       Yield x s' | p x -> Just x
+                                  | otherwise -> loop_find s'
+{-# INLINE [0] findBy #-}
+
+-- | /O(n)/ Stream index (subscript) operator, starting from 0.
+-- Calls 'streamError' on a negative or out-of-range index.
+indexI :: Integral a => Stream Char -> a -> Char
+indexI (Stream next s0 _len) n0
+  | n0 < 0    = streamError "index" "Negative index"
+  | otherwise = loop_index n0 s0
+  where
+    loop_index !n !s = case next s of
+      Done                   -> streamError "index" "Index too large"
+      Skip    s'             -> loop_index  n    s'
+      Yield x s' | n == 0    -> x
+                 | otherwise -> loop_index (n-1) s'
+{-# INLINE [0] indexI #-}
+
+-- | /O(n)/ 'filter', applied to a predicate and a stream,
+-- returns a stream containing those characters that satisfy the
+-- predicate.
+filter :: (Char -> Bool) -> Stream Char -> Stream Char
+filter p (Stream next0 s0 len) =
+    Stream next s0 (len - unknownSize) -- HINT maybe too high
+  where
+    next !s = case next0 s of
+                Done                   -> Done
+                Skip    s'             -> Skip    s'
+                Yield x s' | p x       -> Yield x s'
+                           | otherwise -> Skip    s'
+{-# INLINE [0] filter #-}
+
+-- Collapse two adjacent filters into a single traversal.
+{-# RULES
+  "STREAM filter/filter fusion" forall p q s.
+  filter p (filter q s) = filter (\x -> q x && p x) s
+  #-}
+
+-- | The 'findIndexI' function takes a predicate and a stream and
+-- returns the index of the first element in the stream satisfying the
+-- predicate.
+findIndexI :: Integral a => (Char -> Bool) -> Stream Char -> Maybe a
+findIndexI p s = case findIndicesI p s of
+                  (i:_) -> Just i
+                  _     -> Nothing
+{-# INLINE [0] findIndexI #-}
+
+-- | The 'findIndicesI' function takes a predicate and a stream and
+-- returns all indices of the elements in the stream satisfying the
+-- predicate, lazily.
+findIndicesI :: Integral a => (Char -> Bool) -> Stream Char -> [a]
+findIndicesI p (Stream next s0 _len) = loop_findIndex 0 s0
+  where
+    loop_findIndex !i !s = case next s of
+      Done                   -> []
+      Skip    s'             -> loop_findIndex i     s' -- hmm. not caught by QC
+      Yield x s' | p x       -> i : loop_findIndex (i+1) s'
+                 | otherwise -> loop_findIndex (i+1) s'
+{-# INLINE [0] findIndicesI #-}
+
+-------------------------------------------------------------------------------
+-- * Zipping
+
+-- | Zipping state: a strict pair of stream states ('Z1'), or the same
+-- plus a buffered element from the first stream ('Z2').
+data Zip a b m = Z1 !a !b
+               | Z2 !a !b !m
+
+-- | zipWith generalises 'zip' by zipping with the function given as
+-- the first argument, instead of a tupling function.  Ends when
+-- either stream ends.
+zipWith :: (a -> a -> b) -> Stream a -> Stream a -> Stream b
+zipWith f (Stream next0 sa0 len1) (Stream next1 sb0 len2) =
+    Stream next (Z1 sa0 sb0) (smaller len1 len2)
+    where
+      -- Pull from the first stream, buffering its element in Z2.
+      next (Z1 sa sb) = case next0 sa of
+                          Done -> Done
+                          Skip sa' -> Skip (Z1 sa' sb)
+                          Yield a sa' -> Skip (Z2 sa' sb a)
+
+      -- Pull from the second stream and combine with the buffered one.
+      next (Z2 sa' sb a) = case next1 sb of
+                             Done -> Done
+                             Skip sb' -> Skip (Z2 sa' sb' a)
+                             Yield b sb' -> Yield (f a b) (Z1 sa' sb')
+{-# INLINE [0] zipWith #-}
+
+-- | /O(n)/ The 'countCharI' function returns the number of times the
+-- query element appears in the given stream.
+countCharI :: Integral a => Char -> Stream Char -> a
+countCharI a (Stream next s0 _len) = loop 0 s0
+  where
+    loop !i !s = case next s of
+      Done                   -> i
+      Skip    s'             -> loop i s'
+      Yield x s' | a == x    -> loop (i+1) s'
+                 | otherwise -> loop i s'
+{-# INLINE [0] countCharI #-}
+
+-- | Raise an error tagged with this module's name and the name of the
+-- function that failed.
+streamError :: String -> String -> a
+streamError func msg = P.error $ "Data.Text.Internal.Fusion.Common." ++ func ++ ": " ++ msg
+
+-- | Error raised by functions that require a non-empty input.
+emptyError :: String -> a
+emptyError func = internalError func "Empty input"
+
+-- | Error for conditions that should be unreachable.
+internalError :: String -> a
+internalError func = streamError func "Internal error"
diff --git a/Data/Text/Internal/Fusion/Size.hs b/Data/Text/Internal/Fusion/Size.hs
new file mode 100644
index 0000000..f0ac5de
--- /dev/null
+++ b/Data/Text/Internal/Fusion/Size.hs
@@ -0,0 +1,187 @@
+{-# LANGUAGE CPP, PatternGuards #-}
+{-# OPTIONS_GHC -fno-warn-missing-methods #-}
+-- |
+-- Module : Data.Text.Internal.Fusion.Size
+-- Copyright : (c) Roman Leshchinskiy 2008,
+-- (c) Bryan O'Sullivan 2009
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Size hints.
+
+module Data.Text.Internal.Fusion.Size
+ (
+ Size
+ -- * Sizes
+ , exactSize
+ , maxSize
+ , betweenSize
+ , unknownSize
+ , unionSize
+ , charSize
+ , codePointsSize
+ -- * Querying sizes
+ , exactly
+ , smaller
+ , larger
+ , upperBound
+ , lowerBound
+ , compareSize
+ , isEmpty
+ ) where
+
+import Data.Char (ord)
+import Data.Text.Internal (mul)
+#if defined(ASSERTS)
+import Control.Exception (assert)
+#endif
+
+-- | A size in UTF-8 code units.
+data Size = Between {-# UNPACK #-} !Int {-# UNPACK #-} !Int -- ^ Lower and upper bounds on size.
+          | Unknown                                         -- ^ Unknown size.
+            deriving (Eq, Show)
+
+-- | Extract the size, if the lower and upper bounds coincide.
+exactly :: Size -> Maybe Int
+exactly (Between na nb) | na == nb = Just na
+exactly _ = Nothing
+{-# INLINE exactly #-}
+
+-- | The 'Size' of the given code point: its UTF-8 encoded width, one
+-- to four code units.
+charSize :: Char -> Size
+charSize c
+  | c' < 0x80    = exactSize 1
+  | c' < 0x800   = exactSize 2
+  | c' < 0x10000 = exactSize 3
+  | otherwise    = exactSize 4
+  where
+    c' = ord c
+
+-- | The 'Size' of @n@ code points: between @n@ and @4*n@ code units.
+codePointsSize :: Int -> Size
+codePointsSize n = Between n (4*n)
+{-# INLINE codePointsSize #-}
+
+-- | An exactly-known size (only checked non-negative under ASSERTS).
+exactSize :: Int -> Size
+exactSize n =
+#if defined(ASSERTS)
+    assert (n >= 0)
+#endif
+    Between n n
+{-# INLINE exactSize #-}
+
+-- | A size known only up to an upper bound.
+maxSize :: Int -> Size
+maxSize n =
+#if defined(ASSERTS)
+    assert (n >= 0)
+#endif
+    Between 0 n
+{-# INLINE maxSize #-}
+
+-- | A size between the given lower and upper bounds.
+betweenSize :: Int -> Int -> Size
+betweenSize m n =
+#if defined(ASSERTS)
+    assert (m >= 0)
+    assert (n >= m)
+#endif
+    Between m n
+{-# INLINE betweenSize #-}
+
+-- | The smallest interval covering both sizes; 'Unknown' if either
+-- side is unknown.
+unionSize :: Size -> Size -> Size
+unionSize (Between a b) (Between c d) = Between (min a c) (max b d)
+unionSize _ _ = Unknown
+
+-- | A completely unknown size.
+unknownSize :: Size
+unknownSize = Unknown
+{-# INLINE unknownSize #-}
+
+-- NOTE: deliberately partial 'Num' instance (no 'abs', 'signum',
+-- 'negate'); the module is compiled with -fno-warn-missing-methods.
+instance Num Size where
+    (+) = addSize
+    (-) = subtractSize
+    (*) = mulSize
+
+    fromInteger = f where f = exactSize . fromInteger
+                          {-# INLINE f #-}
+
+-- | Add two bounds, calling 'overflowError' if the sum wraps to a
+-- negative 'Int'.
+add :: Int -> Int -> Int
+add m n | mn >= 0   = mn
+        | otherwise = overflowError
+  where mn = m + n
+{-# INLINE add #-}
+
+addSize :: Size -> Size -> Size
+addSize (Between ma mb) (Between na nb) = Between (add ma na) (add mb nb)
+addSize _ _ = Unknown
+{-# INLINE addSize #-}
+
+-- | Interval subtraction, clamped at zero (sizes cannot be negative).
+-- Subtracting 'Unknown' keeps only the upper bound.
+subtractSize :: Size -> Size -> Size
+subtractSize (Between ma mb) (Between na nb) = Between (max (ma-nb) 0) (max (mb-na) 0)
+subtractSize a@(Between 0 _) Unknown         = a
+subtractSize (Between _ mb) Unknown          = Between 0 mb
+subtractSize _ _                             = Unknown
+{-# INLINE subtractSize #-}
+
+mulSize :: Size -> Size -> Size
+mulSize (Between ma mb) (Between na nb) = Between (mul ma na) (mul mb nb)
+mulSize _ _ = Unknown
+{-# INLINE mulSize #-}
+
+-- | Minimum of two size hints.  An 'Unknown' operand keeps only the
+-- other side's upper bound, with a lower bound of zero.
+smaller :: Size -> Size -> Size
+smaller   a@(Between ma mb) b@(Between na nb)
+    | mb <= na  = a
+    | nb <= ma  = b
+    | otherwise = Between (ma `min` na) (mb `min` nb)
+smaller   a@(Between 0 _)   Unknown         = a
+smaller     (Between _ mb)  Unknown         = Between 0 mb
+smaller     Unknown       b@(Between 0 _)   = b
+smaller     Unknown         (Between _ nb)  = Between 0 nb
+smaller     Unknown         Unknown         = Unknown
+{-# INLINE smaller #-}
+
+-- | Maximum of two size hints; 'Unknown' if either side is unknown.
+larger :: Size -> Size -> Size
+larger   a@(Between ma mb) b@(Between na nb)
+  | ma >= nb  = a
+  | na >= mb  = b
+  | otherwise = Between (ma `max` na) (mb `max` nb)
+larger _ _ = Unknown
+{-# INLINE larger #-}
+
+-- | Compute the maximum size from a size hint, if possible; the
+-- fallback @k@ is returned for 'Unknown'.
+upperBound :: Int -> Size -> Int
+upperBound _ (Between _ n) = n
+upperBound k _             = k
+{-# INLINE upperBound #-}
+
+-- | Compute the minimum size from a size hint, if possible; the
+-- fallback @k@ is returned for 'Unknown'.
+lowerBound :: Int -> Size -> Int
+lowerBound _ (Between n _) = n
+lowerBound k _             = k
+{-# INLINE lowerBound #-}
+
+-- | Determine the ordering relationship between two 'Size's, or 'Nothing' in
+-- the indeterminate case (overlapping intervals or an 'Unknown').
+compareSize :: Size -> Size -> Maybe Ordering
+compareSize (Between ma mb) (Between na nb)
+  | mb < na             = Just LT
+  | ma > nb             = Just GT
+  -- EQ only when both sizes are exact and identical.
+  | ma == mb
+  , ma == na
+  , ma == nb            = Just EQ
+compareSize _ _         = Nothing
+
+
+-- | 'True' only when the upper bound proves emptiness; an 'Unknown'
+-- size is conservatively treated as possibly non-empty.
+isEmpty :: Size -> Bool
+isEmpty (Between _ n) = n <= 0
+isEmpty _             = False
+{-# INLINE isEmpty #-}
+
+-- | Raised by 'add' when a size computation overflows 'Int'.
+overflowError :: Int
+overflowError = error "Data.Text.Internal.Fusion.Size: size overflow"
diff --git a/Data/Text/Internal/Fusion/Types.hs b/Data/Text/Internal/Fusion/Types.hs
new file mode 100644
index 0000000..791fea1
--- /dev/null
+++ b/Data/Text/Internal/Fusion/Types.hs
@@ -0,0 +1,122 @@
+{-# LANGUAGE BangPatterns, ExistentialQuantification #-}
+-- |
+-- Module : Data.Text.Internal.Fusion.Types
+-- Copyright : (c) Tom Harper 2008-2009,
+-- (c) Bryan O'Sullivan 2009,
+-- (c) Duncan Coutts 2009,
+-- (c) Jasper Van der Jeugt 2011
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Core stream fusion functionality for text.
+
+module Data.Text.Internal.Fusion.Types
+ (
+ CC(..)
+ , PairS(..)
+ , Scan(..)
+ , RS(..)
+ , Step(..)
+ , Stream(..)
+ , empty
+ ) where
+
+import Data.Text.Internal.Fusion.Size
+import Data.Word (Word8)
+
+-- | Specialised tuple for case conversion.  The two 'Char' slots hold
+-- buffered output characters; consumers (e.g. case conversion) use
+-- @'\0'@ to mark an empty slot.
+data CC s = CC !s {-# UNPACK #-} !Char {-# UNPACK #-} !Char
+
+-- | Restreaming state: the underlying state plus zero to three
+-- pending bytes.
+data RS s
+    = RS0 !s
+    | RS1 !s {-# UNPACK #-} !Word8
+    | RS2 !s {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
+    | RS3 !s {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
+
+-- | Strict pair.
+data PairS a b = !a :*: !b
+                 -- deriving (Eq, Ord, Show)
+infixl 2 :*:
+
+-- | An intermediate result in a scan: 'Scan1' before the seed has
+-- been emitted, 'Scan2' afterwards.
+data Scan s = Scan1 {-# UNPACK #-} !Char !s
+            | Scan2 {-# UNPACK #-} !Char !s
+
+-- | Intermediate result in a processing pipeline: end of stream, a
+-- state transition with no output, or one output element.
+data Step s a = Done
+              | Skip !s
+              | Yield !a !s
+
+{-
+instance (Show a) => Show (Step s a)
+ where show Done = "Done"
+ show (Skip _) = "Skip"
+ show (Yield x _) = "Yield " ++ show x
+-}
+
+-- Equality and ordering compare the yielded elements, ignoring the
+-- hidden state type and the size hint.
+instance (Eq a) => Eq (Stream a) where
+    (==) = eq
+
+instance (Ord a) => Ord (Stream a) where
+    compare = cmp
+
+-- The length hint in a Stream has two roles.  If its value is zero,
+-- we trust it, and treat the stream as empty.  Otherwise, we treat it
+-- as a hint: it should usually be accurate, so we use it when
+-- unstreaming to decide what size array to allocate.  However, the
+-- unstreaming functions must be able to cope with the hint being too
+-- small or too large.
+--
+-- The size hint tries to track the UTF-8 code units in a stream,
+-- but often counts the number of code points instead.  It can easily
+-- undercount if, for instance, a transformed stream contains astral
+-- plane code points (those above 0x10000).
+
+data Stream a =
+    forall s. Stream
+    (s -> Step s a)             -- stepper function
+    !s                          -- current state
+    !Size                       -- size hint in code units
+
+-- | /O(n)/ Determines if two streams are equal, element by element.
+eq :: (Eq a) => Stream a -> Stream a -> Bool
+eq (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2)
+    where
+      loop Done Done                     = True
+      loop (Skip s1')     (Skip s2')     = loop (next1 s1') (next2 s2')
+      loop (Skip s1')     x2             = loop (next1 s1') x2
+      loop x1             (Skip s2')     = loop x1          (next2 s2')
+      -- Streams of different lengths are unequal.
+      loop Done _                        = False
+      loop _ Done                        = False
+      loop (Yield x1 s1') (Yield x2 s2') = x1 == x2 &&
+                                           loop (next1 s1') (next2 s2')
+{-# INLINE [0] eq #-}
+
+-- | /O(n)/ Lexicographic comparison of two streams; a proper prefix
+-- compares as smaller.
+cmp :: (Ord a) => Stream a -> Stream a -> Ordering
+cmp (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2)
+    where
+      loop Done Done                     = EQ
+      loop (Skip s1')     (Skip s2')     = loop (next1 s1') (next2 s2')
+      loop (Skip s1')     x2             = loop (next1 s1') x2
+      loop x1             (Skip s2')     = loop x1          (next2 s2')
+      loop Done _                        = LT
+      loop _ Done                        = GT
+      loop (Yield x1 s1') (Yield x2 s2') =
+          case compare x1 x2 of
+            EQ    -> loop (next1 s1') (next2 s2')
+            other -> other
+{-# INLINE [0] cmp #-}
+
+-- | The empty stream.
+empty :: Stream a
+empty = Stream next () 0
+    where next _ = Done
+{-# INLINE [0] empty #-}
diff --git a/Data/Text/Internal/IO.hs b/Data/Text/Internal/IO.hs
new file mode 100644
index 0000000..1cf9096
--- /dev/null
+++ b/Data/Text/Internal/IO.hs
@@ -0,0 +1,166 @@
+{-# LANGUAGE BangPatterns, CPP, RecordWildCards #-}
+-- |
+-- Module : Data.Text.Internal.IO
+-- Copyright : (c) 2009, 2010 Bryan O'Sullivan,
+-- (c) 2009 Simon Marlow
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Low-level support for text I\/O.
+
+module Data.Text.Internal.IO
+ (
+ hGetLineWith
+ , readChunk
+ ) where
+
+import qualified Control.Exception as E
+import Data.IORef (readIORef, writeIORef)
+import Data.Text (Text)
+import Data.Text.Internal.Fusion (unstream)
+import Data.Text.Internal.Fusion.Types (Step(..), Stream(..))
+import Data.Text.Internal.Fusion.Size (exactSize, maxSize)
+import Data.Text.Unsafe (inlinePerformIO)
+import Foreign.Storable (peekElemOff)
+import GHC.IO.Buffer (Buffer(..), CharBuffer, RawCharBuffer, bufferAdjustL,
+ bufferElems, charSize, isEmptyBuffer, readCharBuf,
+ withRawBuffer, writeCharBuf)
+import GHC.IO.Handle.Internals (ioe_EOF, readTextDevice, wantReadableHandle_)
+import GHC.IO.Handle.Types (Handle__(..), Newline(..))
+import System.IO (Handle)
+import System.IO.Error (isEOFError)
+import qualified Data.Text as T
+
-- | Read a single line of input from a handle, constructing a list of
-- decoded chunks as we go.  When we're done, transform them into the
-- destination type.
hGetLineWith :: ([Text] -> t) -> Handle -> IO t
hGetLineWith f h = wantReadableHandle_ "hGetLine" h go
  where
    -- Start scanning from the handle's current decoded char buffer.
    go hh@Handle__{..} = readIORef haCharBuffer >>= fmap f . hGetLineLoop hh []

-- Accumulate decoded chunks (in reverse order) until a '\n' is found
-- or EOF is reached; raises the usual EOF error when nothing was read.
hGetLineLoop :: Handle__ -> [Text] -> CharBuffer -> IO [Text]
hGetLineLoop hh@Handle__{..} = go where
 go ts buf@Buffer{ bufL=r0, bufR=w, bufRaw=raw0 } = do
   -- Scan the live region of the buffer for the next '\n'; returns
   -- whether one was found and its offset (or the end of the buffer).
   let findEOL raw r | r == w = return (False, w)
                     | otherwise = do
                        (c,r') <- readCharBuf raw r
                        if c == '\n'
                        then return (True, r)
                        else findEOL raw r'
   (eol, off) <- findEOL raw0 r0
   -- Decode up to the newline (or buffer end), translating CRLF if the
   -- handle requests it; r' is the first unconsumed buffer index.
   (t,r') <- if haInputNL == CRLF
             then unpack_nl raw0 r0 off
             else do t <- unpack raw0 r0 off
                     return (t,off)
   if eol
     then do writeIORef haCharBuffer (bufferAdjustL (off+1) buf)
             return $ reverse (t:ts)
     else do
       let buf1 = bufferAdjustL r' buf
       maybe_buf <- maybeFillReadBuffer hh buf1
       case maybe_buf of
         -- Nothing indicates we caught an EOF, and we may have a
         -- partial line to return.
         Nothing -> do
           -- we reached EOF.  There might be a lone \r left
           -- in the buffer, so check for that and
           -- append it to the line if necessary.
           let pre | isEmptyBuffer buf1 = T.empty
                   | otherwise = T.singleton '\r'
           writeIORef haCharBuffer buf1{ bufL=0, bufR=0 }
           let str = reverse . filter (not . T.null) $ pre:t:ts
           if null str
             then ioe_EOF
             else return str
         Just new_buf -> go (t:ts) new_buf
+
-- This function is lifted almost verbatim from GHC.IO.Handle.Text.
--
-- Try to refill the buffer from the handle's device.  Returns
-- 'Nothing' on end-of-file; any other 'IOError' is re-thrown.
maybeFillReadBuffer :: Handle__ -> CharBuffer -> IO (Maybe CharBuffer)
maybeFillReadBuffer handle_ buf = E.catch refill onIOError
  where
    refill = fmap Just (getSomeCharacters handle_ buf)
    onIOError e
      | isEOFError e = return Nothing
      | otherwise    = ioError e
+
-- Decode the elements of buf[r..w) into a strict 'Text'.  Only valid
-- on runtimes whose 'CharBuffer' elements are 4 bytes wide (checked
-- via 'charSize').
unpack :: RawCharBuffer -> Int -> Int -> IO Text
unpack !buf !r !w
 | charSize /= 4 = sizeError "unpack"
 | r >= w = return T.empty
 | otherwise = withRawBuffer buf go
 where
  -- NOTE(review): the hint counts buffer elements (Chars); confirm
  -- 'exactSize' agrees with the unstreamed size for all inputs.
  go pbuf = return $! unstream (Stream next r (exactSize (w-r)))
    where
      next !i | i >= w = Done
              | otherwise = Yield (ix i) (i+1)
      -- inlinePerformIO: a pure read from the raw buffer, which is
      -- kept alive by the enclosing withRawBuffer.
      ix i = inlinePerformIO $ peekElemOff pbuf i

-- Like 'unpack', but performs CRLF -> '\n' translation.  Returns the
-- decoded text together with the index of the first buffer element
-- NOT consumed: a trailing '\r' is held back, since it may pair with
-- a '\n' at the start of the next buffer.
unpack_nl :: RawCharBuffer -> Int -> Int -> IO (Text, Int)
unpack_nl !buf !r !w
 | charSize /= 4 = sizeError "unpack_nl"
 | r >= w = return (T.empty, 0)
 | otherwise = withRawBuffer buf $ go
 where
  go pbuf = do
    let !t = unstream (Stream next r (maxSize (w-r)))
        w' = w - 1
    -- If the last element is '\r', leave it in the buffer so the next
    -- round can decide whether it begins a CRLF pair.
    return $ if ix w' == '\r'
             then (t,w')
             else (t,w)
    where
      next !i | i >= w = Done
              -- "\r\n" collapses to '\n'; a bare '\r' also becomes
              -- '\n'.  A '\r' at the very end yields Done so it can be
              -- re-examined once more input arrives.
              | c == '\r' = let i' = i + 1
                            in if i' < w
                               then if ix i' == '\n'
                                    then Yield '\n' (i+2)
                                    else Yield '\n' i'
                               else Done
              | otherwise = Yield c (i+1)
          where c = ix i
      ix i = inlinePerformIO $ peekElemOff pbuf i
+
-- This function is completely lifted from GHC.IO.Handle.Text.
--
-- Ensure the buffer contains at least one usable character, reading
-- from the underlying device when it is empty (or holds only a '\r'
-- that might begin a CRLF pair).
getSomeCharacters :: Handle__ -> CharBuffer -> IO CharBuffer
getSomeCharacters handle_@Handle__{..} buf@Buffer{..} =
  case bufferElems buf of
    -- buffer empty: read some more
    0 -> {-# SCC "readTextDevice" #-} readTextDevice handle_ buf

    -- if the buffer has a single '\r' in it and we're doing newline
    -- translation: read some more
    1 | haInputNL == CRLF -> do
      (c,_) <- readCharBuf bufRaw bufL
      if c == '\r'
         then do -- shuffle the '\r' to the beginning.  This is only safe
                 -- if we're about to call readTextDevice, otherwise it
                 -- would mess up flushCharBuffer.
                 -- See [note Buffer Flushing], GHC.IO.Handle.Types
                 _ <- writeCharBuf bufRaw 0 '\r'
                 let buf' = buf{ bufL=0, bufR=1 }
                 readTextDevice handle_ buf'
         else do
                 return buf

    -- buffer has some chars in it already: just return it
    _otherwise -> {-# SCC "otherwise" #-} return buf

-- | Read a single chunk of strict text from a buffer. Used by both
-- the strict and lazy implementations of hGetContents.
readChunk :: Handle__ -> CharBuffer -> IO Text
readChunk hh@Handle__{..} buf = do
  buf'@Buffer{..} <- getSomeCharacters hh buf
  -- Decode the buffered characters, translating CRLF when the handle
  -- requests it; r is the first unconsumed buffer index.
  (t,r) <- if haInputNL == CRLF
           then unpack_nl bufRaw bufL bufR
           else do t <- unpack bufRaw bufL bufR
                   return (t,bufR)
  writeIORef haCharBuffer (bufferAdjustL r buf')
  return t
+
-- | Abort with a descriptive message when the runtime's 'CharBuffer'
-- element width is not the 4 bytes this module assumes.
sizeError :: String -> a
sizeError loc =
    error (concat ["Data.Text.IO.", loc, ": bad internal buffer size"])
diff --git a/Data/Text/Internal/Lazy.hs b/Data/Text/Internal/Lazy.hs
new file mode 100644
index 0000000..f8080c9
--- /dev/null
+++ b/Data/Text/Internal/Lazy.hs
@@ -0,0 +1,119 @@
+{-# LANGUAGE BangPatterns, DeriveDataTypeable #-}
+{-# OPTIONS_HADDOCK not-home #-}
+
+-- |
+-- Module : Data.Text.Internal.Lazy
+-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- A module containing private 'Text' internals. This exposes the
+-- 'Text' representation and low level construction functions.
+-- Modules which extend the 'Text' system may need to use this module.
+
+module Data.Text.Internal.Lazy
+ (
+ Text(..)
+ , chunk
+ , empty
+ , foldrChunks
+ , foldlChunks
+ -- * Data type invariant and abstraction functions
+
+ -- $invariant
+ , strictInvariant
+ , lazyInvariant
+ , showStructure
+
+ -- * Chunk allocation sizes
+ , defaultChunkSize
+ , smallChunkSize
+ , chunkOverhead
+ ) where
+
+import Data.Text ()
+import Data.Text.Internal.Unsafe.Shift (shiftL)
+import Data.Typeable (Typeable)
+import Foreign.Storable (sizeOf)
+import qualified Data.Text.Internal as T
+
-- | A lazy 'Text' is a (possibly empty) chain of strict chunks.  The
-- data type invariant (see $invariant) requires every chunk to be
-- non-empty.
data Text = Empty
          | Chunk {-# UNPACK #-} !T.Text Text
            deriving (Typeable)
+
+-- $invariant
+--
+-- The data type invariant for lazy 'Text': Every 'Text' is either 'Empty' or
+-- consists of non-null 'T.Text's. All functions must preserve this,
+-- and the QC properties must check this.
+
-- | Check the invariant strictly: traverse the whole spine, failing
-- with a diagnostic as soon as an empty chunk is found.
strictInvariant :: Text -> Bool
strictInvariant Empty = True
strictInvariant t@(Chunk (T.Text _ _ len) rest)
    | len <= 0  = error $ "Data.Text.Lazy: invariant violation: "
                  ++ showStructure t
    | otherwise = strictInvariant rest
+
-- | Check the invariant lazily: rebuild the spine chunk by chunk,
-- failing with a diagnostic only if an empty chunk is ever forced.
lazyInvariant :: Text -> Text
lazyInvariant Empty = Empty
lazyInvariant t@(Chunk c rest) =
    case c of
      T.Text _ _ len
        | len > 0   -> Chunk c (lazyInvariant rest)
        | otherwise -> error $ "Data.Text.Lazy: invariant violation: "
                       ++ showStructure t
+
-- | Display the internal structure of a lazy 'Text', one constructor
-- per chunk, for invariant-violation diagnostics.
showStructure :: Text -> String
showStructure Empty = "Empty"
showStructure (Chunk t rest) =
    case rest of
      Empty -> "Chunk " ++ show t ++ " Empty"
      _     -> "Chunk " ++ show t ++ " (" ++ showStructure rest ++ ")"
+
-- | Smart constructor for 'Chunk'.  Guarantees the data type invariant
-- by dropping an empty strict chunk instead of consing it.
chunk :: T.Text -> Text -> Text
{-# INLINE chunk #-}
chunk t@(T.Text _ _ len) ts
    | len /= 0  = Chunk t ts
    | otherwise = ts
+
-- | Smart constructor for 'Empty'.
-- NOTE(review): inlined only in phase 0, presumably so rewrite rules
-- can match on 'empty' first — confirm against the fusion rules.
empty :: Text
{-# INLINE [0] empty #-}
empty = Empty
+
-- | Consume the chunks of a lazy 'Text' with a natural right fold.
foldrChunks :: (T.Text -> a -> a) -> a -> Text -> a
foldrChunks step nil = walk
  where walk Empty        = nil
        walk (Chunk c cs) = step c (walk cs)
{-# INLINE foldrChunks #-}
+
-- | Consume the chunks of a lazy 'Text' with a strict, tail-recursive,
-- accumulating left fold.
foldlChunks :: (a -> T.Text -> a) -> a -> Text -> a
foldlChunks step = walk
  where walk !acc Empty        = acc
        walk !acc (Chunk c cs) = walk (step acc c) cs
{-# INLINE foldlChunks #-}
+
-- | Currently set to 16 KiB, less the memory management overhead
-- (so a whole chunk plus its header fits a round allocation size).
defaultChunkSize :: Int
defaultChunkSize = 16384 - chunkOverhead
{-# INLINE defaultChunkSize #-}

-- | Currently set to 128 bytes, less the memory management overhead.
smallChunkSize :: Int
smallChunkSize = 128 - chunkOverhead
{-# INLINE smallChunkSize #-}

-- | The memory management overhead. Currently this is tuned for GHC only.
-- Two machine words; NOTE(review): presumably sized for GHC's
-- heap-object header — confirm against the targeted GHC version.
chunkOverhead :: Int
chunkOverhead = sizeOf (undefined :: Int) `shiftL` 1
{-# INLINE chunkOverhead #-}
diff --git a/Data/Text/Internal/Lazy/Encoding/Fusion.hs b/Data/Text/Internal/Lazy/Encoding/Fusion.hs
new file mode 100644
index 0000000..7dafc0a
--- /dev/null
+++ b/Data/Text/Internal/Lazy/Encoding/Fusion.hs
@@ -0,0 +1,324 @@
+{-# LANGUAGE BangPatterns, CPP, Rank2Types #-}
+
+-- |
+-- Module : Data.Text.Lazy.Encoding.Fusion
+-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Fusible 'Stream'-oriented functions for converting between lazy
+-- 'Text' and several common encodings.
+
+module Data.Text.Internal.Lazy.Encoding.Fusion
+ (
+ -- * Streaming
+ -- streamASCII
+ streamUtf8
+ , streamUtf16LE
+ , streamUtf16BE
+ , streamUtf32LE
+ , streamUtf32BE
+
+ -- * Unstreaming
+ , unstream
+
+ , module Data.Text.Internal.Encoding.Fusion.Common
+ ) where
+
+import Data.ByteString.Lazy.Internal (ByteString(..), defaultChunkSize)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Unsafe as B
+import Data.Text.Internal.Encoding.Fusion.Common
+import Data.Text.Encoding.Error
+import Data.Text.Internal.Fusion (Step(..), Stream(..))
+import Data.Text.Internal.Fusion.Size
+import Data.Text.Internal.Unsafe.Char (unsafeChr, unsafeChr8, unsafeChr32)
+import Data.Text.Internal.Unsafe.Shift (shiftL)
+import Data.Word (Word8, Word16, Word32)
+import qualified Data.Text.Internal.Encoding.Utf8 as U8
+import qualified Data.Text.Internal.Encoding.Utf16 as U16
+import qualified Data.Text.Internal.Encoding.Utf32 as U32
+import Data.Text.Unsafe (unsafeDupablePerformIO)
+import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
+import Foreign.Storable (pokeByteOff)
+import Data.ByteString.Internal (mallocByteString, memcpy)
+#if defined(ASSERTS)
+import Control.Exception (assert)
+#endif
+import qualified Data.ByteString.Internal as B
+
-- Decoder state: zero to four raw bytes that have been read from the
-- input but not yet assembled into a complete code point.
data S = S0
       | S1 {-# UNPACK #-} !Word8
       | S2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
       | S3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
       | S4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8

-- Decoder position: remaining input, buffered bytes, and the index of
-- the next unread byte in the current chunk.
data T = T !ByteString !S {-# UNPACK #-} !Int
+
-- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using
-- UTF-8 encoding.  Invalid input is reported byte-by-byte through
-- @onErr@.
streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
streamUtf8 onErr bs0 = Stream next (T bs0 S0 0) unknownSize
    where
      -- Fast path: no bytes buffered in the state, and a complete code
      -- point is available in the current chunk.
      next (T bs@(Chunk ps _) S0 i)
          | i < len && U8.validate1 a =
              Yield (unsafeChr8 a) (T bs S0 (i+1))
          | i + 1 < len && U8.validate2 a b =
              Yield (U8.chr2 a b) (T bs S0 (i+2))
          | i + 2 < len && U8.validate3 a b c =
              Yield (U8.chr3 a b c) (T bs S0 (i+3))
          | i + 3 < len && U8.validate4 a b c d =
              Yield (U8.chr4 a b c d) (T bs S0 (i+4))
          where len = B.length ps
                a = B.unsafeIndex ps i
                b = B.unsafeIndex ps (i+1)
                c = B.unsafeIndex ps (i+2)
                d = B.unsafeIndex ps (i+3)
      -- Slow path: try to complete a code point from bytes buffered
      -- across a chunk boundary, otherwise pull in another byte.
      next st@(T bs s i) =
          case s of
            S1 a | U8.validate1 a -> Yield (unsafeChr8 a) es
            S2 a b | U8.validate2 a b -> Yield (U8.chr2 a b) es
            S3 a b c | U8.validate3 a b c -> Yield (U8.chr3 a b c) es
            S4 a b c d | U8.validate4 a b c d -> Yield (U8.chr4 a b c d) es
            _ -> consume st
          where es = T bs S0 i
      consume (T bs@(Chunk ps rest) s i)
          | i >= B.length ps = consume (T rest s 0)
          | otherwise = case s of
              S0 -> next (T bs (S1 x) (i+1))
              S1 a -> next (T bs (S2 a x) (i+1))
              S2 a b -> next (T bs (S3 a b x) (i+1))
              S3 a b c -> next (T bs (S4 a b c x) (i+1))
              -- Invalid sequence: report the first buffered byte and
              -- retry from the remaining three.  The byte at index i
              -- has not been consumed into the state here, so the
              -- index must stay at i — advancing it would silently
              -- drop one byte of input after every decode error.
              S4 a b c d -> decodeError "streamUtf8" "UTF-8" onErr (Just a)
                            (T bs (S3 b c d) i)
        where x = B.unsafeIndex ps i
      consume (T Empty S0 _) = Done
      -- Input exhausted with bytes still buffered: truncated sequence.
      consume st = decodeError "streamUtf8" "UTF-8" onErr Nothing st
{-# INLINE [0] streamUtf8 #-}
+
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
-- endian UTF-16 encoding.  Invalid input is reported byte-by-byte
-- through @onErr@.
streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char
streamUtf16LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
    where
      -- Fast path: decode a BMP unit or a surrogate pair straight out
      -- of the current chunk when no bytes are buffered.
      next (T bs@(Chunk ps _) S0 i)
          | i + 1 < len && U16.validate1 x1 =
              Yield (unsafeChr x1) (T bs S0 (i+2))
          | i + 3 < len && U16.validate2 x1 x2 =
              Yield (U16.chr2 x1 x2) (T bs S0 (i+4))
          where len = B.length ps
                x1 = c (idx i) (idx (i + 1))
                x2 = c (idx (i + 2)) (idx (i + 3))
                -- little endian: low byte first
                c w1 w2 = w1 + (w2 `shiftL` 8)
                idx = fromIntegral . B.unsafeIndex ps :: Int -> Word16
      -- Slow path: assemble code units from bytes buffered across a
      -- chunk boundary.
      next st@(T bs s i) =
          case s of
            S2 w1 w2 | U16.validate1 (c w1 w2) ->
              Yield (unsafeChr (c w1 w2)) es
            S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) ->
              Yield (U16.chr2 (c w1 w2) (c w3 w4)) es
            _ -> consume st
          where es = T bs S0 i
                c :: Word8 -> Word8 -> Word16
                c w1 w2 = fromIntegral w1 + (fromIntegral w2 `shiftL` 8)
      consume (T bs@(Chunk ps rest) s i)
          | i >= B.length ps = consume (T rest s 0)
          | otherwise = case s of
              S0 -> next (T bs (S1 x) (i+1))
              S1 w1 -> next (T bs (S2 w1 x) (i+1))
              S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1))
              S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1))
              -- Invalid sequence: report the first buffered byte and
              -- resume from the remaining three.  The byte at index i
              -- has not been consumed into the state, so the index
              -- stays at i — advancing it would drop one input byte
              -- per decode error.
              S4 w1 w2 w3 w4 -> decodeError "streamUtf16LE" "UTF-16LE" onErr (Just w1)
                                (T bs (S3 w2 w3 w4) i)
        where x = B.unsafeIndex ps i
      consume (T Empty S0 _) = Done
      -- Input exhausted with bytes still buffered: truncated sequence.
      consume st = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing st
{-# INLINE [0] streamUtf16LE #-}
+
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
-- endian UTF-16 encoding.  Invalid input is reported byte-by-byte
-- through @onErr@.
streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char
streamUtf16BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
    where
      -- Fast path: decode a BMP unit or a surrogate pair straight out
      -- of the current chunk when no bytes are buffered.
      next (T bs@(Chunk ps _) S0 i)
          | i + 1 < len && U16.validate1 x1 =
              Yield (unsafeChr x1) (T bs S0 (i+2))
          | i + 3 < len && U16.validate2 x1 x2 =
              Yield (U16.chr2 x1 x2) (T bs S0 (i+4))
          where len = B.length ps
                x1 = c (idx i) (idx (i + 1))
                x2 = c (idx (i + 2)) (idx (i + 3))
                -- big endian: high byte first
                c w1 w2 = (w1 `shiftL` 8) + w2
                idx = fromIntegral . B.unsafeIndex ps :: Int -> Word16
      -- Slow path: assemble code units from bytes buffered across a
      -- chunk boundary.
      next st@(T bs s i) =
          case s of
            S2 w1 w2 | U16.validate1 (c w1 w2) ->
              Yield (unsafeChr (c w1 w2)) es
            S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) ->
              Yield (U16.chr2 (c w1 w2) (c w3 w4)) es
            _ -> consume st
          where es = T bs S0 i
                c :: Word8 -> Word8 -> Word16
                c w1 w2 = (fromIntegral w1 `shiftL` 8) + fromIntegral w2
      consume (T bs@(Chunk ps rest) s i)
          | i >= B.length ps = consume (T rest s 0)
          | otherwise = case s of
              S0 -> next (T bs (S1 x) (i+1))
              S1 w1 -> next (T bs (S2 w1 x) (i+1))
              S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1))
              S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1))
              -- Invalid sequence: report the first buffered byte and
              -- resume from the remaining three.  The byte at index i
              -- has not been consumed into the state, so the index
              -- stays at i — advancing it would drop one input byte
              -- per decode error.
              S4 w1 w2 w3 w4 -> decodeError "streamUtf16BE" "UTF-16BE" onErr (Just w1)
                                (T bs (S3 w2 w3 w4) i)
        where x = B.unsafeIndex ps i
      consume (T Empty S0 _) = Done
      -- Input exhausted with bytes still buffered: truncated sequence.
      consume st = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing st
{-# INLINE [0] streamUtf16BE #-}
+
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
-- endian UTF-32 encoding.  Invalid input is reported byte-by-byte
-- through @onErr@.
streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char
streamUtf32BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
    where
      -- Fast path: a whole 4-byte unit available in the current chunk.
      next (T bs@(Chunk ps _) S0 i)
          | i + 3 < len && U32.validate x =
              Yield (unsafeChr32 x) (T bs S0 (i+4))
          where len = B.length ps
                -- big endian: most significant byte first
                x = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
                x1 = idx i
                x2 = idx (i+1)
                x3 = idx (i+2)
                x4 = idx (i+3)
                idx = fromIntegral . B.unsafeIndex ps :: Int -> Word32
      -- Slow path: assemble a unit from bytes buffered across a chunk
      -- boundary.
      next st@(T bs s i) =
          case s of
            S4 w1 w2 w3 w4 | U32.validate (c w1 w2 w3 w4) ->
              Yield (unsafeChr32 (c w1 w2 w3 w4)) es
            _ -> consume st
          where es = T bs S0 i
                c :: Word8 -> Word8 -> Word8 -> Word8 -> Word32
                c w1 w2 w3 w4 = shifted
                  where
                    shifted = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
                    x1 = fromIntegral w1
                    x2 = fromIntegral w2
                    x3 = fromIntegral w3
                    x4 = fromIntegral w4
      consume (T bs@(Chunk ps rest) s i)
          | i >= B.length ps = consume (T rest s 0)
          | otherwise = case s of
              S0 -> next (T bs (S1 x) (i+1))
              S1 w1 -> next (T bs (S2 w1 x) (i+1))
              S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1))
              S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1))
              -- Invalid sequence: report the first buffered byte and
              -- resume from the remaining three.  The byte at index i
              -- has not been consumed into the state, so the index
              -- stays at i — advancing it would drop one input byte
              -- per decode error.
              S4 w1 w2 w3 w4 -> decodeError "streamUtf32BE" "UTF-32BE" onErr (Just w1)
                                (T bs (S3 w2 w3 w4) i)
        where x = B.unsafeIndex ps i
      consume (T Empty S0 _) = Done
      -- Input exhausted with bytes still buffered: truncated sequence.
      consume st = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing st
{-# INLINE [0] streamUtf32BE #-}
+
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
-- endian UTF-32 encoding.  Invalid input is reported byte-by-byte
-- through @onErr@.
streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char
streamUtf32LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
    where
      -- Fast path: a whole 4-byte unit available in the current chunk.
      next (T bs@(Chunk ps _) S0 i)
          | i + 3 < len && U32.validate x =
              Yield (unsafeChr32 x) (T bs S0 (i+4))
          where len = B.length ps
                -- little endian: least significant byte first
                x = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
                x1 = idx i
                x2 = idx (i+1)
                x3 = idx (i+2)
                x4 = idx (i+3)
                idx = fromIntegral . B.unsafeIndex ps :: Int -> Word32
      -- Slow path: assemble a unit from bytes buffered across a chunk
      -- boundary.
      next st@(T bs s i) =
          case s of
            S4 w1 w2 w3 w4 | U32.validate (c w1 w2 w3 w4) ->
              Yield (unsafeChr32 (c w1 w2 w3 w4)) es
            _ -> consume st
          where es = T bs S0 i
                c :: Word8 -> Word8 -> Word8 -> Word8 -> Word32
                c w1 w2 w3 w4 = shifted
                  where
                    shifted = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
                    x1 = fromIntegral w1
                    x2 = fromIntegral w2
                    x3 = fromIntegral w3
                    x4 = fromIntegral w4
      consume (T bs@(Chunk ps rest) s i)
          | i >= B.length ps = consume (T rest s 0)
          | otherwise = case s of
              S0 -> next (T bs (S1 x) (i+1))
              S1 w1 -> next (T bs (S2 w1 x) (i+1))
              S2 w1 w2 -> next (T bs (S3 w1 w2 x) (i+1))
              S3 w1 w2 w3 -> next (T bs (S4 w1 w2 w3 x) (i+1))
              -- Invalid sequence: report the first buffered byte and
              -- resume from the remaining three.  The byte at index i
              -- has not been consumed into the state, so the index
              -- stays at i — advancing it would drop one input byte
              -- per decode error.
              S4 w1 w2 w3 w4 -> decodeError "streamUtf32LE" "UTF-32LE" onErr (Just w1)
                                (T bs (S3 w2 w3 w4) i)
        where x = B.unsafeIndex ps i
      consume (T Empty S0 _) = Done
      -- Input exhausted with bytes still buffered: truncated sequence.
      consume st = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing st
{-# INLINE [0] streamUtf32LE #-}
+
-- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString', building
-- chunks of at most @chunkSize@ bytes.
unstreamChunks :: Int -> Stream Word8 -> ByteString
unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0)
  where chunk s1 len1 = unsafeDupablePerformIO $ do
          -- Allocate between 4 bytes and one full chunk, guided by the
          -- (possibly inexact) size hint.
          let len = max 4 (min len1 chunkSize)
          mallocByteString len >>= loop len 0 s1
          where
            loop !n !off !s fp = case next s of
                Done | off == 0 -> return Empty
                     | otherwise -> return $! Chunk (trimUp fp off) Empty
                Skip s' -> loop n off s' fp
                Yield x s'
                    -- Chunk is full: emit it and restart from state s
                    -- (so x is re-yielded into the next chunk), hinting
                    -- with the bytes still expected.
                    | off == chunkSize -> do
                      let !newLen = n - off
                      return $! Chunk (trimUp fp off) (chunk s newLen)
                    | off == n -> realloc fp n off s' x
                    | otherwise -> do
                      withForeignPtr fp $ \p -> pokeByteOff p off x
                      loop n (off+1) s' fp
            -- Grow by doubling, capped at chunkSize.  NOINLINE keeps
            -- this cold path out of the hot loop's generated code.
            {-# NOINLINE realloc #-}
            realloc fp n off s x = do
              let n' = min (n+n) chunkSize
              fp' <- copy0 fp n n'
              withForeignPtr fp' $ \p -> pokeByteOff p off x
              loop n' (off+1) s fp'
            -- View the first off bytes of the buffer as a ByteString.
            trimUp fp off = B.PS fp 0 off
            copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
            copy0 !src !srcLen !destLen =
#if defined(ASSERTS)
              assert (srcLen <= destLen) $
#endif
              do
                dest <- mallocByteString destLen
                withForeignPtr src $ \src' ->
                  withForeignPtr dest $ \dest' ->
                    memcpy dest' src' (fromIntegral srcLen)
                return dest

-- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString', using
-- 'defaultChunkSize' chunks.
unstream :: Stream Word8 -> ByteString
unstream = unstreamChunks defaultChunkSize
+
-- | Report a decode failure to the 'OnDecodeError' handler.  The
-- handler either supplies a replacement character (yielded in place of
-- the bad input) or none (the bad input is skipped); it may also throw.
decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
            -> s -> Step s Char
decodeError func kind onErr mb st =
    case onErr msg mb of
      Nothing          -> Skip st
      Just replacement -> Yield replacement st
  where msg = "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Invalid " ++
              kind ++ " stream"
diff --git a/Data/Text/Internal/Lazy/Fusion.hs b/Data/Text/Internal/Lazy/Fusion.hs
new file mode 100644
index 0000000..9abbfb0
--- /dev/null
+++ b/Data/Text/Internal/Lazy/Fusion.hs
@@ -0,0 +1,120 @@
+{-# LANGUAGE BangPatterns #-}
+-- |
+-- Module : Data.Text.Lazy.Fusion
+-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Core stream fusion functionality for text.
+
+module Data.Text.Internal.Lazy.Fusion
+ (
+ stream
+ , unstream
+ , unstreamChunks
+ , length
+ , unfoldrN
+ , index
+ , countChar
+ ) where
+
+import Prelude hiding (length)
+import qualified Data.Text.Internal.Fusion.Common as S
+import Control.Monad.ST (runST)
+import Data.Text.Internal.Fusion.Types
+import Data.Text.Internal.Fusion.Size (isEmpty, unknownSize)
+import Data.Text.Internal.Lazy
+import qualified Data.Text.Internal as I
+import qualified Data.Text.Array as A
+import Data.Text.Internal.Unsafe.Char (unsafeWrite)
+import Data.Text.Internal.Unsafe.Shift (shiftL)
+import Data.Text.Unsafe (Iter(..), iter)
+import Data.Int (Int64)
+
+default(Int64)
+
-- | /O(n)/ Convert a 'Text' into a 'Stream Char'.  The state is the
-- remaining chunk list paired with an index into the current chunk.
stream :: Text -> Stream Char
stream text = Stream next (text :*: 0) unknownSize
  where
    next (Empty :*: _) = Done
    next (txt@(Chunk t@(I.Text _ _ len) ts) :*: i)
        -- Past the end of the current chunk: move to the next one.
        | i >= len = next (ts :*: 0)
        -- iter decodes the character at i and its width d in code units.
        | otherwise = Yield c (txt :*: i+d)
        where Iter c d = iter t i
{-# INLINE [0] stream #-}
+
-- | /O(n)/ Convert a 'Stream Char' into a 'Text', using the given
-- chunk size.  Each chunk's array starts tiny and doubles on demand,
-- capped at @chunkSize@.
unstreamChunks :: Int -> Stream Char -> Text
unstreamChunks !chunkSize (Stream next s0 len0)
  | isEmpty len0 = Empty
  | otherwise = outer s0
  where
    outer so = {-# SCC "unstreamChunks/outer" #-}
          case next so of
            Done -> Empty
            Skip s' -> outer s'
            Yield x s' -> runST $ do
                            a <- A.new unknownLength
                            unsafeWrite a 0 x >>= inner a unknownLength s'
                where unknownLength = 4
      where
        inner marr !len s !i
            -- i+3: always leave room for the widest single character.
            | i + 3 >= chunkSize = finish marr i s
            | i + 3 >= len = {-# SCC "unstreamChunks/resize" #-} do
                  let newLen = min (len `shiftL` 1) chunkSize
                  marr' <- A.new newLen
                  A.copyM marr' 0 marr 0 len
                  inner marr' newLen s i
            | otherwise =
                {-# SCC "unstreamChunks/inner" #-}
                case next s of
                  Done -> finish marr i s
                  Skip s' -> inner marr len s' i
                  -- unsafeWrite returns the width of the written char.
                  Yield x s' -> do d <- unsafeWrite marr i x
                                   inner marr len s' (i+d)
        -- Freeze the filled prefix and continue with the rest lazily.
        finish marr len s' = do
          arr <- A.unsafeFreeze marr
          return (I.Text arr 0 len `Chunk` outer s')
{-# INLINE [0] unstreamChunks #-}
+
-- | /O(n)/ Convert a 'Stream Char' into a 'Text', using
-- 'defaultChunkSize'.
unstream :: Stream Char -> Text
unstream = unstreamChunks defaultChunkSize
{-# INLINE [0] unstream #-}

-- | /O(n)/ Returns the number of characters in a text.
length :: Stream Char -> Int64
length = S.lengthI
{-# INLINE[0] length #-}

-- Cancel adjacent unstream/stream pairs so fused pipelines never
-- materialise an intermediate lazy 'Text'.
{-# RULES "LAZY STREAM stream/unstream fusion" forall s.
    stream (unstream s) = s #-}

-- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a stream from a seed
-- value. However, the length of the result is limited by the
-- first argument to 'unfoldrN'. This function is more efficient than
-- 'unfoldr' when the length of the result is known.
unfoldrN :: Int64 -> (a -> Maybe (Char,a)) -> a -> Stream Char
unfoldrN n = S.unfoldrNI n
{-# INLINE [0] unfoldrN #-}

-- | /O(n)/ stream index (subscript) operator, starting from 0.
index :: Stream Char -> Int64 -> Char
index = S.indexI
{-# INLINE [0] index #-}

-- | /O(n)/ The 'count' function returns the number of times the query
-- element appears in the given stream.
countChar :: Char -> Stream Char -> Int64
countChar = S.countCharI
{-# INLINE [0] countChar #-}
diff --git a/Data/Text/Internal/Lazy/Search.hs b/Data/Text/Internal/Lazy/Search.hs
new file mode 100644
index 0000000..4451b61
--- /dev/null
+++ b/Data/Text/Internal/Lazy/Search.hs
@@ -0,0 +1,134 @@
+{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
+
+-- |
+-- Module : Data.Text.Lazy.Search
+-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Fast substring search for lazy 'Text', based on work by Boyer,
+-- Moore, Horspool, Sunday, and Lundh. Adapted from the strict
+-- implementation.
+
+module Data.Text.Internal.Lazy.Search
+ (
+ indices
+ ) where
+
+import qualified Data.Text.Array as A
+import Data.Int (Int64)
+import Data.Word (Word8, Word64)
+import qualified Data.Text.Internal as T
+import Data.Text.Internal.Fusion.Types (PairS(..))
+import Data.Text.Internal.Lazy (Text(..), foldlChunks)
+import Data.Bits ((.|.), (.&.))
+import Data.Text.Internal.Unsafe.Shift (shiftL)
+
-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
-- @needle@ within @haystack@.
--
-- This function is strict in @needle@, and lazy (as far as possible)
-- in the chunks of @haystack@.
--
-- In (unlikely) bad cases, this algorithm's complexity degrades
-- towards /O(n*m)/.
indices :: Text -- ^ Substring to search for (@needle@)
        -> Text -- ^ Text to search in (@haystack@)
        -> [Int64]
indices needle@(Chunk n ns) _haystack@(Chunk k ks)
    | nlen <= 0 = []
    -- Single-unit needle: a plain linear scan beats the table machinery.
    | nlen == 1 = indicesOne (nindex 0) 0 k ks
    | otherwise = advance k ks 0 0
  where
    -- Walk the haystack chunk by chunk; g is the global offset of the
    -- candidate match, i the offset within the current chunk.
    advance x@(T.Text _ _ l) xs = scan
     where
      scan !g !i
         | i >= m = case xs of
                      Empty -> []
                      Chunk y ys -> advance y ys g (i-m)
         | lackingHay (i + nlen) x xs = []
         | c == z && candidateMatch 0 = g : scan (g+nlen) (i+nlen)
         | otherwise = scan (g+delta) (i+delta)
       where
         m = fromIntegral l
         c = hindex (i + nlast)
         -- Sunday-style shift: jump past the window entirely when the
         -- next haystack unit cannot occur in the needle at all.
         delta | nextInPattern = nlen + 1
               | c == z = skip + 1
               | otherwise = 1
         nextInPattern = mask .&. swizzle (hindex (i+nlen)) == 0
         candidateMatch !j
             | j >= nlast = True
             | hindex (i+j) /= nindex j = False
             | otherwise = candidateMatch (j+1)
         hindex = index x xs
    nlen = wordLength needle
    nlast = nlen - 1
    nindex = index n ns
    -- z is the last code unit of the needle.
    z = foldlChunks fin 0 needle
        where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1)
    -- mask is a 64-bit "bloom filter" of units occurring in the
    -- needle; skip is the Horspool shift for the needle's last unit.
    (mask :: Word64) :*: skip = buildTable n ns 0 0 0 (nlen-2)
    swizzle w = 1 `shiftL` (fromIntegral w .&. 0x3f)
    buildTable (T.Text xarr xoff xlen) xs = go
      where
        go !(g::Int64) !i !msk !skp
            | i >= xlast = case xs of
                             Empty -> (msk .|. swizzle z) :*: skp
                             Chunk y ys -> buildTable y ys g 0 msk' skp'
            | otherwise = go (g+1) (i+1) msk' skp'
            where c = A.unsafeIndex xarr (xoff+i)
                  msk' = msk .|. swizzle c
                  skp' | c == z = nlen - g - 2
                       | otherwise = skp
                  xlast = xlen - 1
    -- | Check whether an attempt to index into the haystack at the
    -- given offset would fail.
    lackingHay q = go 0
      where
        go p (T.Text _ _ l) ps = p' < q && case ps of
                                             Empty -> True
                                             Chunk r rs -> go p' r rs
            where p' = p + fromIntegral l
indices _ _ = []
+
-- | Fast index into a partly unpacked 'Text'. We take into account
-- the possibility that the caller might try to access one element
-- past the end.
index :: T.Text -> Text -> Int64 -> Word8
index (T.Text arr off len) xs !i
    | j < len = A.unsafeIndex arr (off+j)
    | otherwise = case xs of
                    Empty
                        -- out of bounds, but legal
                        | j == len -> 0
                        -- should never happen, due to lackingHay above
                        | otherwise -> emptyError "index"
                    Chunk c cs -> index c cs (i-fromIntegral len)
    where j = fromIntegral i

-- | A variant of 'indices' that scans linearly for a single 'Word8'.
indicesOne :: Word8 -> Int64 -> T.Text -> Text -> [Int64]
indicesOne c = chunk
  where
    -- i is the global offset of the start of the current chunk.
    chunk !i (T.Text oarr ooff olen) os = go 0
      where
        go h | h >= olen = case os of
                             Empty -> []
                             Chunk y ys -> chunk (i+fromIntegral olen) y ys
             | on == c = i + fromIntegral h : go (h+1)
             | otherwise = go (h+1)
          where on = A.unsafeIndex oarr (ooff+h)

-- | The number of 'Word8' values in a 'Text'.
wordLength :: Text -> Int64
wordLength = foldlChunks sumLength 0
    where sumLength i (T.Text _ _ l) = i + fromIntegral l
+
-- | Abort with a descriptive message for an impossible empty-input
-- case (see the out-of-bounds note on 'index').
emptyError :: String -> a
emptyError fun =
    error (concat ["Data.Text.Lazy.Search.", fun, ": empty input"])
diff --git a/Data/Text/Internal/Private.hs b/Data/Text/Internal/Private.hs
new file mode 100644
index 0000000..7e6fe23
--- /dev/null
+++ b/Data/Text/Internal/Private.hs
@@ -0,0 +1,37 @@
+{-# LANGUAGE BangPatterns, Rank2Types, UnboxedTuples #-}
+
+-- |
+-- Module : Data.Text.Internal.Private
+-- Copyright : (c) 2011 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+
+module Data.Text.Internal.Private
+ (
+ runText
+ , span_
+ ) where
+
+import Control.Monad.ST (ST, runST)
+import Data.Text.Internal (Text(..), text)
+import Data.Text.Unsafe (Iter(..), iter)
+import qualified Data.Text.Array as A
+
-- | Split a text into the longest prefix whose characters all satisfy
-- @p@ and the remainder.  Both halves share the original underlying
-- array (no copying); the result is an unboxed pair.
span_ :: (Char -> Bool) -> Text -> (# Text, Text #)
span_ p t@(Text arr off len) = (# hd,tl #)
  where hd = text arr off k
        tl = text arr (off+k) (len-k)
        -- k is the length, in code units, of the matching prefix.
        !k = loop 0
        loop !i | i < len && p c = loop (i+d)
                | otherwise = i
            where Iter c d = iter t i
{-# INLINE span_ #-}

-- | Run an 'ST' action that fills a mutable array; the continuation
-- passed to @act@ freezes the array's first @len@ units into an
-- immutable 'Text'.
runText :: (forall s. (A.MArray s -> Int -> ST s Text) -> ST s Text) -> Text
runText act = runST (act $ \ !marr !len -> do
                       arr <- A.unsafeFreeze marr
                       return $! text arr 0 len)
{-# INLINE runText #-}
diff --git a/Data/Text/Internal/Read.hs b/Data/Text/Internal/Read.hs
new file mode 100644
index 0000000..5dbd221
--- /dev/null
+++ b/Data/Text/Internal/Read.hs
@@ -0,0 +1,62 @@
+-- |
+-- Module : Data.Text.Internal.Read
+-- Copyright : (c) 2014 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- Common internal functions for reading textual data.
+module Data.Text.Internal.Read
+ (
+ IReader
+ , IParser(..)
+ , T(..)
+ , digitToInt
+ , hexDigitToInt
+ , perhaps
+ ) where
+
+import Control.Applicative as App (Applicative(..))
+import Control.Arrow (first)
+import Control.Monad (ap)
+import Data.Char (ord)
+
-- | A reader over some input type @t@: fails with a message, or
-- produces a value together with the remaining input.
type IReader t a = t -> Either String (a,t)

-- | A minimal parser: a newtype around 'IReader' so that Functor,
-- Applicative and Monad instances can be given.
newtype IParser t a = P {
      runP :: IReader t a
    }

instance Functor (IParser t) where
    -- Map over the produced value, leaving the leftover input intact.
    fmap f m = P $ fmap (first f) . runP m

instance Applicative (IParser t) where
    pure a = P $ \t -> Right (a,t)
    {-# INLINE pure #-}
    (<*>) = ap

instance Monad (IParser t) where
    return = App.pure
    -- Sequencing short-circuits on the first Left.
    m >>= k = P $ \t -> case runP m t of
                          Left err -> Left err
                          Right (a,t') -> runP (k a) t'
    {-# INLINE (>>=) #-}
    -- NOTE(review): a 'fail' method in Monad only compiles on
    -- pre-MonadFail GHCs — confirm against the supported GHC range.
    fail msg = P $ \_ -> Left msg

-- Strict pair of an 'Integer' and an 'Int'.
-- NOTE(review): field semantics are not visible in this module;
-- documented at the use sites (numeric readers) — confirm there.
data T = T !Integer !Int
+
-- | Try the parser @m@; if it fails, succeed with the default value
-- @def@ without consuming any input.
perhaps :: a -> IParser t a -> IParser t a
perhaps def m = P $ \t ->
    case runP m t of
      Left _  -> Right (def, t)
      success -> success
+
-- | Convert a hexadecimal digit (@0-9@, @a-f@, @A-F@) to its numeric
-- value.  The result for any other character is unspecified; callers
-- must validate first.
hexDigitToInt :: Char -> Int
hexDigitToInt ch
    | ch <= '9' && ch >= '0' = ord ch - ord '0'
    | ch <= 'f' && ch >= 'a' = ord ch - (ord 'a' - 10)
    | otherwise              = ord ch - (ord 'A' - 10)
+
-- | Convert a decimal digit to its numeric value.  No validation is
-- performed; callers must supply @0-9@.
digitToInt :: Char -> Int
digitToInt = subtract (ord '0') . ord
diff --git a/Data/Text/Internal/Search.hs b/Data/Text/Internal/Search.hs
new file mode 100644
index 0000000..20eda8d
--- /dev/null
+++ b/Data/Text/Internal/Search.hs
@@ -0,0 +1,89 @@
+{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
+
+-- |
+-- Module : Data.Text.Internal.Search
+-- Copyright : (c) Bryan O'Sullivan 2009
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- Fast substring search for 'Text', based on work by Boyer, Moore,
+-- Horspool, Sunday, and Lundh.
+--
+-- References:
+--
+-- * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm.
+-- Communications of the ACM, 20, 10, 762-772 (1977)
+--
+-- * R. N. Horspool: Practical Fast Searching in Strings. Software -
+-- Practice and Experience 10, 501-506 (1980)
+--
+-- * D. M. Sunday: A Very Fast Substring Search Algorithm.
+-- Communications of the ACM, 33, 8, 132-142 (1990)
+--
+-- * F. Lundh: The Fast Search Algorithm.
+-- <http://effbot.org/zone/stringlib.htm> (2006)
+
+module Data.Text.Internal.Search
+ (
+ indices
+ ) where
+
+import qualified Data.Text.Array as A
+import Data.Word (Word64)
+import Data.Text.Internal (Text(..))
+import Data.Bits ((.|.), (.&.))
+import Data.Text.Internal.Unsafe.Shift (shiftL)
+
+-- | Strict pair: a 64-bit "bad character" bitmask over needle
+-- characters, paired with a Horspool-style skip distance.
+data T = {-# UNPACK #-} !Word64 :* {-# UNPACK #-} !Int
+
+-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
+-- @needle@ within @haystack@. The offsets returned represent
+-- uncorrected indices in the low-level \"needle\" array, to which its
+-- offset must be added.
+--
+-- In (unlikely) bad cases, this algorithm's complexity degrades
+-- towards /O(n*m)/.
+indices :: Text -- ^ Substring to search for (@needle@)
+ -> Text -- ^ Text to search in (@haystack@)
+ -> [Int]
+indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen)
+ | nlen == 1 = scanOne (nindex 0)
+ | nlen <= 0 || ldiff < 0 = []
+ | otherwise = scan 0
+ where
+ ldiff = hlen - nlen
+ nlast = nlen - 1
+ z = nindex nlast
+ nindex k = A.unsafeIndex narr (noff+k)
+ hindex k = A.unsafeIndex harr (hoff+k)
+ -- Like 'hindex', but yields 0 when peeking one position past the
+ -- end of the haystack instead of reading out of bounds.
+ hindex' k | k == hlen = 0
+ | otherwise = A.unsafeIndex harr (hoff+k)
+ -- One pass over the needle: fold every character into the bitmask
+ -- and record the distance from the last occurrence of the final
+ -- character 'z' (before the end) to the end of the needle.
+ buildTable !i !msk !skp
+ | i >= nlast = (msk .|. swizzle z) :* skp
+ | otherwise = buildTable (i+1) (msk .|. swizzle c) skp'
+ where c = nindex i
+ skp' | c == z = nlen - i - 2
+ | otherwise = skp
+ -- Hash a character into one of 64 mask bits (its low 6 bits).
+ swizzle k = 1 `shiftL` (fromIntegral k .&. 0x3f)
+ scan !i
+ | i > ldiff = []
+ | c == z && candidateMatch 0 = i : scan (i + nlen)
+ | otherwise = scan (i + delta)
+ where c = hindex (i + nlast)
+ -- Verify the rest of the window once the last character lined up.
+ candidateMatch !j
+ | j >= nlast = True
+ | hindex (i+j) /= nindex j = False
+ | otherwise = candidateMatch (j+1)
+ -- Sunday-style shift: if the character just past the window cannot
+ -- occur anywhere in the needle, jump the whole window past it;
+ -- otherwise fall back to the skip distance or a single step.
+ delta | nextInPattern = nlen + 1
+ | c == z = skip + 1
+ | otherwise = 1
+ where nextInPattern = mask .&. swizzle (hindex' (i+nlen)) == 0
+ !(mask :* skip) = buildTable 0 0 (nlen-2)
+ -- Specialised linear scan for a single-character needle.
+ scanOne c = loop 0
+ where loop !i | i >= hlen = []
+ | hindex i == c = i : loop (i+1)
+ | otherwise = loop (i+1)
+{-# INLINE indices #-}
diff --git a/Data/Text/Internal/Unsafe.hs b/Data/Text/Internal/Unsafe.hs
new file mode 100644
index 0000000..1e42a1c
--- /dev/null
+++ b/Data/Text/Internal/Unsafe.hs
@@ -0,0 +1,56 @@
+{-# LANGUAGE CPP, MagicHash, UnboxedTuples #-}
+{-# OPTIONS_HADDOCK not-home #-}
+
+-- |
+-- Module : Data.Text.Internal.Unsafe
+-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- A module containing /unsafe/ operations, for /very very careful/ use
+-- in /heavily tested/ code.
+module Data.Text.Internal.Unsafe
+ (
+ inlineInterleaveST
+ , inlinePerformIO
+ ) where
+
+import GHC.ST (ST(..))
+#if defined(__GLASGOW_HASKELL__)
+import GHC.IO (IO(IO))
+import GHC.Base (realWorld#)
+#endif
+
+
+-- | Just like unsafePerformIO, but we inline it. Big performance gains as
+-- it exposes lots of things to further inlining. /Very unsafe/. In
+-- particular, you should do no memory allocation inside an
+-- 'inlinePerformIO' block. On Hugs this is just @unsafePerformIO@.
+--
+{-# INLINE inlinePerformIO #-}
+inlinePerformIO :: IO a -> a
+#if defined(__GLASGOW_HASKELL__)
+-- Applies the IO action to a shared realWorld# token and discards the
+-- resulting token, so GHC is free to float/duplicate the computation.
+inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
+#else
+-- NOTE(review): 'unsafePerformIO' is not imported in this module, so
+-- this non-GHC branch would not compile as written — confirm whether
+-- any supported compiler still reaches it.
+inlinePerformIO = unsafePerformIO
+#endif
+
+-- | Allow an 'ST' computation to be deferred lazily. When passed an
+-- action of type 'ST' @s@ @a@, the action will only be performed when
+-- the value of @a@ is demanded.
+--
+-- This function is identical to the normal unsafeInterleaveST, but is
+-- inlined and hence faster.
+--
+-- /Note/: This operation is highly unsafe, as it can introduce
+-- externally visible non-determinism into an 'ST' action.
+inlineInterleaveST :: ST s a -> ST s a
+inlineInterleaveST (ST m) = ST $ \ s ->
+ -- Return the *incoming* state token unchanged and bind the result
+ -- lazily: @m@ only runs when @r@ is forced, outside the state thread's
+ -- normal sequencing.
+ let r = case m s of (# _, res #) -> res in (# s, r #)
+{-# INLINE inlineInterleaveST #-}
diff --git a/Data/Text/Internal/Unsafe/Char.hs b/Data/Text/Internal/Unsafe/Char.hs
new file mode 100644
index 0000000..541295e
--- /dev/null
+++ b/Data/Text/Internal/Unsafe/Char.hs
@@ -0,0 +1,119 @@
+{-# LANGUAGE CPP, MagicHash #-}
+
+-- |
+-- Module : Data.Text.Internal.Unsafe.Char
+-- Copyright : (c) 2008, 2009 Tom Harper,
+-- (c) 2009, 2010 Bryan O'Sullivan,
+-- (c) 2009 Duncan Coutts
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Fast character manipulation functions.
+module Data.Text.Internal.Unsafe.Char
+ (
+ ord
+ , unsafeChr
+ , unsafeChr8
+ , unsafeChr32
+ , unsafeWrite
+ -- , unsafeWriteRev
+ ) where
+
+#ifdef ASSERTS
+import Control.Exception (assert)
+#endif
+import Control.Monad.ST (ST)
+import Data.Bits ((.&.))
+import Data.Text.Internal.Unsafe.Shift (shiftR)
+import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
+import GHC.Word (Word8(..), Word16(..), Word32(..))
+import qualified Data.Text.Array as A
+
+-- | Code point of a 'Char', unwrapped to primops (same value as
+-- 'Data.Char.ord').
+ord :: Char -> Int
+ord (C# c#) = I# (ord# c#)
+{-# INLINE ord #-}
+
+-- | Reinterpret a 'Word16' as a 'Char' with no validity check.
+unsafeChr :: Word16 -> Char
+unsafeChr (W16# w#) = C# (chr# (word2Int# w#))
+{-# INLINE unsafeChr #-}
+
+-- | Reinterpret a 'Word8' as a 'Char' with no validity check.
+unsafeChr8 :: Word8 -> Char
+unsafeChr8 (W8# w#) = C# (chr# (word2Int# w#))
+{-# INLINE unsafeChr8 #-}
+
+-- | Reinterpret a 'Word32' as a 'Char' with no validity check.
+-- NOTE(review): these patterns assume W8#\/W16#\/W32# wrap a plain
+-- 'Word#'; GHC 9.2 changed them to sized unboxed types, so this code
+-- targets older GHCs.
+unsafeChr32 :: Word32 -> Char
+unsafeChr32 (W32# w#) = C# (chr# (word2Int# w#))
+{-# INLINE unsafeChr32 #-}
+
+-- | Write a character into the array at the given offset. Returns
+-- the number of bytes written.
+--
+-- The byte layout (0x80 continuation bytes; 0xC0\/0xE0\/0xF0 lead
+-- bytes for 2\/3\/4-byte forms) is UTF-8.  Bounds are only checked
+-- when compiled with @-DASSERTS@.
+--
+-- NOTE(review): there is no surrogate check here — a 'Char' in
+-- U+D800..U+DFFF would be encoded blindly by the three-byte branch.
+-- Callers are presumably expected to have applied 'safe' replacement
+-- first; confirm at the call sites.
+unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
+unsafeWrite marr i c
+ -- One-byte character
+ | n < 0x80 = do
+#if defined(ASSERTS)
+ assert (i >= 0) . assert (i < A.length marr) $ return ()
+#endif
+ writeAt i n
+ return 1
+
+ -- Two-byte character
+ | n < 0x0800 = do
+#if defined(ASSERTS)
+ assert (i >= 0) . assert (i + 1 < A.length marr) $ return ()
+#endif
+ writeAt i $ (n `shiftR` 6) + 0xC0
+ writeAt (i + 1) $ (n .&. 0x3F) + 0x80
+ return 2
+
+ -- Three-byte character
+ | n < 0x10000 = do
+#if defined(ASSERTS)
+ assert (i >= 0) . assert (i + 2 < A.length marr) $ return ()
+#endif
+ writeAt i $ (n `shiftR` 12) + 0xE0
+ writeAt (i + 1) $ ((n `shiftR` 6) .&. 0x3F) + 0x80
+ writeAt (i + 2) $ (n .&. 0x3F) + 0x80
+ return 3
+
+ -- Four-byte character
+ | otherwise = do
+#if defined(ASSERTS)
+ assert (i >= 0) . assert (i + 3 < A.length marr) $ return ()
+#endif
+ writeAt i $ (n `shiftR` 18) + 0xF0
+ writeAt (i + 1) $ ((n `shiftR` 12) .&. 0x3F) + 0x80
+ writeAt (i + 2) $ ((n `shiftR` 6) .&. 0x3F) + 0x80
+ writeAt (i + 3) $ (n .&. 0x3F) + 0x80
+ return 4
+ where
+ n = ord c
+ writeAt i' n' = A.unsafeWrite marr i' (fromIntegral n')
+ {-# INLINE writeAt #-}
+{-# INLINE unsafeWrite #-}
+
+{-
+unsafeWriteRev :: A.MArray s Word16 -> Int -> Char -> ST s Int
+unsafeWriteRev marr i c
+ | n < 0x10000 = do
+ assert (i >= 0) . assert (i < A.length marr) $
+ A.unsafeWrite marr i (fromIntegral n)
+ return (i-1)
+ | otherwise = do
+ assert (i >= 1) . assert (i < A.length marr) $
+ A.unsafeWrite marr (i-1) lo
+ A.unsafeWrite marr i hi
+ return (i-2)
+ where n = ord c
+ m = n - 0x10000
+ lo = fromIntegral $ (m `shiftR` 10) + 0xD800
+ hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
+{-# INLINE unsafeWriteRev #-}
+-}
diff --git a/Data/Text/Internal/Unsafe/Shift.hs b/Data/Text/Internal/Unsafe/Shift.hs
new file mode 100644
index 0000000..b2fef9b
--- /dev/null
+++ b/Data/Text/Internal/Unsafe/Shift.hs
@@ -0,0 +1,72 @@
+{-# LANGUAGE MagicHash #-}
+
+-- |
+-- Module : Data.Text.Internal.Unsafe.Shift
+-- Copyright : (c) Bryan O'Sullivan 2009
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- /Warning/: this is an internal module, and does not have a stable
+-- API or name. Functions in this module may not check or enforce
+-- preconditions expected by public modules. Use at your own risk!
+--
+-- Fast, unchecked bit shifting functions.
+
+module Data.Text.Internal.Unsafe.Shift
+ (
+ UnsafeShift(..)
+ ) where
+
+-- import qualified Data.Bits as Bits
+import GHC.Base
+import GHC.Word
+
+-- | This is a workaround for poor optimisation in GHC 6.8.2. It
+-- fails to notice constant-width shifts, and adds a test and branch
+-- to every shift. This imposes about a 10% performance hit.
+--
+-- These functions are undefined when the amount being shifted by is
+-- greater than the size in bits of a machine Int#.
+class UnsafeShift a where
+ shiftL :: a -> Int -> a
+ shiftR :: a -> Int -> a
+
+-- The narrowNWord# after each left shift re-truncates bits that were
+-- shifted beyond the type's width, keeping the boxed value in range.
+instance UnsafeShift Word16 where
+ {-# INLINE shiftL #-}
+ shiftL (W16# x#) (I# i#) = W16# (narrow16Word# (x# `uncheckedShiftL#` i#))
+
+ {-# INLINE shiftR #-}
+ shiftR (W16# x#) (I# i#) = W16# (x# `uncheckedShiftRL#` i#)
+
+instance UnsafeShift Word32 where
+ {-# INLINE shiftL #-}
+ shiftL (W32# x#) (I# i#) = W32# (narrow32Word# (x# `uncheckedShiftL#` i#))
+
+ {-# INLINE shiftR #-}
+ shiftR (W32# x#) (I# i#) = W32# (x# `uncheckedShiftRL#` i#)
+
+instance UnsafeShift Word64 where
+ {-# INLINE shiftL #-}
+ shiftL (W64# x#) (I# i#) = W64# (x# `uncheckedShiftL64#` i#)
+
+ {-# INLINE shiftR #-}
+ shiftR (W64# x#) (I# i#) = W64# (x# `uncheckedShiftRL64#` i#)
+
+-- Unlike the Word instances, right shift on Int is arithmetic
+-- (iShiftRA#): the sign bit is replicated.
+instance UnsafeShift Int where
+ {-# INLINE shiftL #-}
+ shiftL (I# x#) (I# i#) = I# (x# `iShiftL#` i#)
+
+ {-# INLINE shiftR #-}
+ shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
+
+{-
+instance UnsafeShift Integer where
+ {-# INLINE shiftL #-}
+ shiftL = Bits.shiftL
+
+ {-# INLINE shiftR #-}
+ shiftR = Bits.shiftR
+-}
diff --git a/Data/Text/Lazy.hs b/Data/Text/Lazy.hs
new file mode 100644
index 0000000..ae744ad
--- /dev/null
+++ b/Data/Text/Lazy.hs
@@ -0,0 +1,1729 @@
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+{-# LANGUAGE BangPatterns, MagicHash, CPP #-}
+#if __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+#if __GLASGOW_HASKELL__ >= 708
+{-# LANGUAGE TypeFamilies #-}
+#endif
+
+-- |
+-- Module : Data.Text.Lazy
+-- Copyright : (c) 2009, 2010, 2012 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Portability : GHC
+--
+-- A time and space-efficient implementation of Unicode text using
+-- lists of packed arrays.
+--
+-- /Note/: Read below the synopsis for important notes on the use of
+-- this module.
+--
+-- The representation used by this module is suitable for high
+-- performance use and for streaming large quantities of data. It
+-- provides a means to manipulate a large body of text without
+-- requiring that the entire content be resident in memory.
+--
+-- Some operations, such as 'concat', 'append', 'reverse' and 'cons',
+-- have better time complexity than their "Data.Text" equivalents, due
+-- to the underlying representation being a list of chunks. For other
+-- operations, lazy 'Text's are usually within a few percent of strict
+-- ones, but often with better heap usage if used in a streaming
+-- fashion. For data larger than available memory, or if you have
+-- tight memory constraints, this module will be the only option.
+--
+-- This module is intended to be imported @qualified@, to avoid name
+-- clashes with "Prelude" functions. eg.
+--
+-- > import qualified Data.Text.Lazy as L
+
+module Data.Text.Lazy
+ (
+ -- * Fusion
+ -- $fusion
+
+ -- * Acceptable data
+ -- $replacement
+
+ -- * Types
+ Text
+
+ -- * Creation and elimination
+ , pack
+ , unpack
+ , singleton
+ , empty
+ , fromChunks
+ , toChunks
+ , toStrict
+ , fromStrict
+ , foldrChunks
+ , foldlChunks
+
+ -- * Basic interface
+ , cons
+ , snoc
+ , append
+ , uncons
+ , unsnoc
+ , head
+ , last
+ , tail
+ , init
+ , null
+ , length
+ , compareLength
+
+ -- * Transformations
+ , map
+ , intercalate
+ , intersperse
+ , transpose
+ , reverse
+ , replace
+
+ -- ** Case conversion
+ -- $case
+ , toCaseFold
+ , toLower
+ , toUpper
+ , toTitle
+
+ -- ** Justification
+ , justifyLeft
+ , justifyRight
+ , center
+
+ -- * Folds
+ , foldl
+ , foldl'
+ , foldl1
+ , foldl1'
+ , foldr
+ , foldr1
+
+ -- ** Special folds
+ , concat
+ , concatMap
+ , any
+ , all
+ , maximum
+ , minimum
+
+ -- * Construction
+
+ -- ** Scans
+ , scanl
+ , scanl1
+ , scanr
+ , scanr1
+
+ -- ** Accumulating maps
+ , mapAccumL
+ , mapAccumR
+
+ -- ** Generation and unfolding
+ , repeat
+ , replicate
+ , cycle
+ , iterate
+ , unfoldr
+ , unfoldrN
+
+ -- * Substrings
+
+ -- ** Breaking strings
+ , take
+ , takeEnd
+ , drop
+ , dropEnd
+ , takeWhile
+ , takeWhileEnd
+ , dropWhile
+ , dropWhileEnd
+ , dropAround
+ , strip
+ , stripStart
+ , stripEnd
+ , splitAt
+ , span
+ , breakOn
+ , breakOnEnd
+ , break
+ , group
+ , groupBy
+ , inits
+ , tails
+
+ -- ** Breaking into many substrings
+ -- $split
+ , splitOn
+ , split
+ , chunksOf
+ -- , breakSubstring
+
+ -- ** Breaking into lines and words
+ , lines
+ , words
+ , unlines
+ , unwords
+
+ -- * Predicates
+ , isPrefixOf
+ , isSuffixOf
+ , isInfixOf
+
+ -- ** View patterns
+ , stripPrefix
+ , stripSuffix
+ , commonPrefixes
+
+ -- * Searching
+ , filter
+ , find
+ , breakOnAll
+ , partition
+
+ -- , findSubstring
+
+ -- * Indexing
+ , index
+ , count
+
+ -- * Zipping and unzipping
+ , zip
+ , zipWith
+
+ -- -* Ordered text
+ -- , sort
+ ) where
+
+import Prelude (Char, Bool(..), Maybe(..), String,
+ Eq(..), Ord(..), Ordering(..), Read(..), Show(..),
+ (&&), (||), (+), (-), (.), ($), (++),
+ error, flip, fmap, fromIntegral, not, otherwise, quot)
+import qualified Prelude as P
+#if defined(HAVE_DEEPSEQ)
+import Control.DeepSeq (NFData(..))
+#endif
+import Data.Int (Int64)
+import qualified Data.List as L
+import Data.Char (isSpace)
+import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf), constrIndex,
+ Constr, mkConstr, DataType, mkDataType, Fixity(Prefix))
+import Data.Binary (Binary(get, put))
+import Data.Monoid (Monoid(..))
+#if MIN_VERSION_base(4,9,0)
+import Data.Semigroup (Semigroup(..))
+#endif
+import Data.String (IsString(..))
+import qualified Data.Text as T
+import qualified Data.Text.Internal as T
+import qualified Data.Text.Internal.Fusion.Common as S
+import qualified Data.Text.Unsafe as T
+import qualified Data.Text.Internal.Lazy.Fusion as S
+import Data.Text.Internal.Fusion.Types (PairS(..))
+import Data.Text.Internal.Lazy.Fusion (stream, unstream)
+import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldlChunks,
+ foldrChunks, smallChunkSize)
+import Data.Text.Internal (firstf, safe, text)
+import Data.Text.Lazy.Encoding (decodeUtf8', encodeUtf8)
+import qualified Data.Text.Internal.Functions as F
+import Data.Text.Internal.Lazy.Search (indices)
+#if __GLASGOW_HASKELL__ >= 702
+import qualified GHC.CString as GHC
+#else
+import qualified GHC.Base as GHC
+#endif
+#if __GLASGOW_HASKELL__ >= 708
+import qualified GHC.Exts as Exts
+#endif
+import GHC.Prim (Addr#)
+#if MIN_VERSION_base(4,7,0)
+import Text.Printf (PrintfArg, formatArg, formatString)
+#endif
+
+-- $fusion
+--
+-- Most of the functions in this module are subject to /fusion/,
+-- meaning that a pipeline of such functions will usually allocate at
+-- most one 'Text' value.
+--
+-- As an example, consider the following pipeline:
+--
+-- > import Data.Text.Lazy as T
+-- > import Data.Text.Lazy.Encoding as E
+-- > import Data.ByteString.Lazy (ByteString)
+-- >
+-- > countChars :: ByteString -> Int
+-- > countChars = T.length . T.toUpper . E.decodeUtf8
+--
+-- From the type signatures involved, this looks like it should
+-- allocate one 'ByteString' value, and two 'Text' values. However,
+-- when a module is compiled with optimisation enabled under GHC, the
+-- two intermediate 'Text' values will be optimised away, and the
+-- function will be compiled down to a single loop over the source
+-- 'ByteString'.
+--
+-- Functions that can be fused by the compiler are documented with the
+-- phrase \"Subject to fusion\".
+
+-- $replacement
+--
+-- A 'Text' value is a sequence of Unicode scalar values, as defined
+-- in
+-- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35 §3.9, definition D76 of the Unicode 5.2 standard >.
+-- As such, a 'Text' cannot contain values in the range U+D800 to
+-- U+DFFF inclusive. Haskell implementations admit all Unicode code
+-- points
+-- (<http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=13 §3.4, definition D10 >)
+-- as 'Char' values, including code points from this invalid range.
+-- This means that there are some 'Char' values that are not valid
+-- Unicode scalar values, and the functions in this module must handle
+-- those cases.
+--
+-- Within this module, many functions construct a 'Text' from one or
+-- more 'Char' values. Those functions will substitute 'Char' values
+-- that are not valid Unicode scalar values with the replacement
+-- character \"&#xfffd;\" (U+FFFD). Functions that perform this
+-- inspection and replacement are documented with the phrase
+-- \"Performs replacement on invalid scalar values\".
+--
+-- (One reason for this policy of replacement is that internally, a
+-- 'Text' value is represented as packed UTF-8 data. Values in the
+-- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate
+-- code points, and so cannot be represented. The functions replace
+-- invalid scalar values, instead of dropping them, as a security
+-- measure. For details, see
+-- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters Unicode Technical Report 36, §3.5 >.)
+
+-- | Chunk-boundary-agnostic equality: when the two leading chunks have
+-- different byte ('Word8') lengths, the longer one is split with
+-- 'T.takeWord8'\/'T.dropWord8' so that texts with different chunking
+-- still compare equal.
+equal :: Text -> Text -> Bool
+equal Empty Empty = True
+equal Empty _ = False
+equal _ Empty = False
+equal (Chunk a as) (Chunk b bs) =
+ case compare lenA lenB of
+ LT -> a == (T.takeWord8 lenA b) &&
+ as `equal` Chunk (T.dropWord8 lenA b) bs
+ EQ -> a == b && as `equal` bs
+ GT -> T.takeWord8 lenB a == b &&
+ Chunk (T.dropWord8 lenB a) as `equal` bs
+ where lenA = T.lengthWord8 a
+ lenB = T.lengthWord8 b
+
+instance Eq Text where
+ (==) = equal
+ {-# INLINE (==) #-}
+
+instance Ord Text where
+ compare = compareText
+
+-- | Lexicographic comparison by code point, independent of chunking:
+-- 'T.iter' decodes one 'Char' at a time from each side; when either
+-- leading chunk is exhausted, the unconsumed suffix of the other is
+-- re-wrapped (via the 'chunk' smart constructor) and comparison
+-- restarts on the tails.
+compareText :: Text -> Text -> Ordering
+compareText Empty Empty = EQ
+compareText Empty _ = LT
+compareText _ Empty = GT
+compareText (Chunk a0 as) (Chunk b0 bs) = outer a0 b0
+ where
+ outer ta@(T.Text arrA offA lenA) tb@(T.Text arrB offB lenB) = go 0 0
+ where
+ go !i !j
+ | i >= lenA = compareText as (chunk (T.Text arrB (offB+j) (lenB-j)) bs)
+ | j >= lenB = compareText (chunk (T.Text arrA (offA+i) (lenA-i)) as) bs
+ | a < b = LT
+ | a > b = GT
+ | otherwise = go (i+di) (j+dj)
+ where T.Iter a di = T.iter ta i
+ T.Iter b dj = T.iter tb j
+
+instance Show Text where
+ showsPrec p ps r = showsPrec p (unpack ps) r
+
+instance Read Text where
+ readsPrec p str = [(pack x,y) | (x,y) <- readsPrec p str]
+
+#if MIN_VERSION_base(4,9,0)
+-- | Non-orphan 'Semigroup' instance only defined for
+-- @base-4.9.0.0@ and later; orphan instances for older GHCs are
+-- provided by
+-- the [semigroups](http://hackage.haskell.org/package/semigroups)
+-- package
+--
+-- @since 1.2.2.0
+instance Semigroup Text where
+ (<>) = append
+#endif
+
+instance Monoid Text where
+ mempty = empty
+#if MIN_VERSION_base(4,9,0)
+ mappend = (<>) -- future-proof definition
+#else
+ mappend = append
+#endif
+ mconcat = concat
+
+instance IsString Text where
+ fromString = pack
+
+#if __GLASGOW_HASKELL__ >= 708
+-- | @since 1.2.0.0
+instance Exts.IsList Text where
+ type Item Text = Char
+ fromList = pack
+ toList = unpack
+#endif
+
+#if defined(HAVE_DEEPSEQ)
+instance NFData Text where
+ rnf Empty = ()
+ rnf (Chunk _ ts) = rnf ts
+#endif
+
+-- | @since 1.2.1.0
+instance Binary Text where
+ put t = put (encodeUtf8 t)
+ get = do
+ bs <- get
+ case decodeUtf8' bs of
+ P.Left exn -> P.fail (P.show exn)
+ P.Right a -> P.return a
+
+-- | This instance preserves data abstraction at the cost of inefficiency.
+-- We omit reflection services for the sake of data abstraction.
+--
+-- This instance was created by copying the updated behavior of
+-- @"Data.Text".@'Data.Text.Text'
+instance Data Text where
+ gfoldl f z txt = z pack `f` (unpack txt)
+ toConstr _ = packConstr
+ gunfold k z c = case constrIndex c of
+ 1 -> k (z pack)
+ _ -> error "Data.Text.Lazy.Text.gunfold"
+ dataTypeOf _ = textDataType
+
+#if MIN_VERSION_base(4,7,0)
+-- | Only defined for @base-4.7.0.0@ and later
+--
+-- @since 1.2.2.0
+instance PrintfArg Text where
+ formatArg txt = formatString $ unpack txt
+#endif
+
+packConstr :: Constr
+packConstr = mkConstr textDataType "pack" [] Prefix
+
+textDataType :: DataType
+textDataType = mkDataType "Data.Text.Lazy.Text" [packConstr]
+
+-- | /O(n)/ Convert a 'String' into a 'Text'.
+--
+-- Subject to fusion. Performs replacement on invalid scalar values.
+pack :: String -> Text
+pack = unstream . S.streamList . L.map safe
+{-# INLINE [1] pack #-}
+
+-- | /O(n)/ Convert a 'Text' into a 'String'.
+-- Subject to fusion.
+unpack :: Text -> String
+unpack t = S.unstreamList (stream t)
+{-# INLINE [1] unpack #-}
+
+-- | /O(n)/ Convert a literal string into a Text.
+unpackCString# :: Addr# -> Text
+unpackCString# addr# = unstream (S.streamCString# addr#)
+{-# NOINLINE unpackCString# #-}
+
+{-# RULES "TEXT literal" forall a.
+ unstream (S.streamList (L.map safe (GHC.unpackCString# a)))
+ = unpackCString# a #-}
+
+{-# RULES "TEXT literal UTF8" forall a.
+ unstream (S.streamList (L.map safe (GHC.unpackCStringUtf8# a)))
+ = unpackCString# a #-}
+
+{-# RULES "LAZY TEXT empty literal"
+ unstream (S.streamList (L.map safe []))
+ = Empty #-}
+
+-- Renamed from "LAZY TEXT empty literal", which clashed with the rule
+-- for the empty-list case directly above; rewrite-rule names should be
+-- unique so firings can be traced with -ddump-rule-firings.
+{-# RULES "LAZY TEXT singleton literal" forall a.
+ unstream (S.streamList (L.map safe [a]))
+ = Chunk (T.singleton a) Empty #-}
+
+-- | /O(1)/ Convert a character into a Text. Subject to fusion.
+-- Performs replacement on invalid scalar values.
+singleton :: Char -> Text
+singleton c = Chunk (T.singleton c) Empty
+{-# INLINE [1] singleton #-}
+
+{-# RULES
+"LAZY TEXT singleton -> fused" [~1] forall c.
+ singleton c = unstream (S.singleton c)
+"LAZY TEXT singleton -> unfused" [1] forall c.
+ unstream (S.singleton c) = singleton c
+ #-}
+
+-- | /O(c)/ Assemble a lazy 'Text' from a list of strict chunks, using
+-- the 'chunk' smart constructor for each cons cell.
+fromChunks :: [T.Text] -> Text
+fromChunks = L.foldr chunk Empty
+
+-- | /O(n)/ Flatten a lazy 'Text' into its list of strict chunks.
+toChunks :: Text -> [T.Text]
+toChunks = foldrChunks (:) []
+
+-- | /O(n)/ Convert a lazy 'Text' into a strict 'T.Text'.
+toStrict :: Text -> T.Text
+toStrict t = T.concat (toChunks t)
+{-# INLINE [1] toStrict #-}
+
+-- | /O(c)/ Convert a strict 'T.Text' into a lazy 'Text'.
+fromStrict :: T.Text -> Text
+fromStrict t = chunk t Empty
+{-# INLINE [1] fromStrict #-}
+
+-- -----------------------------------------------------------------------------
+-- * Basic functions
+
+-- | /O(1)/ Adds a character to the front of a 'Text'. Subject to fusion.
+cons :: Char -> Text -> Text
+cons c t = Chunk (T.singleton c) t
+{-# INLINE [1] cons #-}
+
+infixr 5 `cons`
+
+{-# RULES
+"LAZY TEXT cons -> fused" [~1] forall c t.
+ cons c t = unstream (S.cons c (stream t))
+"LAZY TEXT cons -> unfused" [1] forall c t.
+ unstream (S.cons c (stream t)) = cons c t
+ #-}
+
+-- | /O(n)/ Adds a character to the end of a 'Text'. This copies the
+-- entire array in the process, unless fused. Subject to fusion.
+snoc :: Text -> Char -> Text
+snoc t c = foldrChunks Chunk (singleton c) t
+{-# INLINE [1] snoc #-}
+
+{-# RULES
+"LAZY TEXT snoc -> fused" [~1] forall t c.
+ snoc t c = unstream (S.snoc (stream t) c)
+"LAZY TEXT snoc -> unfused" [1] forall t c.
+ unstream (S.snoc (stream t) c) = snoc t c
+ #-}
+
+-- | /O(n\/c)/ Appends one 'Text' to another. Subject to fusion.
+append :: Text -> Text -> Text
+append xs ys = foldrChunks Chunk ys xs
+{-# INLINE [1] append #-}
+
+{-# RULES
+"LAZY TEXT append -> fused" [~1] forall t1 t2.
+ append t1 t2 = unstream (S.append (stream t1) (stream t2))
+"LAZY TEXT append -> unfused" [1] forall t1 t2.
+ unstream (S.append (stream t1) (stream t2)) = append t1 t2
+ #-}
+
+-- | /O(1)/ Returns the first character and rest of a 'Text', or
+-- 'Nothing' if empty. Subject to fusion.
+uncons :: Text -> Maybe (Char, Text)
+uncons Empty = Nothing
+uncons (Chunk t ts) = Just (T.unsafeHead t, ts')
+ -- When the head chunk held exactly one character, drop it entirely
+ -- rather than keeping an empty chunk; ts' is computed lazily, so the
+ -- length check only runs if the tail is demanded.
+ where ts' | T.compareLength t 1 == EQ = ts
+ | otherwise = Chunk (T.unsafeTail t) ts
+{-# INLINE uncons #-}
+
+-- | /O(1)/ First character of a non-empty 'Text'; errors on empty
+-- input (via 'S.head').  Subject to fusion.
+head :: Text -> Char
+head = S.head . stream
+{-# INLINE head #-}
+
+-- | /O(1)/ Returns all characters after the head of a 'Text', which
+-- must be non-empty. Subject to fusion.
+tail :: Text -> Text
+tail (Chunk t ts) = chunk (T.tail t) ts
+tail Empty = emptyError "tail"
+{-# INLINE [1] tail #-}
+
+{-# RULES
+"LAZY TEXT tail -> fused" [~1] forall t.
+ tail t = unstream (S.tail (stream t))
+"LAZY TEXT tail -> unfused" [1] forall t.
+ unstream (S.tail (stream t)) = tail t
+ #-}
+
+-- | /O(n\/c)/ Returns all but the last character of a 'Text', which must
+-- be non-empty. Subject to fusion.
+init :: Text -> Text
+init (Chunk t0 ts0) = go t0 ts0
+ where go t (Chunk t' ts) = Chunk t (go t' ts)
+ go t Empty = chunk (T.init t) Empty
+init Empty = emptyError "init"
+{-# INLINE [1] init #-}
+
+{-# RULES
+"LAZY TEXT init -> fused" [~1] forall t.
+ init t = unstream (S.init (stream t))
+"LAZY TEXT init -> unfused" [1] forall t.
+ unstream (S.init (stream t)) = init t
+ #-}
+
+-- | /O(n\/c)/ Returns the 'init' and 'last' of a 'Text', or 'Nothing' if
+-- empty.
+--
+-- * It is no faster than using 'init' and 'last'.
+--
+-- @since 1.2.3.0
+unsnoc :: Text -> Maybe (Text, Char)
+unsnoc Empty = Nothing
+unsnoc ts@(Chunk _ _) = Just (init ts, last ts)
+{-# INLINE unsnoc #-}
+
+-- | /O(1)/ Tests whether a 'Text' is empty or not. Subject to
+-- fusion.
+null :: Text -> Bool
+null Empty = True
+null _ = False
+{-# INLINE [1] null #-}
+
+{-# RULES
+"LAZY TEXT null -> fused" [~1] forall t.
+ null t = S.null (stream t)
+"LAZY TEXT null -> unfused" [1] forall t.
+ S.null (stream t) = null t
+ #-}
+
+-- | /O(1)/ Tests whether a 'Text' contains exactly one character.
+-- Subject to fusion.
+isSingleton :: Text -> Bool
+isSingleton = S.isSingleton . stream
+{-# INLINE isSingleton #-}
+
+-- | /O(n\/c)/ Returns the last character of a 'Text', which must be
+-- non-empty. Subject to fusion.
+last :: Text -> Char
+last Empty = emptyError "last"
+last (Chunk t ts) = go t ts
+ where go _ (Chunk t' ts') = go t' ts'
+ go t' Empty = T.last t'
+{-# INLINE [1] last #-}
+
+{-# RULES
+"LAZY TEXT last -> fused" [~1] forall t.
+ last t = S.last (stream t)
+"LAZY TEXT last -> unfused" [1] forall t.
+ S.last (stream t) = last t
+ #-}
+
+-- | /O(n)/ Returns the number of characters in a 'Text'.
+-- Subject to fusion.
+length :: Text -> Int64
+length = foldlChunks go 0
+ where go l t = l + fromIntegral (T.length t)
+{-# INLINE [1] length #-}
+
+{-# RULES
+"LAZY TEXT length -> fused" [~1] forall t.
+ length t = S.length (stream t)
+"LAZY TEXT length -> unfused" [1] forall t.
+ S.length (stream t) = length t
+ #-}
+
+-- | /O(n)/ Compare the count of characters in a 'Text' to a number.
+-- Subject to fusion.
+--
+-- This function gives the same answer as comparing against the result
+-- of 'length', but can short circuit if the count of characters is
+-- greater than the number, and hence be more efficient.
+compareLength :: Text -> Int64 -> Ordering
+compareLength t n = S.compareLengthI (stream t) n
+{-# INLINE [1] compareLength #-}
+
+-- We don't apply those otherwise appealing length-to-compareLength
+-- rewrite rules here, because they can change the strictness
+-- properties of code.
+
+-- | /O(n)/ 'map' @f@ @t@ is the 'Text' obtained by applying @f@ to
+-- each element of @t@. Subject to fusion. Performs replacement on
+-- invalid scalar values.
+map :: (Char -> Char) -> Text -> Text
+map f t = unstream (S.map (safe . f) (stream t))
+{-# INLINE [1] map #-}
+
+-- | /O(n)/ Concatenate a list of 'Text's with the separator placed
+-- between every pair of adjacent elements.
+intercalate :: Text -> [Text] -> Text
+intercalate sep = concat . F.intersperse sep
+{-# INLINE intercalate #-}
+
+-- | /O(n)/ The 'intersperse' function takes a character and places it
+-- between the characters of a 'Text'. Subject to fusion. Performs
+-- replacement on invalid scalar values.
+intersperse :: Char -> Text -> Text
+intersperse c t = unstream (S.intersperse (safe c) (stream t))
+{-# INLINE intersperse #-}
+
+-- | /O(n)/ Left-justify a string to the given length, using the
+-- specified fill character on the right. Subject to fusion. Performs
+-- replacement on invalid scalar values.
+--
+-- Examples:
+--
+-- > justifyLeft 7 'x' "foo" == "fooxxxx"
+-- > justifyLeft 3 'x' "foobar" == "foobar"
+justifyLeft :: Int64 -> Char -> Text -> Text
+justifyLeft k c t
+ | len >= k = t
+ | otherwise = t `append` replicateChar (k-len) c
+ where len = length t
+{-# INLINE [1] justifyLeft #-}
+
+{-# RULES
+"LAZY TEXT justifyLeft -> fused" [~1] forall k c t.
+ justifyLeft k c t = unstream (S.justifyLeftI k c (stream t))
+"LAZY TEXT justifyLeft -> unfused" [1] forall k c t.
+ unstream (S.justifyLeftI k c (stream t)) = justifyLeft k c t
+ #-}
+
+-- | /O(n)/ Right-justify a string to the given length, using the
+-- specified fill character on the left. Performs replacement on
+-- invalid scalar values.
+--
+-- Examples:
+--
+-- > justifyRight 7 'x' "bar" == "xxxxbar"
+-- > justifyRight 3 'x' "foobar" == "foobar"
+justifyRight :: Int64 -> Char -> Text -> Text
+justifyRight k c t
+ | len >= k = t
+ | otherwise = replicateChar (k-len) c `append` t
+ where len = length t
+{-# INLINE justifyRight #-}
+
+-- | /O(n)/ Center a string to the given length, using the specified
+-- fill character on either side. Performs replacement on invalid
+-- scalar values.
+--
+-- Examples:
+--
+-- > center 8 'x' "HS" = "xxxHSxxx"
+center :: Int64 -> Char -> Text -> Text
+center k c t
+ | len >= k = t
+ | otherwise = replicateChar l c `append` t `append` replicateChar r c
+ where len = length t
+ d = k - len
+ r = d `quot` 2
+ l = d - r
+{-# INLINE center #-}
+
+-- | /O(n)/ The 'transpose' function transposes the rows and columns
+-- of its 'Text' argument. Note that this function uses 'pack',
+-- 'unpack', and the list version of transpose, and is thus not very
+-- efficient.
+transpose :: [Text] -> [Text]
+transpose ts = L.map (\ss -> Chunk (T.pack ss) Empty)
+ (L.transpose (L.map unpack ts))
+-- TODO: make this fast
+
+-- | /O(n)/ Reverse the characters of a 'Text'.  Walks the chunk list
+-- with an accumulator, reversing each strict chunk as it is consed on.
+reverse :: Text -> Text
+reverse = go Empty
+ where go acc Empty = acc
+ go acc (Chunk c cs) = go (Chunk (T.reverse c) acc) cs
+
+-- | /O(m+n)/ Replace every non-overlapping occurrence of @needle@ in
+-- @haystack@ with @replacement@.
+--
+-- This function behaves as though it was defined as follows:
+--
+-- @
+-- replace needle replacement haystack =
+-- 'intercalate' replacement ('splitOn' needle haystack)
+-- @
+--
+-- As this suggests, each occurrence is replaced exactly once. So if
+-- @needle@ occurs in @replacement@, that occurrence will /not/ itself
+-- be replaced recursively:
+--
+-- > replace "oo" "foo" "oo" == "foo"
+--
+-- In cases where several instances of @needle@ overlap, only the
+-- first one will be replaced:
+--
+-- > replace "ofo" "bar" "ofofo" == "barfo"
+--
+-- In (unlikely) bad cases, this function's time complexity degrades
+-- towards /O(n*m)/.
+replace :: Text
+ -- ^ @needle@ to search for. If this string is empty, an
+ -- error will occur.
+ -> Text
+ -- ^ @replacement@ to replace @needle@ with.
+ -> Text
+ -- ^ @haystack@ in which to search.
+ -> Text
+-- The implementation is literally the documented specification;
+-- 'splitOn' raises the documented error on an empty needle.
+replace s d = intercalate d . splitOn s
+{-# INLINE replace #-}
+
+-- ----------------------------------------------------------------------------
+-- ** Case conversions (folds)
+
+-- $case
+--
+-- With Unicode text, it is incorrect to use combinators like @map
+-- toUpper@ to case convert each character of a string individually.
+-- Instead, use the whole-string case conversion functions from this
+-- module. For correctness in different writing systems, these
+-- functions may map one input character to two or three output
+-- characters.
+
+-- | /O(n)/ Convert a string to folded case. Subject to fusion.
+--
+-- This function is mainly useful for performing caseless (or case
+-- insensitive) string comparisons.
+--
+-- A string @x@ is a caseless match for a string @y@ if and only if:
+--
+-- @toCaseFold x == toCaseFold y@
+--
+-- The result string may be longer than the input string, and may
+-- differ from applying 'toLower' to the input string. For instance,
+-- the Armenian small ligature men now (U+FB13) is case folded to the
+-- bigram men now (U+0574 U+0576), while the micro sign (U+00B5) is
+-- case folded to the Greek small letter mu (U+03BC) instead of
+-- itself.
+toCaseFold :: Text -> Text
+-- Single pass through the fusion pipeline: stream, fold, rebuild.
+toCaseFold t = unstream (S.toCaseFold (stream t))
+{-# INLINE [0] toCaseFold #-}
+
+-- | /O(n)/ Convert a string to lower case, using simple case
+-- conversion. Subject to fusion.
+--
+-- The result string may be longer than the input string. For
+-- instance, the Latin capital letter I with dot above (U+0130) maps
+-- to the sequence Latin small letter i (U+0069) followed by combining
+-- dot above (U+0307).
+toLower :: Text -> Text
+-- Single pass via the stream transformer.
+toLower t = unstream (S.toLower (stream t))
+{-# INLINE toLower #-}
+
+-- | /O(n)/ Convert a string to upper case, using simple case
+-- conversion. Subject to fusion.
+--
+-- The result string may be longer than the input string. For
+-- instance, the German eszett (U+00DF) maps to the two-letter
+-- sequence SS.
+toUpper :: Text -> Text
+-- Single pass via the stream transformer.
+toUpper t = unstream (S.toUpper (stream t))
+{-# INLINE toUpper #-}
+
+
+-- | /O(n)/ Convert a string to title case, using simple case
+-- conversion. Subject to fusion.
+--
+-- The first letter of the input is converted to title case, as is
+-- every subsequent letter that immediately follows a non-letter.
+-- Every letter that immediately follows another letter is converted
+-- to lower case.
+--
+-- The result string may be longer than the input string. For example,
+-- the Latin small ligature &#xfb02; (U+FB02) is converted to the
+-- sequence Latin capital letter F (U+0046) followed by Latin small
+-- letter l (U+006C).
+--
+-- /Note/: this function does not take language or culture specific
+-- rules into account. For instance, in English, different style
+-- guides disagree on whether the book name \"The Hill of the Red
+-- Fox\" is correctly title cased&#x2014;but this function will
+-- capitalize /every/ word.
+--
+-- @since 1.0.0.0
+toTitle :: Text -> Text
+-- Single pass via the stream transformer.
+toTitle t = unstream (S.toTitle (stream t))
+{-# INLINE toTitle #-}
+
+-- | /O(n)/ 'foldl', applied to a binary operator, a starting value
+-- (typically the left-identity of the operator), and a 'Text',
+-- reduces the 'Text' using the binary operator, from left to right.
+-- Subject to fusion.
+foldl :: (a -> Char -> a) -> a -> Text -> a
+-- Delegates to the stream fold; lazy in the accumulator (see 'foldl'').
+foldl f z t = S.foldl f z (stream t)
+{-# INLINE foldl #-}
+
+-- | /O(n)/ A strict version of 'foldl'.
+-- Subject to fusion.
+foldl' :: (a -> Char -> a) -> a -> Text -> a
+-- Strict in the accumulator, avoiding thunk build-up on long inputs.
+foldl' f z t = S.foldl' f z (stream t)
+{-# INLINE foldl' #-}
+
+-- | /O(n)/ A variant of 'foldl' that has no starting value argument,
+-- and thus must be applied to a non-empty 'Text'. Subject to fusion.
+-- Partial: errors on an empty input, per the non-emptiness contract.
+foldl1 :: (Char -> Char -> Char) -> Text -> Char
+foldl1 f t = S.foldl1 f (stream t)
+{-# INLINE foldl1 #-}
+
+-- | /O(n)/ A strict version of 'foldl1'. Subject to fusion.
+-- Partial: errors on an empty input, per the non-emptiness contract.
+foldl1' :: (Char -> Char -> Char) -> Text -> Char
+foldl1' f t = S.foldl1' f (stream t)
+{-# INLINE foldl1' #-}
+
+-- | /O(n)/ 'foldr', applied to a binary operator, a starting value
+-- (typically the right-identity of the operator), and a 'Text',
+-- reduces the 'Text' using the binary operator, from right to left.
+-- Subject to fusion.
+foldr :: (Char -> a -> a) -> a -> Text -> a
+-- Delegates to the stream fold; lazy, so it can short-circuit.
+foldr f z t = S.foldr f z (stream t)
+{-# INLINE foldr #-}
+
+-- | /O(n)/ A variant of 'foldr' that has no starting value argument,
+-- and thus must be applied to a non-empty 'Text'. Subject to
+-- fusion.
+-- Partial: errors on an empty input, per the non-emptiness contract.
+foldr1 :: (Char -> Char -> Char) -> Text -> Char
+foldr1 f t = S.foldr1 f (stream t)
+{-# INLINE foldr1 #-}
+
+-- | /O(n)/ Concatenate a list of 'Text's.
+concat :: [Text] -> Text
+-- 'go' splices one lazy text's chunks onto the concatenation of the
+-- rest ('css'); 'to' starts the next text. Chunks are reused as-is,
+-- so no character data is copied, and the result is lazy in its tail.
+concat = to
+ where
+ go Empty css = to css
+ go (Chunk c cs) css = Chunk c (go cs css)
+ to [] = Empty
+ to (cs:css) = go cs css
+{-# INLINE concat #-}
+
+-- | /O(n)/ Map a function over a 'Text' that results in a 'Text', and
+-- concatenate the results.
+concatMap :: (Char -> Text) -> Text -> Text
+-- Builds the per-character results lazily with 'foldr', then splices
+-- them together with 'concat'.
+concatMap f = concat . foldr ((:) . f) []
+{-# INLINE concatMap #-}
+
+-- | /O(n)/ 'any' @p@ @t@ determines whether any character in the
+-- 'Text' @t@ satisfies the predicate @p@. Subject to fusion.
+any :: (Char -> Bool) -> Text -> Bool
+-- Stream-based; stops at the first character satisfying @p@.
+any p t = S.any p (stream t)
+{-# INLINE any #-}
+
+-- | /O(n)/ 'all' @p@ @t@ determines whether all characters in the
+-- 'Text' @t@ satisfy the predicate @p@. Subject to fusion.
+all :: (Char -> Bool) -> Text -> Bool
+-- Stream-based; stops at the first character failing @p@.
+all p t = S.all p (stream t)
+{-# INLINE all #-}
+
+-- | /O(n)/ 'maximum' returns the maximum value from a 'Text', which
+-- must be non-empty. Subject to fusion.
+-- Partial: errors on an empty input, per the non-emptiness contract.
+maximum :: Text -> Char
+maximum t = S.maximum (stream t)
+{-# INLINE maximum #-}
+
+-- | /O(n)/ 'minimum' returns the minimum value from a 'Text', which
+-- must be non-empty. Subject to fusion.
+-- Partial: errors on an empty input, per the non-emptiness contract.
+minimum :: Text -> Char
+minimum t = S.minimum (stream t)
+{-# INLINE minimum #-}
+
+-- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of
+-- successive reduced values from the left. Subject to fusion.
+-- Performs replacement on invalid scalar values.
+--
+-- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
+--
+-- Note that
+--
+-- > last (scanl f z xs) == foldl f z xs.
+scanl :: (Char -> Char -> Char) -> Char -> Text -> Text
+-- 'g' wraps the user function with 'safe' so surrogate code points
+-- produced by @f@ are replaced before being stored.
+scanl f z t = unstream (S.scanl g z (stream t))
+ where g a b = safe (f a b)
+{-# INLINE scanl #-}
+
+-- | /O(n)/ 'scanl1' is a variant of 'scanl' that has no starting
+-- value argument. Subject to fusion. Performs replacement on
+-- invalid scalar values.
+--
+-- > scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...]
+scanl1 :: (Char -> Char -> Char) -> Text -> Text
+-- The first character seeds the scan; an empty input yields empty.
+scanl1 f t0 = case uncons t0 of
+ Nothing -> empty
+ Just (t,ts) -> scanl f t ts
+{-# INLINE scanl1 #-}
+
+-- | /O(n)/ 'scanr' is the right-to-left dual of 'scanl'. Performs
+-- replacement on invalid scalar values.
+--
+-- > scanr f v == reverse . scanl (flip f) v . reverse
+scanr :: (Char -> Char -> Char) -> Char -> Text -> Text
+-- Implemented literally as its specification: reverse, scan left with
+-- flipped arguments, reverse back. Forces the whole input.
+scanr f v = reverse . scanl g v . reverse
+ where g a b = safe (f b a)
+-- | /O(n)/ 'scanr1' is a variant of 'scanr' that has no starting
+-- value argument. Performs replacement on invalid scalar values.
+scanr1 :: (Char -> Char -> Char) -> Text -> Text
+-- 'last'/'init' are safe here: the 'null' guard rules out empty input.
+scanr1 f t | null t = empty
+ | otherwise = scanr f (last t) (init t)
+
+-- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a
+-- function to each element of a 'Text', passing an accumulating
+-- parameter from left to right, and returns a final 'Text'. Performs
+-- replacement on invalid scalar values.
+mapAccumL :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
+-- Processes one strict chunk at a time with 'T.mapAccumL', threading
+-- the accumulator z -> z' -> z'' left to right through the chunks.
+mapAccumL f = go
+ where
+ go z (Chunk c cs) = (z'', Chunk c' cs')
+ where (z', c') = T.mapAccumL f z c
+ (z'', cs') = go z' cs
+ go z Empty = (z, Empty)
+{-# INLINE mapAccumL #-}
+
+-- | The 'mapAccumR' function behaves like a combination of 'map' and
+-- a strict 'foldr'; it applies a function to each element of a
+-- 'Text', passing an accumulating parameter from right to left, and
+-- returning a final value of this accumulator together with the new
+-- 'Text'. Performs replacement on invalid scalar values.
+mapAccumR :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
+-- The where-bindings are deliberately "out of order": z' comes from
+-- the recursive call on the tail, so laziness threads the accumulator
+-- right to left while the chunk spine is still built left to right.
+mapAccumR f = go
+ where
+ go z (Chunk c cs) = (z'', Chunk c' cs')
+ where (z'', c') = T.mapAccumR f z' c
+ (z', cs') = go z cs
+ go z Empty = (z, Empty)
+{-# INLINE mapAccumR #-}
+
+-- | @'repeat' x@ is an infinite 'Text', with @x@ the value of every
+-- element.
+--
+-- @since 1.2.0.5
+repeat :: Char -> Text
+-- A tied knot: one strict chunk of 'smallChunkSize' copies of @c@ is
+-- built once and shared as every link of the infinite chunk list.
+repeat c = let t = Chunk (T.replicate smallChunkSize (T.singleton c)) t
+ in t
+
+-- | /O(n*m)/ 'replicate' @n@ @t@ is a 'Text' consisting of the input
+-- @t@ repeated @n@ times.
+replicate :: Int64 -> Text -> Text
+-- Single-character inputs are routed to the fusible 'replicateChar';
+-- otherwise the input is listed n times and spliced with 'concat',
+-- sharing the original chunks rather than copying them.
+replicate n t
+ | null t || n <= 0 = empty
+ | isSingleton t = replicateChar n (head t)
+ | otherwise = concat (rep 0)
+ where rep !i | i >= n = []
+ | otherwise = t : rep (i+1)
+{-# INLINE [1] replicate #-}
+
+-- | 'cycle' ties a finite, non-empty 'Text' into a circular one, or
+-- equivalently, the infinite repetition of the original 'Text'.
+--
+-- @since 1.2.0.5
+cycle :: Text -> Text
+cycle Empty = emptyError "cycle"
+-- Tied knot: t' re-appends t's chunks onto itself, so the chunks are
+-- shared, not copied, around the cycle.
+cycle t = let t' = foldrChunks Chunk t' t
+ in t'
+
+-- | @'iterate' f x@ returns an infinite 'Text' of repeated applications
+-- of @f@ to @x@:
+--
+-- > iterate f x == [x, f x, f (f x), ...]
+--
+-- @since 1.2.0.5
+iterate :: (Char -> Char) -> Char -> Text
+-- One single-character chunk per element; the lazy spine makes the
+-- infinite structure consumable incrementally.
+iterate f c = let t c' = Chunk (T.singleton c') (t (f c'))
+ in t c
+
+-- | /O(n)/ 'replicateChar' @n@ @c@ is a 'Text' of length @n@ with @c@ the
+-- value of every element. Subject to fusion.
+replicateChar :: Int64 -> Char -> Text
+-- 'safe' replaces an invalid scalar value before it is replicated.
+replicateChar n c = unstream (S.replicateCharI n (safe c))
+{-# INLINE replicateChar #-}
+
+{-# RULES
+"LAZY TEXT replicate/singleton -> replicateChar" [~1] forall n c.
+ replicate n (singleton c) = replicateChar n c
+ #-}
+
+-- | /O(n)/, where @n@ is the length of the result. The 'unfoldr'
+-- function is analogous to the List 'L.unfoldr'. 'unfoldr' builds a
+-- 'Text' from a seed value. The function takes the element and
+-- returns 'Nothing' if it is done producing the 'Text', otherwise
+-- 'Just' @(a,b)@. In this case, @a@ is the next 'Char' in the
+-- string, and @b@ is the seed value for further production. Performs
+-- replacement on invalid scalar values.
+unfoldr :: (a -> Maybe (Char,a)) -> a -> Text
+-- 'firstf safe' sanitises only the produced character, not the seed.
+unfoldr f s = unstream (S.unfoldr (firstf safe . f) s)
+{-# INLINE unfoldr #-}
+
+-- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a 'Text' from a seed
+-- value. However, the length of the result should be limited by the
+-- first argument to 'unfoldrN'. This function is more efficient than
+-- 'unfoldr' when the maximum length of the result is known and
+-- correct, otherwise its performance is similar to 'unfoldr'.
+-- Performs replacement on invalid scalar values.
+unfoldrN :: Int64 -> (a -> Maybe (Char,a)) -> a -> Text
+-- 'firstf safe' sanitises only the produced character, not the seed.
+unfoldrN n f s = unstream (S.unfoldrN n (firstf safe . f) s)
+{-# INLINE unfoldrN #-}
+
+-- | /O(n)/ 'take' @n@, applied to a 'Text', returns the prefix of the
+-- 'Text' of length @n@, or the 'Text' itself if @n@ is greater than
+-- the length of the Text. Subject to fusion.
+take :: Int64 -> Text -> Text
+take i _ | i <= 0 = Empty
+-- Whole chunks are reused until fewer than a chunk's worth of
+-- characters remain; only the boundary chunk is cut with 'T.take'.
+take i t0 = take' i t0
+ where take' 0 _ = Empty
+ take' _ Empty = Empty
+ take' n (Chunk t ts)
+ | n < len = Chunk (T.take (fromIntegral n) t) Empty
+ | otherwise = Chunk t (take' (n - len) ts)
+ where len = fromIntegral (T.length t)
+{-# INLINE [1] take #-}
+
+{-# RULES
+"LAZY TEXT take -> fused" [~1] forall n t.
+ take n t = unstream (S.take n (stream t))
+"LAZY TEXT take -> unfused" [1] forall n t.
+ unstream (S.take n (stream t)) = take n t
+ #-}
+
+-- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix remaining after
+-- taking @n@ characters from the end of @t@.
+--
+-- Examples:
+--
+-- > takeEnd 3 "foobar" == "bar"
+--
+-- @since 1.1.1.0
+takeEnd :: Int64 -> Text -> Text
+-- Walks the chunk list back-to-front (after reversing it), reusing
+-- whole chunks and cutting only the boundary chunk with 'T.takeEnd'.
+takeEnd n t0
+ | n <= 0 = empty
+ | otherwise = takeChunk n empty . L.reverse . toChunks $ t0
+ where takeChunk _ acc [] = acc
+ takeChunk i acc (t:ts)
+ | i <= l = chunk (T.takeEnd (fromIntegral i) t) acc
+ | otherwise = takeChunk (i-l) (Chunk t acc) ts
+ where l = fromIntegral (T.length t)
+
+-- | /O(n)/ 'drop' @n@, applied to a 'Text', returns the suffix of the
+-- 'Text' after the first @n@ characters, or the empty 'Text' if @n@
+-- is greater than the length of the 'Text'. Subject to fusion.
+drop :: Int64 -> Text -> Text
+-- Whole chunks are skipped; only the boundary chunk is cut with
+-- 'T.drop', and the remaining tail is shared untouched.
+drop i t0
+ | i <= 0 = t0
+ | otherwise = drop' i t0
+ where drop' 0 ts = ts
+ drop' _ Empty = Empty
+ drop' n (Chunk t ts)
+ | n < len = Chunk (T.drop (fromIntegral n) t) ts
+ | otherwise = drop' (n - len) ts
+ where len = fromIntegral (T.length t)
+{-# INLINE [1] drop #-}
+
+{-# RULES
+"LAZY TEXT drop -> fused" [~1] forall n t.
+ drop n t = unstream (S.drop n (stream t))
+"LAZY TEXT drop -> unfused" [1] forall n t.
+ unstream (S.drop n (stream t)) = drop n t
+ #-}
+
+-- | /O(n)/ 'dropEnd' @n@ @t@ returns the prefix remaining after
+-- dropping @n@ characters from the end of @t@.
+--
+-- Examples:
+--
+-- > dropEnd 3 "foobar" == "foo"
+--
+-- @since 1.1.1.0
+dropEnd :: Int64 -> Text -> Text
+-- Walks the reversed chunk list discarding whole chunks, cuts the
+-- boundary chunk with 'T.dropEnd', then restores the original order.
+dropEnd n t0
+ | n <= 0 = t0
+ | otherwise = dropChunk n . L.reverse . toChunks $ t0
+ where dropChunk _ [] = empty
+ dropChunk m (t:ts)
+ | m >= l = dropChunk (m-l) ts
+ | otherwise = fromChunks . L.reverse $
+ T.dropEnd (fromIntegral m) t : ts
+ where l = fromIntegral (T.length t)
+
+-- | /O(n)/ 'dropWords' @n@ returns the suffix with @n@ 'Word16'
+-- values dropped, or the empty 'Text' if @n@ is greater than the
+-- number of 'Word16' values present.
+dropWords :: Int64 -> Text -> Text
+-- Counts code units (Word16), not characters: the boundary chunk is
+-- re-sliced directly via its array/offset/length representation.
+dropWords i t0
+ | i <= 0 = t0
+ | otherwise = drop' i t0
+ where drop' 0 ts = ts
+ drop' _ Empty = Empty
+ drop' n (Chunk (T.Text arr off len) ts)
+ | n < len' = chunk (text arr (off+n') (len-n')) ts
+ | otherwise = drop' (n - len') ts
+ where len' = fromIntegral len
+ n' = fromIntegral n
+
+-- | /O(n)/ 'takeWhile', applied to a predicate @p@ and a 'Text',
+-- returns the longest prefix (possibly empty) of elements that
+-- satisfy @p@. Subject to fusion.
+takeWhile :: (Char -> Bool) -> Text -> Text
+-- Scans chunk by chunk; the first chunk containing a failing
+-- character is cut there and terminates the result.
+takeWhile p t0 = takeWhile' t0
+ where takeWhile' Empty = Empty
+ takeWhile' (Chunk t ts) =
+ case T.findIndex (not . p) t of
+ Just n | n > 0 -> Chunk (T.take n t) Empty
+ | otherwise -> Empty
+ Nothing -> Chunk t (takeWhile' ts)
+{-# INLINE [1] takeWhile #-}
+
+{-# RULES
+"LAZY TEXT takeWhile -> fused" [~1] forall p t.
+ takeWhile p t = unstream (S.takeWhile p (stream t))
+"LAZY TEXT takeWhile -> unfused" [1] forall p t.
+ unstream (S.takeWhile p (stream t)) = takeWhile p t
+ #-}
+-- | /O(n)/ 'takeWhileEnd', applied to a predicate @p@ and a 'Text',
+-- returns the longest suffix (possibly empty) of elements that
+-- satisfy @p@.
+-- Examples:
+--
+-- > takeWhileEnd (=='o') "foo" == "oo"
+--
+-- @since 1.2.2.0
+takeWhileEnd :: (Char -> Bool) -> Text -> Text
+-- Walks the reversed chunk list. For each chunk, the strict
+-- 'T.takeWhileEnd' keeps the satisfying suffix t'; if t' is shorter
+-- than t (measured in Word16 code units -- this release stores text
+-- as UTF-16, and the strict 1.2.x API exposes 'lengthWord16', not
+-- lengthWord8), the scan stopped inside this chunk and we are done.
+-- Otherwise the whole chunk satisfied @p@ and we keep going left.
+takeWhileEnd :: (Char -> Bool) -> Text -> Text
+takeWhileEnd p = takeChunk empty . L.reverse . toChunks
+ where takeChunk acc [] = acc
+ takeChunk acc (t:ts)
+ | T.lengthWord16 t' < T.lengthWord16 t
+ = chunk t' acc
+ | otherwise = takeChunk (Chunk t' acc) ts
+ where t' = T.takeWhileEnd p t
+{-# INLINE takeWhileEnd #-}
+
+-- | /O(n)/ 'dropWhile' @p@ @t@ returns the suffix remaining after
+-- 'takeWhile' @p@ @t@. Subject to fusion.
+dropWhile :: (Char -> Bool) -> Text -> Text
+-- Skips whole satisfying chunks; the first chunk with a failing
+-- character is cut at that point and the tail is shared untouched.
+dropWhile p t0 = dropWhile' t0
+ where dropWhile' Empty = Empty
+ dropWhile' (Chunk t ts) =
+ case T.findIndex (not . p) t of
+ Just n -> Chunk (T.drop n t) ts
+ Nothing -> dropWhile' ts
+{-# INLINE [1] dropWhile #-}
+
+{-# RULES
+"LAZY TEXT dropWhile -> fused" [~1] forall p t.
+ dropWhile p t = unstream (S.dropWhile p (stream t))
+"LAZY TEXT dropWhile -> unfused" [1] forall p t.
+ unstream (S.dropWhile p (stream t)) = dropWhile p t
+ #-}
+
+-- | /O(n)/ 'dropWhileEnd' @p@ @t@ returns the prefix remaining after
+-- dropping characters that satisfy the predicate @p@ from the end of
+-- @t@.
+--
+-- Examples:
+--
+-- > dropWhileEnd (=='.') "foo..." == "foo"
+dropWhileEnd :: (Char -> Bool) -> Text -> Text
+-- Recurses to the last chunk first; a chunk is only trimmed (or
+-- re-examined via the @go (Chunk t Empty)@ case) when everything to
+-- its right has been dropped entirely.
+dropWhileEnd p = go
+ where go Empty = Empty
+ go (Chunk t Empty) = if T.null t'
+ then Empty
+ else Chunk t' Empty
+ where t' = T.dropWhileEnd p t
+ go (Chunk t ts) = case go ts of
+ Empty -> go (Chunk t Empty)
+ ts' -> Chunk t ts'
+{-# INLINE dropWhileEnd #-}
+
+-- | /O(n)/ 'dropAround' @p@ @t@ returns the substring remaining after
+-- dropping characters that satisfy the predicate @p@ from both the
+-- beginning and end of @t@. Subject to fusion.
+dropAround :: (Char -> Bool) -> Text -> Text
+-- Trims the end first, then the start.
+dropAround p = dropWhile p . dropWhileEnd p
+{-# INLINE [1] dropAround #-}
+
+-- | /O(n)/ Remove leading white space from a string. Equivalent to:
+--
+-- > dropWhile isSpace
+stripStart :: Text -> Text
+stripStart = dropWhile isSpace
+{-# INLINE [1] stripStart #-}
+
+-- | /O(n)/ Remove trailing white space from a string. Equivalent to:
+--
+-- > dropWhileEnd isSpace
+stripEnd :: Text -> Text
+stripEnd = dropWhileEnd isSpace
+{-# INLINE [1] stripEnd #-}
+
+-- | /O(n)/ Remove leading and trailing white space from a string.
+-- Equivalent to:
+--
+-- > dropAround isSpace
+strip :: Text -> Text
+strip = dropAround isSpace
+{-# INLINE [1] strip #-}
+
+-- | /O(n)/ 'splitAt' @n t@ returns a pair whose first element is a
+-- prefix of @t@ of length @n@, and whose second is the remainder of
+-- the string. It is equivalent to @('take' n t, 'drop' n t)@.
+splitAt :: Int64 -> Text -> (Text, Text)
+-- Only the boundary chunk is cut with 'T.splitAt'; all other chunks
+-- are shared between input and output.
+splitAt = loop
+ where loop _ Empty = (empty, empty)
+ loop n t | n <= 0 = (empty, t)
+ loop n (Chunk t ts)
+ | n < len = let (t',t'') = T.splitAt (fromIntegral n) t
+ in (Chunk t' Empty, Chunk t'' ts)
+ | otherwise = let (ts',ts'') = loop (n - len) ts
+ in (Chunk t ts', ts'')
+ where len = fromIntegral (T.length t)
+
+-- | /O(n)/ 'splitAtWord' @n t@ returns a strict pair whose first
+-- element is a prefix of @t@ whose chunks contain @n@ 'Word16'
+-- values, and whose second is the remainder of the string.
+splitAtWord :: Int64 -> Text -> PairS Text Text
+-- Splits on code-unit (Word16) counts, not characters; the boundary
+-- chunk is re-sliced via its array/offset/length representation, and
+-- the result is the strict pair constructor (:*:).
+splitAtWord _ Empty = empty :*: empty
+splitAtWord x (Chunk c@(T.Text arr off len) cs)
+ | y >= len = let h :*: t = splitAtWord (x-fromIntegral len) cs
+ in Chunk c h :*: t
+ | otherwise = chunk (text arr off y) empty :*:
+ chunk (text arr (off+y) (len-y)) cs
+ where y = fromIntegral x
+
+-- | /O(n+m)/ Find the first instance of @needle@ (which must be
+-- non-'null') in @haystack@. The first element of the returned tuple
+-- is the prefix of @haystack@ before @needle@ is matched. The second
+-- is the remainder of @haystack@, starting with the match.
+--
+-- Examples:
+--
+-- > breakOn "::" "a::b::c" ==> ("a", "::b::c")
+-- > breakOn "/" "foobar" ==> ("foobar", "")
+--
+-- Laws:
+--
+-- > append prefix match == haystack
+-- > where (prefix, match) = breakOn needle haystack
+--
+-- If you need to break a string by a substring repeatedly (e.g. you
+-- want to break on every instance of a substring), use 'breakOnAll'
+-- instead, as it has lower startup overhead.
+--
+-- This function is strict in its first argument, and lazy in its
+-- second.
+--
+-- In (unlikely) bad cases, this function's time complexity degrades
+-- towards /O(n*m)/.
+breakOn :: Text -> Text -> (Text, Text)
+-- 'indices' yields code-unit offsets of matches; only the first is
+-- needed, and 'splitAtWord' cuts the haystack at that offset.
+breakOn pat src
+ | null pat = emptyError "breakOn"
+ | otherwise = case indices pat src of
+ [] -> (src, empty)
+ (x:_) -> let h :*: t = splitAtWord x src
+ in (h, t)
+
+-- | /O(n+m)/ Similar to 'breakOn', but searches from the end of the string.
+--
+-- The first element of the returned tuple is the prefix of @haystack@
+-- up to and including the last match of @needle@. The second is the
+-- remainder of @haystack@, following the match.
+--
+-- > breakOnEnd "::" "a::b::c" ==> ("a::b::", "c")
+breakOnEnd :: Text -> Text -> (Text, Text)
+-- Reverses both strings, reuses 'breakOn', and un-reverses the halves.
+breakOnEnd pat src = let (a,b) = breakOn (reverse pat) (reverse src)
+ in (reverse b, reverse a)
+{-# INLINE breakOnEnd #-}
+
+-- | /O(n+m)/ Find all non-overlapping instances of @needle@ in
+-- @haystack@. Each element of the returned list consists of a pair:
+--
+-- * The entire string prior to the /k/th match (i.e. the prefix)
+--
+-- * The /k/th match, followed by the remainder of the string
+--
+-- Examples:
+--
+-- > breakOnAll "::" ""
+-- > ==> []
+-- > breakOnAll "/" "a/b/c/"
+-- > ==> [("a", "/b/c/"), ("a/b", "/c/"), ("a/b/c", "/")]
+--
+-- This function is strict in its first argument, and lazy in its
+-- second.
+--
+-- In (unlikely) bad cases, this function's time complexity degrades
+-- towards /O(n*m)/.
+--
+-- The @needle@ parameter may not be empty.
+breakOnAll :: Text -- ^ @needle@ to search for
+ -> Text -- ^ @haystack@ in which to search
+ -> [(Text, Text)]
+-- @n@ is the code-unit offset already consumed; each match offset @x@
+-- is relative to the whole haystack, so the cut point is @x-n@. The
+-- growing prefix @h'@ is carried forward to avoid re-concatenation.
+breakOnAll pat src
+ | null pat = emptyError "breakOnAll"
+ | otherwise = go 0 empty src (indices pat src)
+ where
+ go !n p s (x:xs) = let h :*: t = splitAtWord (x-n) s
+ h' = append p h
+ in (h',t) : go x h' t xs
+ go _ _ _ _ = []
+
+-- | /O(n)/ 'break' is like 'span', but the prefix returned is over
+-- elements that fail the predicate @p@.
+break :: (Char -> Bool) -> Text -> (Text, Text)
+-- Chunk-wise scan: a chunk with no satisfying character goes wholly
+-- into the prefix; the first satisfying position splits its chunk.
+break p t0 = break' t0
+ where break' Empty = (empty, empty)
+ break' c@(Chunk t ts) =
+ case T.findIndex p t of
+ Nothing -> let (ts', ts'') = break' ts
+ in (Chunk t ts', ts'')
+ Just n | n == 0 -> (Empty, c)
+ | otherwise -> let (a,b) = T.splitAt n t
+ in (Chunk a Empty, Chunk b ts)
+
+-- | /O(n)/ 'span', applied to a predicate @p@ and text @t@, returns
+-- a pair whose first element is the longest prefix (possibly empty)
+-- of @t@ of elements that satisfy @p@, and whose second is the
+-- remainder of the list.
+span :: (Char -> Bool) -> Text -> (Text, Text)
+-- The dual of 'break': span p == break (not . p).
+span p = break (not . p)
+{-# INLINE span #-}
+
+-- | The 'group' function takes a 'Text' and returns a list of 'Text's
+-- such that the concatenation of the result is equal to the argument.
+-- Moreover, each sublist in the result contains only equal elements.
+-- For example,
+--
+-- > group "Mississippi" = ["M","i","ss","i","ss","i","pp","i"]
+--
+-- It is a special case of 'groupBy', which allows the programmer to
+-- supply their own equality test.
+group :: Text -> [Text]
+group = groupBy (==)
+{-# INLINE group #-}
+
+-- | The 'groupBy' function is the non-overloaded version of 'group'.
+groupBy :: (Char -> Char -> Bool) -> Text -> [Text]
+groupBy _ Empty = []
+-- 'T.unsafeHead'/'T.unsafeTail' rely on the invariant that a 'Chunk'
+-- is never empty (assumed from the chunk constructors -- confirm in
+-- Data.Text.Internal.Lazy). Each group spans chunk boundaries via
+-- 'span' over the remaining lazy text.
+groupBy eq (Chunk t ts) = cons x ys : groupBy eq zs
+ where (ys,zs) = span (eq x) xs
+ x = T.unsafeHead t
+ xs = chunk (T.unsafeTail t) ts
+
+-- | /O(n)/ Return all initial segments of the given 'Text',
+-- shortest first.
+inits :: Text -> [Text]
+-- The empty prefix is prepended once; within a chunk, the strict
+-- 'T.inits' supplies the partial prefixes (its own empty head is
+-- dropped with 'L.tail'), then the full chunk is prepended to every
+-- prefix of the remaining chunks.
+inits = (Empty :) . inits'
+ where inits' Empty = []
+ inits' (Chunk t ts) = L.map (\t' -> Chunk t' Empty) (L.tail (T.inits t))
+ ++ L.map (Chunk t) (inits' ts)
+
+-- | /O(n)/ Return all final segments of the given 'Text', longest
+-- first.
+tails :: Text -> [Text]
+tails Empty = Empty : []
+-- Advances one character at a time inside the head chunk with
+-- 'T.unsafeTail'; when the chunk is down to one character the next
+-- suffix starts at the following chunk.
+tails ts@(Chunk t ts')
+ | T.length t == 1 = ts : tails ts'
+ | otherwise = ts : tails (Chunk (T.unsafeTail t) ts')
+
+-- $split
+--
+-- Splitting functions in this library do not perform character-wise
+-- copies to create substrings; they just construct new 'Text's that
+-- are slices of the original.
+
+-- | /O(m+n)/ Break a 'Text' into pieces separated by the first 'Text'
+-- argument (which cannot be an empty string), consuming the
+-- delimiter. An empty delimiter is invalid, and will cause an error
+-- to be raised.
+--
+-- Examples:
+--
+-- > splitOn "\r\n" "a\r\nb\r\nd\r\ne" == ["a","b","d","e"]
+-- > splitOn "aaa" "aaaXaaaXaaaXaaa" == ["","X","X","X",""]
+-- > splitOn "x" "x" == ["",""]
+--
+-- and
+--
+-- > intercalate s . splitOn s == id
+-- > splitOn (singleton c) == split (==c)
+--
+-- (Note: the string @s@ to split on above cannot be empty.)
+--
+-- This function is strict in its first argument, and lazy in its
+-- second.
+--
+-- In (unlikely) bad cases, this function's time complexity degrades
+-- towards /O(n*m)/.
+splitOn :: Text
+ -- ^ String to split on. If this string is empty, an error
+ -- will occur.
+ -> Text
+ -- ^ Input text.
+ -> [Text]
+-- @l@ is the pattern's total length in code units (summed over its
+-- chunks); match offsets from 'indices' are code-unit positions, so
+-- each piece is cut with 'splitAtWord' and the delimiter skipped with
+-- 'dropWords'. @i@ tracks how many code units are already consumed.
+splitOn pat src
+ | null pat = emptyError "splitOn"
+ | isSingleton pat = split (== head pat) src
+ | otherwise = go 0 (indices pat src) src
+ where
+ go _ [] cs = [cs]
+ go !i (x:xs) cs = let h :*: t = splitAtWord (x-i) cs
+ in h : go (x+l) xs (dropWords l t)
+ l = foldlChunks (\a (T.Text _ _ b) -> a + fromIntegral b) 0 pat
+{-# INLINE [1] splitOn #-}
+
+{-# RULES
+"LAZY TEXT splitOn/singleton -> split/==" [~1] forall c t.
+ splitOn (singleton c) t = split (==c) t
+ #-}
+
+-- | /O(n)/ Splits a 'Text' into components delimited by separators,
+-- where the predicate returns True for a separator element. The
+-- resulting components do not contain the separators. Two adjacent
+-- separators result in an empty component in the output. eg.
+--
+-- > split (=='a') "aabbaca" == ["","","bb","c",""]
+-- > split (=='a') [] == [""]
+split :: (Char -> Bool) -> Text -> [Text]
+split _ Empty = [Empty]
+-- 'comb' accumulates strict pieces (reversed) until a separator is
+-- known to close a component; a single remaining piece may continue
+-- into the next chunk, so it is carried over rather than emitted.
+split p (Chunk t0 ts0) = comb [] (T.split p t0) ts0
+ where comb acc (s:[]) Empty = revChunks (s:acc) : []
+ comb acc (s:[]) (Chunk t ts) = comb (s:acc) (T.split p t) ts
+ comb acc (s:ss) ts = revChunks (s:acc) : comb [] ss ts
+ comb _ [] _ = impossibleError "split"
+{-# INLINE split #-}
+
+-- | /O(n)/ Splits a 'Text' into components of length @k@. The last
+-- element may be shorter than the other chunks, depending on the
+-- length of the input. Examples:
+--
+-- > chunksOf 3 "foobarbaz" == ["foo","bar","baz"]
+-- > chunksOf 4 "haskell.org" == ["hask","ell.","org"]
+chunksOf :: Int64 -> Text -> [Text]
+-- Repeatedly 'splitAt's off a k-character piece; an empty piece
+-- signals the end of the input.
+chunksOf k = go
+ where
+ go t = case splitAt k t of
+ (a,b) | null a -> []
+ | otherwise -> a : go b
+{-# INLINE chunksOf #-}
+
+-- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at
+-- newline 'Char's. The resulting strings do not contain newlines.
+lines :: Text -> [Text]
+lines Empty = []
+-- 'break' stops at the newline; 'tail' skips it. A trailing newline
+-- therefore does not produce a final empty line.
+lines t = let (l,t') = break ((==) '\n') t
+ in l : if null t' then []
+ else lines (tail t')
+
+-- | /O(n)/ Breaks a 'Text' up into a list of words, delimited by 'Char's
+-- representing white space.
+words :: Text -> [Text]
+-- 'split' produces empty pieces between adjacent spaces; filter them.
+words = L.filter (not . null) . split isSpace
+{-# INLINE words #-}
+
+-- | /O(n)/ Joins lines, after appending a terminating newline to
+-- each.
+unlines :: [Text] -> Text
+unlines = concat . L.map (`snoc` '\n')
+{-# INLINE unlines #-}
+
+-- | /O(n)/ Joins words using single space characters.
+unwords :: [Text] -> Text
+unwords = intercalate (singleton ' ')
+{-# INLINE unwords #-}
+
+-- | /O(n)/ The 'isPrefixOf' function takes two 'Text's and returns
+-- 'True' iff the first is a prefix of the second. Subject to fusion.
+isPrefixOf :: Text -> Text -> Bool
+isPrefixOf Empty _ = True
+isPrefixOf _ Empty = False
+-- Chunk boundaries of the two texts need not line up, so the longer
+-- head chunk is split at the shorter one's length and the leftover
+-- pushed back for the recursive comparison.
+isPrefixOf (Chunk x xs) (Chunk y ys)
+ | lx == ly = x == y && isPrefixOf xs ys
+ | lx < ly = x == yh && isPrefixOf xs (Chunk yt ys)
+ | otherwise = xh == y && isPrefixOf (Chunk xt xs) ys
+ where (xh,xt) = T.splitAt ly x
+ (yh,yt) = T.splitAt lx y
+ lx = T.length x
+ ly = T.length y
+{-# INLINE [1] isPrefixOf #-}
+
+{-# RULES
+"LAZY TEXT isPrefixOf -> fused" [~1] forall s t.
+ isPrefixOf s t = S.isPrefixOf (stream s) (stream t)
+"LAZY TEXT isPrefixOf -> unfused" [1] forall s t.
+ S.isPrefixOf (stream s) (stream t) = isPrefixOf s t
+ #-}
+
+-- | /O(n)/ The 'isSuffixOf' function takes two 'Text's and returns
+-- 'True' iff the first is a suffix of the second.
+isSuffixOf :: Text -> Text -> Bool
+-- Forces and reverses both inputs in full before comparing.
+isSuffixOf x y = reverse x `isPrefixOf` reverse y
+{-# INLINE isSuffixOf #-}
+-- TODO: a better implementation
+
+-- | /O(n+m)/ The 'isInfixOf' function takes two 'Text's and returns
+-- 'True' iff the first is contained, wholly and intact, anywhere
+-- within the second.
+--
+-- This function is strict in its first argument, and lazy in its
+-- second.
+--
+-- In (unlikely) bad cases, this function's time complexity degrades
+-- towards /O(n*m)/.
+isInfixOf :: Text -> Text -> Bool
+-- Three strategies: empty needle always matches, a one-character
+-- needle uses a stream membership test, and the general case asks
+-- 'indices' whether any match offset exists.
+isInfixOf needle haystack
+ | null needle = True
+ | isSingleton needle = S.elem (head needle) . S.stream $ haystack
+ | otherwise = not . L.null . indices needle $ haystack
+{-# INLINE [1] isInfixOf #-}
+
+{-# RULES
+"LAZY TEXT isInfixOf/singleton -> S.elem/S.stream" [~1] forall n h.
+ isInfixOf (singleton n) h = S.elem n (S.stream h)
+ #-}
+
+-------------------------------------------------------------------------------
+-- * View patterns
+
+-- | /O(n)/ Return the suffix of the second string if its prefix
+-- matches the entire first string.
+--
+-- Examples:
+--
+-- > stripPrefix "foo" "foobar" == Just "bar"
+-- > stripPrefix "" "baz" == Just "baz"
+-- > stripPrefix "foo" "quux" == Nothing
+--
+-- This is particularly useful with the @ViewPatterns@ extension to
+-- GHC, as follows:
+--
+-- > {-# LANGUAGE ViewPatterns #-}
+-- > import Data.Text.Lazy as T
+-- >
+-- > fnordLength :: Text -> Int
+-- > fnordLength (stripPrefix "fnord" -> Just suf) = T.length suf
+-- > fnordLength _ = -1
+stripPrefix :: Text -> Text -> Maybe Text
+-- The prefix matches fully exactly when 'commonPrefixes' leaves no
+-- remainder of @p@ (the @c@ component is null).
+stripPrefix p t
+ | null p = Just t
+ | otherwise = case commonPrefixes p t of
+ Just (_,c,r) | null c -> Just r
+ _ -> Nothing
+
+-- | /O(n)/ Find the longest non-empty common prefix of two strings
+-- and return it, along with the suffixes of each string at which they
+-- no longer match.
+--
+-- If the strings do not have a common prefix or either one is empty,
+-- this function returns 'Nothing'.
+--
+-- Examples:
+--
+-- > commonPrefixes "foobar" "fooquux" == Just ("foo","bar","quux")
+-- > commonPrefixes "veeble" "fetzer" == Nothing
+-- > commonPrefixes "" "baz" == Nothing
+commonPrefixes :: Text -> Text -> Maybe (Text,Text,Text)
+commonPrefixes Empty _ = Nothing
+commonPrefixes _ Empty = Nothing
+-- @ps@ accumulates matched strict prefixes in reverse. When one
+-- side's chunk is fully consumed (its leftover is null) the match may
+-- continue into that side's next chunk; otherwise the divergence
+-- point has been found.
+commonPrefixes a0 b0 = Just (go a0 b0 [])
+ where
+ go t0@(Chunk x xs) t1@(Chunk y ys) ps
+ = case T.commonPrefixes x y of
+ Just (p,a,b)
+ | T.null a -> go xs (chunk b ys) (p:ps)
+ | T.null b -> go (chunk a xs) ys (p:ps)
+ | otherwise -> (fromChunks (L.reverse (p:ps)),chunk a xs, chunk b ys)
+ Nothing -> (fromChunks (L.reverse ps),t0,t1)
+ go t0 t1 ps = (fromChunks (L.reverse ps),t0,t1)
+
+-- | /O(n)/ Return the prefix of the second string if its suffix
+-- matches the entire first string.
+--
+-- Examples:
+--
+-- > stripSuffix "bar" "foobar" == Just "foo"
+-- > stripSuffix "" "baz" == Just "baz"
+-- > stripSuffix "foo" "quux" == Nothing
+--
+-- This is particularly useful with the @ViewPatterns@ extension to
+-- GHC, as follows:
+--
+-- > {-# LANGUAGE ViewPatterns #-}
+-- > import Data.Text.Lazy as T
+-- >
+-- > quuxLength :: Text -> Int
+-- > quuxLength (stripSuffix "quux" -> Just pre) = T.length pre
+-- > quuxLength _ = -1
+stripSuffix :: Text -> Text -> Maybe Text
+-- Reduces to 'stripPrefix' on the reversed strings.
+stripSuffix p t = reverse `fmap` stripPrefix (reverse p) (reverse t)
+
+-- | /O(n)/ 'filter', applied to a predicate and a 'Text',
+-- returns a 'Text' containing those characters that satisfy the
+-- predicate.
+filter :: (Char -> Bool) -> Text -> Text
+-- Single pass via the stream transformer.
+filter p t = unstream (S.filter p (stream t))
+{-# INLINE filter #-}
+
+-- | /O(n)/ The 'find' function takes a predicate and a 'Text', and
+-- returns the first element in matching the predicate, or 'Nothing'
+-- if there is no such element.
+find :: (Char -> Bool) -> Text -> Maybe Char
+-- Stream-based; stops at the first matching character.
+find p t = S.findBy p (stream t)
+{-# INLINE find #-}
+
+-- | /O(n)/ The 'partition' function takes a predicate and a 'Text',
+-- and returns the pair of 'Text's with elements which do and do not
+-- satisfy the predicate, respectively; i.e.
+--
+-- > partition p t == (filter p t, filter (not . p) t)
+partition :: (Char -> Bool) -> Text -> (Text, Text)
+-- Implemented literally as two independent filters (two passes).
+partition p t = (filter p t, filter (not . p) t)
+{-# INLINE partition #-}
+
+-- | /O(n)/ 'Text' index (subscript) operator, starting from 0.
+index :: Text -> Int64 -> Char
+index t n = S.index (stream t) n
+{-# INLINE index #-}
+
+-- | /O(n+m)/ The 'count' function returns the number of times the
+-- query string appears in the given 'Text'. An empty query string is
+-- invalid, and will cause an error to be raised.
+--
+-- In (unlikely) bad cases, this function's time complexity degrades
+-- towards /O(n*m)/.
+count :: Text -> Text -> Int64
+-- Counts the match offsets from 'indices' with a strict accumulator.
+count pat src
+ | null pat = emptyError "count"
+ | otherwise = go 0 (indices pat src)
+ where go !n [] = n
+ go !n (_:xs) = go (n+1) xs
+{-# INLINE [1] count #-}
+
+{-# RULES
+"LAZY TEXT count/singleton -> countChar" [~1] forall c t.
+ count (singleton c) t = countChar c t
+ #-}
+
+-- | /O(n)/ The 'countChar' function returns the number of times the
+-- query element appears in the given 'Text'. Subject to fusion.
+countChar :: Char -> Text -> Int64
+countChar c t = S.countChar c (stream t)
+
+-- | /O(n)/ 'zip' takes two 'Text's and returns a list of
+-- corresponding pairs of bytes. If one input 'Text' is short,
+-- excess elements of the longer 'Text' are discarded. This is
+-- equivalent to a pair of 'unpack' operations.
+zip :: Text -> Text -> [(Char,Char)]
+-- Zips the two streams with the tupling function, then materialises
+-- the result stream as a list.
+zip a b = S.unstreamList $ S.zipWith (,) (stream a) (stream b)
+{-# INLINE [0] zip #-}
+
+-- | /O(n)/ 'zipWith' generalises 'zip' by zipping with the function
+-- given as the first argument, instead of a tupling function.
+-- Performs replacement on invalid scalar values.
+zipWith :: (Char -> Char -> Char) -> Text -> Text -> Text
+-- 'g' wraps the user function with 'safe' so surrogate code points
+-- produced by @f@ are replaced before being stored.
+zipWith f t1 t2 = unstream (S.zipWith g (stream t1) (stream t2))
+ where g a b = safe (f a b)
+{-# INLINE [0] zipWith #-}
+
+-- Rebuild a lazy 'Text' from a reversed list of strict chunks.
+revChunks :: [T.Text] -> Text
+revChunks = L.foldl' (flip chunk) Empty
+
+-- Raise the standard "empty input" error for the named function.
+emptyError :: String -> a
+emptyError fun = P.error ("Data.Text.Lazy." ++ fun ++ ": empty input")
+
+-- Raise an error for a case the named function's logic rules out.
+impossibleError :: String -> a
+impossibleError fun = P.error ("Data.Text.Lazy." ++ fun ++ ": impossible case")
diff --git a/Data/Text/Lazy/Builder.hs b/Data/Text/Lazy/Builder.hs
new file mode 100644
index 0000000..6c4c24a
--- /dev/null
+++ b/Data/Text/Lazy/Builder.hs
@@ -0,0 +1,57 @@
+{-# LANGUAGE BangPatterns, CPP, Rank2Types #-}
+#if __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+
+-----------------------------------------------------------------------------
+-- |
+-- Module : Data.Text.Lazy.Builder
+-- Copyright : (c) 2013 Bryan O'Sullivan
+-- (c) 2010 Johan Tibell
+-- License : BSD-style (see LICENSE)
+--
+-- Maintainer : Johan Tibell <johan.tibell@gmail.com>
+-- Portability : portable to Hugs and GHC
+--
+-- Efficient construction of lazy @Text@ values. The principal
+-- operations on a @Builder@ are @singleton@, @fromText@, and
+-- @fromLazyText@, which construct new builders, and 'mappend', which
+-- concatenates two builders.
+--
+-- To get maximum performance when building lazy @Text@ values using a
+-- builder, associate @mappend@ calls to the right. For example,
+-- prefer
+--
+-- > singleton 'a' `mappend` (singleton 'b' `mappend` singleton 'c')
+--
+-- to
+--
+-- > singleton 'a' `mappend` singleton 'b' `mappend` singleton 'c'
+--
+-- as the latter associates @mappend@ to the left. Or, equivalently,
+-- prefer
+--
+-- > singleton 'a' <> singleton 'b' <> singleton 'c'
+--
+-- since the '<>' from recent versions of 'Data.Monoid' associates
+-- to the right.
+
+-----------------------------------------------------------------------------
+
+module Data.Text.Lazy.Builder
+ ( -- * The Builder type
+ Builder
+ , toLazyText
+ , toLazyTextWith
+
+ -- * Constructing Builders
+ , singleton
+ , fromText
+ , fromLazyText
+ , fromString
+
+ -- * Flushing the buffer state
+ , flush
+ ) where
+
+import Data.Text.Internal.Builder
diff --git a/Data/Text/Lazy/Builder/Int.hs b/Data/Text/Lazy/Builder/Int.hs
new file mode 100644
index 0000000..b0fb042
--- /dev/null
+++ b/Data/Text/Lazy/Builder/Int.hs
@@ -0,0 +1,264 @@
+{-# LANGUAGE BangPatterns, CPP, MagicHash, RankNTypes, ScopedTypeVariables,
+ UnboxedTuples #-}
+#if __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+
+-- Module: Data.Text.Lazy.Builder.Int
+-- Copyright: (c) 2013 Bryan O'Sullivan
+-- (c) 2011 MailRank, Inc.
+-- License: BSD-style
+-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
+-- Portability: portable
+--
+-- Efficiently write an integral value to a 'Builder'.
+
+module Data.Text.Lazy.Builder.Int
+ (
+ decimal
+ , hexadecimal
+ ) where
+
+import Data.Int (Int8, Int16, Int32, Int64)
+import Data.Monoid (mempty)
+import qualified Data.ByteString.Unsafe as B
+import Data.Text.Internal.Builder.Functions ((<>), i2d)
+import Data.Text.Internal.Builder
+import Data.Text.Internal.Builder.Int.Digits (digits)
+import Data.Text.Array
+import Data.Word (Word, Word8, Word16, Word32, Word64)
+import GHC.Base (quotInt, remInt)
+import GHC.Num (quotRemInteger)
+import GHC.Types (Int(..))
+import Control.Monad.ST
+#if MIN_VERSION_base(4,11,0)
+import Prelude hiding ((<>))
+#endif
+
+#ifdef __GLASGOW_HASKELL__
+# if defined(INTEGER_GMP)
+import GHC.Integer.GMP.Internals (Integer(S#))
+# elif defined(INTEGER_SIMPLE)
+import GHC.Integer
+# else
+# error "You need to use either GMP or integer-simple."
+# endif
+#endif
+
+#if defined(INTEGER_GMP) || defined(INTEGER_SIMPLE)
+# define PAIR(a,b) (# a,b #)
+#else
+# define PAIR(a,b) (a,b)
+#endif
+
+decimal :: Integral a => a -> Builder
+{-# RULES "decimal/Int8" decimal = boundedDecimal :: Int8 -> Builder #-}
+{-# RULES "decimal/Int" decimal = boundedDecimal :: Int -> Builder #-}
+{-# RULES "decimal/Int16" decimal = boundedDecimal :: Int16 -> Builder #-}
+{-# RULES "decimal/Int32" decimal = boundedDecimal :: Int32 -> Builder #-}
+{-# RULES "decimal/Int64" decimal = boundedDecimal :: Int64 -> Builder #-}
+{-# RULES "decimal/Word" decimal = positive :: Data.Word.Word -> Builder #-}
+{-# RULES "decimal/Word8" decimal = positive :: Word8 -> Builder #-}
+{-# RULES "decimal/Word16" decimal = positive :: Word16 -> Builder #-}
+{-# RULES "decimal/Word32" decimal = positive :: Word32 -> Builder #-}
+{-# RULES "decimal/Word64" decimal = positive :: Word64 -> Builder #-}
+{-# RULES "decimal/Integer" decimal = integer 10 :: Integer -> Builder #-}
+decimal i = decimal' (<= -128) i
+{-# NOINLINE decimal #-}
+
+boundedDecimal :: (Integral a, Bounded a) => a -> Builder
+{-# SPECIALIZE boundedDecimal :: Int -> Builder #-}
+{-# SPECIALIZE boundedDecimal :: Int8 -> Builder #-}
+{-# SPECIALIZE boundedDecimal :: Int16 -> Builder #-}
+{-# SPECIALIZE boundedDecimal :: Int32 -> Builder #-}
+{-# SPECIALIZE boundedDecimal :: Int64 -> Builder #-}
+boundedDecimal i = decimal' (== minBound) i
+
+decimal' :: (Integral a) => (a -> Bool) -> a -> Builder
+{-# INLINE decimal' #-}
+decimal' p i
+ | i < 0 = if p i
+ then let (q, r) = i `quotRem` 10
+ qq = -q
+ !n = countDigits qq
+ in writeN (n + 2) $ \marr off -> do
+ unsafeWrite marr off minus
+ posDecimal marr (off+1) n qq
+ unsafeWrite marr (off+n+1) (i2w (-r))
+ else let j = -i
+ !n = countDigits j
+ in writeN (n + 1) $ \marr off ->
+ unsafeWrite marr off minus >> posDecimal marr (off+1) n j
+ | otherwise = positive i
+
+positive :: (Integral a) => a -> Builder
+{-# SPECIALIZE positive :: Int -> Builder #-}
+{-# SPECIALIZE positive :: Int8 -> Builder #-}
+{-# SPECIALIZE positive :: Int16 -> Builder #-}
+{-# SPECIALIZE positive :: Int32 -> Builder #-}
+{-# SPECIALIZE positive :: Int64 -> Builder #-}
+{-# SPECIALIZE positive :: Word -> Builder #-}
+{-# SPECIALIZE positive :: Word8 -> Builder #-}
+{-# SPECIALIZE positive :: Word16 -> Builder #-}
+{-# SPECIALIZE positive :: Word32 -> Builder #-}
+{-# SPECIALIZE positive :: Word64 -> Builder #-}
+positive i
+ | i < 10 = writeN 1 $ \marr off -> unsafeWrite marr off (i2w i)
+ | otherwise = let !n = countDigits i
+ in writeN n $ \marr off -> posDecimal marr off n i
+
+posDecimal :: (Integral a) =>
+ forall s. MArray s -> Int -> Int -> a -> ST s ()
+{-# INLINE posDecimal #-}
+posDecimal marr off0 ds v0 = go (off0 + ds - 1) v0
+ where go off v
+ | v >= 100 = do
+ let (q, r) = v `quotRem` 100
+ write2 off r
+ go (off - 2) q
+ | v < 10 = unsafeWrite marr off (i2w v)
+ | otherwise = write2 off v
+ write2 off i0 = do
+ let i = fromIntegral i0; j = i + i
+ unsafeWrite marr off $ get (j + 1)
+ unsafeWrite marr (off - 1) $ get j
+ get = fromIntegral . B.unsafeIndex digits
+
+minus, zero :: Word8
+{-# INLINE minus #-}
+{-# INLINE zero #-}
+minus = 45
+zero = 48
+
+i2w :: (Integral a) => a -> Word8
+{-# INLINE i2w #-}
+i2w v = zero + fromIntegral v
+
+countDigits :: (Integral a) => a -> Int
+{-# INLINE countDigits #-}
+countDigits v0
+ | fromIntegral v64 == v0 = go 1 v64
+ | otherwise = goBig 1 (fromIntegral v0)
+ where v64 = fromIntegral v0
+ goBig !k (v :: Integer)
+ | v > big = goBig (k + 19) (v `quot` big)
+ | otherwise = go k (fromIntegral v)
+ big = 10000000000000000000
+ go !k (v :: Word64)
+ | v < 10 = k
+ | v < 100 = k + 1
+ | v < 1000 = k + 2
+ | v < 1000000000000 =
+ k + if v < 100000000
+ then if v < 1000000
+ then if v < 10000
+ then 3
+ else 4 + fin v 100000
+ else 6 + fin v 10000000
+ else if v < 10000000000
+ then 8 + fin v 1000000000
+ else 10 + fin v 100000000000
+ | otherwise = go (k + 12) (v `quot` 1000000000000)
+ fin v n = if v >= n then 1 else 0
+
+hexadecimal :: Integral a => a -> Builder
+{-# SPECIALIZE hexadecimal :: Int -> Builder #-}
+{-# SPECIALIZE hexadecimal :: Int8 -> Builder #-}
+{-# SPECIALIZE hexadecimal :: Int16 -> Builder #-}
+{-# SPECIALIZE hexadecimal :: Int32 -> Builder #-}
+{-# SPECIALIZE hexadecimal :: Int64 -> Builder #-}
+{-# SPECIALIZE hexadecimal :: Word -> Builder #-}
+{-# SPECIALIZE hexadecimal :: Word8 -> Builder #-}
+{-# SPECIALIZE hexadecimal :: Word16 -> Builder #-}
+{-# SPECIALIZE hexadecimal :: Word32 -> Builder #-}
+{-# SPECIALIZE hexadecimal :: Word64 -> Builder #-}
+{-# RULES "hexadecimal/Integer"
+ hexadecimal = hexInteger :: Integer -> Builder #-}
+hexadecimal i
+ | i < 0 = error hexErrMsg
+ | otherwise = go i
+ where
+ go n | n < 16 = hexDigit n
+ | otherwise = go (n `quot` 16) <> hexDigit (n `rem` 16)
+{-# NOINLINE[0] hexadecimal #-}
+
+hexInteger :: Integer -> Builder
+hexInteger i
+ | i < 0 = error hexErrMsg
+ | otherwise = integer 16 i
+
+hexErrMsg :: String
+hexErrMsg = "Data.Text.Lazy.Builder.Int.hexadecimal: applied to negative number"
+
+hexDigit :: Integral a => a -> Builder
+hexDigit n
+ | n <= 9 = singleton $! i2d (fromIntegral n)
+ | otherwise = singleton $! toEnum (fromIntegral n + 87)
+{-# INLINE hexDigit #-}
+
+data T = T !Integer !Int
+
+integer :: Int -> Integer -> Builder
+#ifdef INTEGER_GMP
+integer 10 (S# i#) = decimal (I# i#)
+integer 16 (S# i#) = hexadecimal (I# i#)
+#endif
+integer base i
+ | i < 0 = singleton '-' <> go (-i)
+ | otherwise = go i
+ where
+ go n | n < maxInt = int (fromInteger n)
+ | otherwise = putH (splitf (maxInt * maxInt) n)
+
+ splitf p n
+ | p > n = [n]
+ | otherwise = splith p (splitf (p*p) n)
+
+ splith p (n:ns) = case n `quotRemInteger` p of
+ PAIR(q,r) | q > 0 -> q : r : splitb p ns
+ | otherwise -> r : splitb p ns
+ splith _ _ = error "splith: the impossible happened."
+
+ splitb p (n:ns) = case n `quotRemInteger` p of
+ PAIR(q,r) -> q : r : splitb p ns
+ splitb _ _ = []
+
+ T maxInt10 maxDigits10 =
+ until ((>mi) . (*10) . fstT) (\(T n d) -> T (n*10) (d+1)) (T 10 1)
+ where mi = fromIntegral (maxBound :: Int)
+ T maxInt16 maxDigits16 =
+ until ((>mi) . (*16) . fstT) (\(T n d) -> T (n*16) (d+1)) (T 16 1)
+ where mi = fromIntegral (maxBound :: Int)
+
+ fstT (T a _) = a
+
+ maxInt | base == 10 = maxInt10
+ | otherwise = maxInt16
+ maxDigits | base == 10 = maxDigits10
+ | otherwise = maxDigits16
+
+ putH (n:ns) = case n `quotRemInteger` maxInt of
+ PAIR(x,y)
+ | q > 0 -> int q <> pblock r <> putB ns
+ | otherwise -> int r <> putB ns
+ where q = fromInteger x
+ r = fromInteger y
+ putH _ = error "putH: the impossible happened"
+
+ putB (n:ns) = case n `quotRemInteger` maxInt of
+ PAIR(x,y) -> pblock q <> pblock r <> putB ns
+ where q = fromInteger x
+ r = fromInteger y
+ putB _ = Data.Monoid.mempty
+
+ int :: Int -> Builder
+ int x | base == 10 = decimal x
+ | otherwise = hexadecimal x
+
+ pblock = loop maxDigits
+ where
+ loop !d !n
+ | d == 1 = hexDigit n
+ | otherwise = loop (d-1) q <> hexDigit r
+ where q = n `quotInt` base
+ r = n `remInt` base
diff --git a/Data/Text/Lazy/Builder/RealFloat.hs b/Data/Text/Lazy/Builder/RealFloat.hs
new file mode 100644
index 0000000..3853c6e
--- /dev/null
+++ b/Data/Text/Lazy/Builder/RealFloat.hs
@@ -0,0 +1,245 @@
+{-# LANGUAGE CPP, OverloadedStrings #-}
+#if __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+
+-- |
+-- Module: Data.Text.Lazy.Builder.RealFloat
+-- Copyright: (c) The University of Glasgow 1994-2002
+-- License: see libraries/base/LICENSE
+--
+-- Write a floating point value to a 'Builder'.
+
+module Data.Text.Lazy.Builder.RealFloat
+ (
+ FPFormat(..)
+ , realFloat
+ , formatRealFloat
+ ) where
+
+import Data.Array.Base (unsafeAt)
+import Data.Array.IArray
+import Data.Text.Internal.Builder.Functions ((<>), i2d)
+import Data.Text.Lazy.Builder.Int (decimal)
+import Data.Text.Internal.Builder.RealFloat.Functions (roundTo)
+import Data.Text.Lazy.Builder
+import qualified Data.Text as T
+#if MIN_VERSION_base(4,11,0)
+import Prelude hiding ((<>))
+#endif
+
+-- | Control the rendering of floating point numbers.
+data FPFormat = Exponent
+ -- ^ Scientific notation (e.g. @2.3e123@).
+ | Fixed
+ -- ^ Standard decimal notation.
+ | Generic
+ -- ^ Use decimal notation for values between @0.1@ and
+ -- @9,999,999@, and scientific notation otherwise.
+ deriving (Enum, Read, Show)
+
+-- | Show a signed 'RealFloat' value to full precision,
+-- using standard decimal notation for arguments whose absolute value lies
+-- between @0.1@ and @9,999,999@, and scientific notation otherwise.
+realFloat :: (RealFloat a) => a -> Builder
+{-# SPECIALIZE realFloat :: Float -> Builder #-}
+{-# SPECIALIZE realFloat :: Double -> Builder #-}
+realFloat x = formatRealFloat Generic Nothing x
+
+formatRealFloat :: (RealFloat a) =>
+ FPFormat
+ -> Maybe Int -- ^ Number of decimal places to render.
+ -> a
+ -> Builder
+{-# SPECIALIZE formatRealFloat :: FPFormat -> Maybe Int -> Float -> Builder #-}
+{-# SPECIALIZE formatRealFloat :: FPFormat -> Maybe Int -> Double -> Builder #-}
+formatRealFloat fmt decs x
+ | isNaN x = "NaN"
+ | isInfinite x = if x < 0 then "-Infinity" else "Infinity"
+ | x < 0 || isNegativeZero x = singleton '-' <> doFmt fmt (floatToDigits (-x))
+ | otherwise = doFmt fmt (floatToDigits x)
+ where
+ doFmt format (is, e) =
+ let ds = map i2d is in
+ case format of
+ Generic ->
+ doFmt (if e < 0 || e > 7 then Exponent else Fixed)
+ (is,e)
+ Exponent ->
+ case decs of
+ Nothing ->
+ let show_e' = decimal (e-1) in
+ case ds of
+ "0" -> "0.0e0"
+ [d] -> singleton d <> ".0e" <> show_e'
+ (d:ds') -> singleton d <> singleton '.' <> fromString ds' <> singleton 'e' <> show_e'
+ [] -> error "formatRealFloat/doFmt/Exponent: []"
+ Just dec ->
+ let dec' = max dec 1 in
+ case is of
+ [0] -> "0." <> fromText (T.replicate dec' "0") <> "e0"
+ _ ->
+ let
+ (ei,is') = roundTo (dec'+1) is
+ (d:ds') = map i2d (if ei > 0 then init is' else is')
+ in
+ singleton d <> singleton '.' <> fromString ds' <> singleton 'e' <> decimal (e-1+ei)
+ Fixed ->
+ let
+ mk0 ls = case ls of { "" -> "0" ; _ -> fromString ls}
+ in
+ case decs of
+ Nothing
+ | e <= 0 -> "0." <> fromText (T.replicate (-e) "0") <> fromString ds
+ | otherwise ->
+ let
+ f 0 s rs = mk0 (reverse s) <> singleton '.' <> mk0 rs
+ f n s "" = f (n-1) ('0':s) ""
+ f n s (r:rs) = f (n-1) (r:s) rs
+ in
+ f e "" ds
+ Just dec ->
+ let dec' = max dec 0 in
+ if e >= 0 then
+ let
+ (ei,is') = roundTo (dec' + e) is
+ (ls,rs) = splitAt (e+ei) (map i2d is')
+ in
+ mk0 ls <> (if null rs then "" else singleton '.' <> fromString rs)
+ else
+ let
+ (ei,is') = roundTo dec' (replicate (-e) 0 ++ is)
+ d:ds' = map i2d (if ei > 0 then is' else 0:is')
+ in
+ singleton d <> (if null ds' then "" else singleton '.' <> fromString ds')
+
+
+-- Based on "Printing Floating-Point Numbers Quickly and Accurately"
+-- by R.G. Burger and R.K. Dybvig in PLDI 96.
+-- This version uses a much slower logarithm estimator. It should be improved.
+
+-- | 'floatToDigits' takes a base and a non-negative 'RealFloat' number,
+-- and returns a list of digits and an exponent.
+-- In particular, if @x>=0@, and
+--
+-- > floatToDigits base x = ([d1,d2,...,dn], e)
+--
+-- then
+--
+-- (1) @n >= 1@
+--
+-- (2) @x = 0.d1d2...dn * (base**e)@
+--
+-- (3) @0 <= di <= base-1@
+
+floatToDigits :: (RealFloat a) => a -> ([Int], Int)
+{-# SPECIALIZE floatToDigits :: Float -> ([Int], Int) #-}
+{-# SPECIALIZE floatToDigits :: Double -> ([Int], Int) #-}
+floatToDigits 0 = ([0], 0)
+floatToDigits x =
+ let
+ (f0, e0) = decodeFloat x
+ (minExp0, _) = floatRange x
+ p = floatDigits x
+ b = floatRadix x
+ minExp = minExp0 - p -- the real minimum exponent
+ -- Haskell requires that f be adjusted so denormalized numbers
+ -- will have an impossibly low exponent. Adjust for this.
+ (f, e) =
+ let n = minExp - e0 in
+ if n > 0 then (f0 `quot` (expt b n), e0+n) else (f0, e0)
+ (r, s, mUp, mDn) =
+ if e >= 0 then
+ let be = expt b e in
+ if f == expt b (p-1) then
+ (f*be*b*2, 2*b, be*b, be) -- according to Burger and Dybvig
+ else
+ (f*be*2, 2, be, be)
+ else
+ if e > minExp && f == expt b (p-1) then
+ (f*b*2, expt b (-e+1)*2, b, 1)
+ else
+ (f*2, expt b (-e)*2, 1, 1)
+ k :: Int
+ k =
+ let
+ k0 :: Int
+ k0 =
+ if b == 2 then
+ -- logBase 10 2 is very slightly larger than 8651/28738
+ -- (about 5.3558e-10), so if log x >= 0, the approximation
+ -- k1 is too small, hence we add one and need one fixup step less.
+ -- If log x < 0, the approximation errs rather on the high side.
+ -- That is usually more than compensated for by ignoring the
+ -- fractional part of logBase 2 x, but when x is a power of 1/2
+ -- or slightly larger and the exponent is a multiple of the
+ -- denominator of the rational approximation to logBase 10 2,
+ -- k1 is larger than logBase 10 x. If k1 > 1 + logBase 10 x,
+ -- we get a leading zero-digit we don't want.
+ -- With the approximation 3/10, this happened for
+ -- 0.5^1030, 0.5^1040, ..., 0.5^1070 and values close above.
+ -- The approximation 8651/28738 guarantees k1 < 1 + logBase 10 x
+ -- for IEEE-ish floating point types with exponent fields
+ -- <= 17 bits and mantissae of several thousand bits, earlier
+ -- convergents to logBase 10 2 would fail for long double.
+ -- Using quot instead of div is a little faster and requires
+ -- fewer fixup steps for negative lx.
+ let lx = p - 1 + e0
+ k1 = (lx * 8651) `quot` 28738
+ in if lx >= 0 then k1 + 1 else k1
+ else
+ -- f :: Integer, log :: Float -> Float,
+ -- ceiling :: Float -> Int
+ ceiling ((log (fromInteger (f+1) :: Float) +
+ fromIntegral e * log (fromInteger b)) /
+ log 10)
+--WAS: fromInt e * log (fromInteger b))
+
+ fixup n =
+ if n >= 0 then
+ if r + mUp <= expt 10 n * s then n else fixup (n+1)
+ else
+ if expt 10 (-n) * (r + mUp) <= s then n else fixup (n+1)
+ in
+ fixup k0
+
+ gen ds rn sN mUpN mDnN =
+ let
+ (dn, rn') = (rn * 10) `quotRem` sN
+ mUpN' = mUpN * 10
+ mDnN' = mDnN * 10
+ in
+ case (rn' < mDnN', rn' + mUpN' > sN) of
+ (True, False) -> dn : ds
+ (False, True) -> dn+1 : ds
+ (True, True) -> if rn' * 2 < sN then dn : ds else dn+1 : ds
+ (False, False) -> gen (dn:ds) rn' sN mUpN' mDnN'
+
+ rds =
+ if k >= 0 then
+ gen [] r (s * expt 10 k) mUp mDn
+ else
+ let bk = expt 10 (-k) in
+ gen [] (r * bk) s (mUp * bk) (mDn * bk)
+ in
+ (map fromIntegral (reverse rds), k)
+
+-- Exponentiation with a cache for the most common numbers.
+minExpt, maxExpt :: Int
+minExpt = 0
+maxExpt = 1100
+
+expt :: Integer -> Int -> Integer
+expt base n
+ | base == 2 && n >= minExpt && n <= maxExpt = expts `unsafeAt` n
+ | base == 10 && n <= maxExpt10 = expts10 `unsafeAt` n
+ | otherwise = base^n
+
+expts :: Array Int Integer
+expts = array (minExpt,maxExpt) [(n,2^n) | n <- [minExpt .. maxExpt]]
+
+maxExpt10 :: Int
+maxExpt10 = 324
+
+expts10 :: Array Int Integer
+expts10 = array (minExpt,maxExpt10) [(n,10^n) | n <- [minExpt .. maxExpt10]]
diff --git a/Data/Text/Lazy/Encoding.hs b/Data/Text/Lazy/Encoding.hs
new file mode 100644
index 0000000..ac1464d
--- /dev/null
+++ b/Data/Text/Lazy/Encoding.hs
@@ -0,0 +1,250 @@
+{-# LANGUAGE BangPatterns,CPP #-}
+#if __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+-- |
+-- Module : Data.Text.Lazy.Encoding
+-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Portability : portable
+--
+-- Functions for converting lazy 'Text' values to and from lazy
+-- 'ByteString', using several standard encodings.
+--
+-- To gain access to a much larger family of encodings, use the
+-- <http://hackage.haskell.org/package/text-icu text-icu package>.
+
+module Data.Text.Lazy.Encoding
+ (
+ -- * Decoding ByteStrings to Text
+ -- $strict
+ decodeASCII
+ , decodeLatin1
+ , decodeUtf8
+ , decodeUtf16LE
+ , decodeUtf16BE
+ , decodeUtf32LE
+ , decodeUtf32BE
+
+ -- ** Catchable failure
+ , decodeUtf8'
+
+ -- ** Controllable error handling
+ , decodeUtf8With
+ , decodeUtf16LEWith
+ , decodeUtf16BEWith
+ , decodeUtf32LEWith
+ , decodeUtf32BEWith
+
+ -- * Encoding Text to ByteStrings
+ , encodeUtf8
+ , encodeUtf16LE
+ , encodeUtf16BE
+ , encodeUtf32LE
+ , encodeUtf32BE
+
+ -- * Encoding Text using ByteString Builders
+ , encodeUtf8Builder
+ , encodeUtf8BuilderEscaped
+ ) where
+
+import Control.Exception (evaluate, try)
+import Data.Monoid (Monoid(..))
+import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
+import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks)
+import Data.Word (Word8)
+import qualified Data.ByteString as S
+import qualified Data.ByteString.Builder as B
+import qualified Data.ByteString.Builder.Extra as B (safeStrategy, toLazyByteStringWith)
+import qualified Data.ByteString.Builder.Prim as BP
+import qualified Data.ByteString.Lazy as B
+import qualified Data.ByteString.Lazy.Internal as B
+import qualified Data.ByteString.Unsafe as B
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as TE
+import qualified Data.Text.Internal.Lazy.Encoding.Fusion as E
+import qualified Data.Text.Internal.Lazy.Fusion as F
+import Data.Text.Unsafe (unsafeDupablePerformIO)
+
+-- $strict
+--
+-- All of the single-parameter functions for decoding bytestrings
+-- encoded in one of the Unicode Transformation Formats (UTF) operate
+-- in a /strict/ mode: each will throw an exception if given invalid
+-- input.
+--
+-- Each function has a variant, whose name is suffixed with -'With',
+-- that gives greater control over the handling of decoding errors.
+-- For instance, 'decodeUtf8' will throw an exception, but
+-- 'decodeUtf8With' allows the programmer to determine what to do on a
+-- decoding error.
+
+-- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
+-- encoded text.
+decodeASCII :: B.ByteString -> Text
+decodeASCII = decodeUtf8
+{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
+
+-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
+decodeLatin1 :: B.ByteString -> Text
+decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
+decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
+decodeUtf8With onErr (B.Chunk b0 bs0) =
+ case TE.streamDecodeUtf8With onErr b0 of
+ TE.Some t l f -> chunk t (go f l bs0)
+ where
+ go f0 _ (B.Chunk b bs) =
+ case f0 b of
+ TE.Some t l f -> chunk t (go f l bs)
+ go _ l _
+ | S.null l = empty
+ | otherwise = case onErr desc (Just (B.unsafeHead l)) of
+ Nothing -> empty
+ Just c -> Chunk (T.singleton c) Empty
+ desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
+decodeUtf8With _ _ = empty
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
+-- to be valid.
+--
+-- If the input contains any invalid UTF-8 data, an exception will be
+-- thrown that cannot be caught in pure code. For more control over
+-- the handling of invalid data, use 'decodeUtf8'' or
+-- 'decodeUtf8With'.
+decodeUtf8 :: B.ByteString -> Text
+decodeUtf8 = decodeUtf8With strictDecode
+{-# INLINE[0] decodeUtf8 #-}
+
+-- This rule seems to cause performance loss.
+{- RULES "LAZY STREAM stream/decodeUtf8' fusion" [1]
+ forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
+--
+-- If the input contains any invalid UTF-8 data, the relevant
+-- exception will be returned, otherwise the decoded text.
+--
+-- /Note/: this function is /not/ lazy, as it must decode its entire
+-- input before it can return a result. If you need lazy (streaming)
+-- decoding, use 'decodeUtf8With' in lenient mode.
+decodeUtf8' :: B.ByteString -> Either UnicodeException Text
+decodeUtf8' bs = unsafeDupablePerformIO $ do
+ let t = decodeUtf8 bs
+ try (evaluate (rnf t `seq` t))
+ where
+ rnf Empty = ()
+ rnf (Chunk _ ts) = rnf ts
+{-# INLINE decodeUtf8' #-}
+
+-- | Encode text using UTF-8 encoding.
+encodeUtf8 :: Text -> B.ByteString
+encodeUtf8 Empty = B.empty
+encodeUtf8 lt@(Chunk t _) =
+ B.toLazyByteStringWith strategy B.empty $ encodeUtf8Builder lt
+ where
+ -- To improve our small string performance, we use a strategy that
+ -- allocates a buffer that is guaranteed to be large enough for the
+ -- encoding of the first chunk, but not larger than the default
+ -- B.smallChunkSize. We clamp the firstChunkSize to ensure that we don't
+ -- generate too large buffers which hamper streaming.
+ firstChunkSize = min B.smallChunkSize (4 * (T.length t + 1))
+ strategy = B.safeStrategy firstChunkSize B.defaultChunkSize
+
+-- | Encode text to a ByteString 'B.Builder' using UTF-8 encoding.
+--
+-- @since 1.1.0.0
+encodeUtf8Builder :: Text -> B.Builder
+encodeUtf8Builder =
+ foldrChunks (\c b -> TE.encodeUtf8Builder c `mappend` b) Data.Monoid.mempty
+
+-- | Encode text using UTF-8 encoding and escape the ASCII characters using
+-- a 'BP.BoundedPrim'.
+--
+-- Use this function to implement efficient encoders for text-based formats
+-- like JSON or HTML.
+--
+-- @since 1.1.0.0
+{-# INLINE encodeUtf8BuilderEscaped #-}
+encodeUtf8BuilderEscaped :: BP.BoundedPrim Word8 -> Text -> B.Builder
+encodeUtf8BuilderEscaped prim =
+ foldrChunks (\c b -> TE.encodeUtf8BuilderEscaped prim c `mappend` b) mempty
+
+-- | Decode text from little endian UTF-16 encoding.
+decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text
+decodeUtf16LEWith onErr bs = F.unstream (E.streamUtf16LE onErr bs)
+{-# INLINE decodeUtf16LEWith #-}
+
+-- | Decode text from little endian UTF-16 encoding.
+--
+-- If the input contains any invalid little endian UTF-16 data, an
+-- exception will be thrown. For more control over the handling of
+-- invalid data, use 'decodeUtf16LEWith'.
+decodeUtf16LE :: B.ByteString -> Text
+decodeUtf16LE = decodeUtf16LEWith strictDecode
+{-# INLINE decodeUtf16LE #-}
+
+-- | Decode text from big endian UTF-16 encoding.
+decodeUtf16BEWith :: OnDecodeError -> B.ByteString -> Text
+decodeUtf16BEWith onErr bs = F.unstream (E.streamUtf16BE onErr bs)
+{-# INLINE decodeUtf16BEWith #-}
+
+-- | Decode text from big endian UTF-16 encoding.
+--
+-- If the input contains any invalid big endian UTF-16 data, an
+-- exception will be thrown. For more control over the handling of
+-- invalid data, use 'decodeUtf16BEWith'.
+decodeUtf16BE :: B.ByteString -> Text
+decodeUtf16BE = decodeUtf16BEWith strictDecode
+{-# INLINE decodeUtf16BE #-}
+
+-- | Encode text using little endian UTF-16 encoding.
+encodeUtf16LE :: Text -> B.ByteString
+encodeUtf16LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf16LE) [] txt)
+{-# INLINE encodeUtf16LE #-}
+
+-- | Encode text using big endian UTF-16 encoding.
+encodeUtf16BE :: Text -> B.ByteString
+encodeUtf16BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf16BE) [] txt)
+{-# INLINE encodeUtf16BE #-}
+
+-- | Decode text from little endian UTF-32 encoding.
+decodeUtf32LEWith :: OnDecodeError -> B.ByteString -> Text
+decodeUtf32LEWith onErr bs = F.unstream (E.streamUtf32LE onErr bs)
+{-# INLINE decodeUtf32LEWith #-}
+
+-- | Decode text from little endian UTF-32 encoding.
+--
+-- If the input contains any invalid little endian UTF-32 data, an
+-- exception will be thrown. For more control over the handling of
+-- invalid data, use 'decodeUtf32LEWith'.
+decodeUtf32LE :: B.ByteString -> Text
+decodeUtf32LE = decodeUtf32LEWith strictDecode
+{-# INLINE decodeUtf32LE #-}
+
+-- | Decode text from big endian UTF-32 encoding.
+decodeUtf32BEWith :: OnDecodeError -> B.ByteString -> Text
+decodeUtf32BEWith onErr bs = F.unstream (E.streamUtf32BE onErr bs)
+{-# INLINE decodeUtf32BEWith #-}
+
+-- | Decode text from big endian UTF-32 encoding.
+--
+-- If the input contains any invalid big endian UTF-32 data, an
+-- exception will be thrown. For more control over the handling of
+-- invalid data, use 'decodeUtf32BEWith'.
+decodeUtf32BE :: B.ByteString -> Text
+decodeUtf32BE = decodeUtf32BEWith strictDecode
+{-# INLINE decodeUtf32BE #-}
+
+-- | Encode text using little endian UTF-32 encoding.
+encodeUtf32LE :: Text -> B.ByteString
+encodeUtf32LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32LE) [] txt)
+{-# INLINE encodeUtf32LE #-}
+
+-- | Encode text using big endian UTF-32 encoding.
+encodeUtf32BE :: Text -> B.ByteString
+encodeUtf32BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32BE) [] txt)
+{-# INLINE encodeUtf32BE #-}
diff --git a/Data/Text/Lazy/IO.hs b/Data/Text/Lazy/IO.hs
new file mode 100644
index 0000000..d92396d
--- /dev/null
+++ b/Data/Text/Lazy/IO.hs
@@ -0,0 +1,195 @@
+{-# LANGUAGE BangPatterns, CPP, RecordWildCards #-}
+#if __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+-- |
+-- Module : Data.Text.Lazy.IO
+-- Copyright : (c) 2009, 2010 Bryan O'Sullivan,
+-- (c) 2009 Simon Marlow
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Portability : GHC
+--
+-- Efficient locale-sensitive support for lazy text I\/O.
+--
+-- Skip past the synopsis for some important notes on performance and
+-- portability across different versions of GHC.
+
+module Data.Text.Lazy.IO
+ (
+ -- * Performance
+ -- $performance
+
+ -- * Locale support
+ -- $locale
+ -- * File-at-a-time operations
+ readFile
+ , writeFile
+ , appendFile
+ -- * Operations on handles
+ , hGetContents
+ , hGetLine
+ , hPutStr
+ , hPutStrLn
+ -- * Special cases for standard input and output
+ , interact
+ , getContents
+ , getLine
+ , putStr
+ , putStrLn
+ ) where
+
+import Data.Text.Lazy (Text)
+import Prelude hiding (appendFile, getContents, getLine, interact,
+ putStr, putStrLn, readFile, writeFile)
+import System.IO (Handle, IOMode(..), hPutChar, openFile, stdin, stdout,
+ withFile)
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy as L
+import qualified Control.Exception as E
+import Control.Monad (when)
+import Data.IORef (readIORef)
+import Data.Text.Internal.IO (hGetLineWith, readChunk)
+import Data.Text.Internal.Lazy (chunk, empty)
+import GHC.IO.Buffer (isEmptyBuffer)
+import GHC.IO.Exception (IOException(..), IOErrorType(..), ioException)
+import GHC.IO.Handle.Internals (augmentIOError, hClose_help,
+ wantReadableHandle, withHandle)
+import GHC.IO.Handle.Types (Handle__(..), HandleType(..))
+import System.IO (BufferMode(..), hGetBuffering, hSetBuffering)
+import System.IO.Error (isEOFError)
+import System.IO.Unsafe (unsafeInterleaveIO)
+
+-- $performance
+--
+-- The functions in this module obey the runtime system's locale,
+-- character set encoding, and line ending conversion settings.
+--
+-- If you know in advance that you will be working with data that has
+-- a specific encoding (e.g. UTF-8), and your application is highly
+-- performance sensitive, you may find that it is faster to perform
+-- I\/O with bytestrings and to encode and decode yourself than to use
+-- the functions in this module.
+--
+-- Whether this will hold depends on the version of GHC you are using,
+-- the platform you are working on, the data you are working with, and
+-- the encodings you are using, so be sure to test for yourself.
+
+-- | Read a file and return its contents as a string. The file is
+-- read lazily, as with 'getContents'.
+readFile :: FilePath -> IO Text
+readFile name = openFile name ReadMode >>= hGetContents
+
+-- | Write a string to a file. The file is truncated to zero length
+-- before writing begins.
+writeFile :: FilePath -> Text -> IO ()
+writeFile p = withFile p WriteMode . flip hPutStr
+
+-- | Write a string to the end of a file.
+appendFile :: FilePath -> Text -> IO ()
+appendFile p = withFile p AppendMode . flip hPutStr
+
+-- | Lazily read the remaining contents of a 'Handle'. The 'Handle'
+-- will be closed after the read completes, or on error.
+hGetContents :: Handle -> IO Text
+hGetContents h = do
+ chooseGoodBuffering h
+ wantReadableHandle "hGetContents" h $ \hh -> do
+ ts <- lazyRead h
+ return (hh{haType=SemiClosedHandle}, ts)
+
+-- | Use a more efficient buffer size if we're reading in
+-- block-buffered mode with the default buffer size.
+chooseGoodBuffering :: Handle -> IO ()
+chooseGoodBuffering h = do
+ bufMode <- hGetBuffering h
+ when (bufMode == BlockBuffering Nothing) $
+ hSetBuffering h (BlockBuffering (Just 16384))
+
+lazyRead :: Handle -> IO Text
+lazyRead h = unsafeInterleaveIO $
+ withHandle "hGetContents" h $ \hh -> do
+ case haType hh of
+ ClosedHandle -> return (hh, L.empty)
+ SemiClosedHandle -> lazyReadBuffered h hh
+ _ -> ioException
+ (IOError (Just h) IllegalOperation "hGetContents"
+ "illegal handle type" Nothing Nothing)
+
+lazyReadBuffered :: Handle -> Handle__ -> IO (Handle__, Text)
+lazyReadBuffered h hh@Handle__{..} = do
+ buf <- readIORef haCharBuffer
+ (do t <- readChunk hh buf
+ ts <- lazyRead h
+ return (hh, chunk t ts)) `E.catch` \e -> do
+ (hh', _) <- hClose_help hh
+ if isEOFError e
+ then return $ if isEmptyBuffer buf
+ then (hh', empty)
+ else (hh', L.singleton '\r')
+ else E.throwIO (augmentIOError e "hGetContents" h)
+
+-- | Read a single line from a handle.
+hGetLine :: Handle -> IO Text
+hGetLine = hGetLineWith L.fromChunks
+
+-- | Write a string to a handle.
+hPutStr :: Handle -> Text -> IO ()
+hPutStr h = mapM_ (T.hPutStr h) . L.toChunks
+
+-- | Write a string to a handle, followed by a newline.
+hPutStrLn :: Handle -> Text -> IO ()
+hPutStrLn h t = hPutStr h t >> hPutChar h '\n'
+
+-- | The 'interact' function takes a function of type @Text -> Text@
+-- as its argument. The entire input from the standard input device is
+-- passed (lazily) to this function as its argument, and the resulting
+-- string is output on the standard output device.
+interact :: (Text -> Text) -> IO ()
+interact f = putStr . f =<< getContents
+
+-- | Lazily read all user input on 'stdin' as a single string.
+getContents :: IO Text
+getContents = hGetContents stdin
+
+-- | Read a single line of user input from 'stdin'.
+getLine :: IO Text
+getLine = hGetLine stdin
+
+-- | Write a string to 'stdout'.
+putStr :: Text -> IO ()
+putStr = hPutStr stdout
+
+-- | Write a string to 'stdout', followed by a newline.
+putStrLn :: Text -> IO ()
+putStrLn = hPutStrLn stdout
+
+-- $locale
+--
+-- /Note/: The behaviour of functions in this module depends on the
+-- version of GHC you are using.
+--
+-- Beginning with GHC 6.12, text I\/O is performed using the system or
+-- handle's current locale and line ending conventions.
+--
+-- Under GHC 6.10 and earlier, the system I\/O libraries /do not
+-- support/ locale-sensitive I\/O or line ending conversion. On these
+-- versions of GHC, functions in this library all use UTF-8. What
+-- does this mean in practice?
+--
+-- * All data that is read will be decoded as UTF-8.
+--
+-- * Before data is written, it is first encoded as UTF-8.
+--
+-- * On both reading and writing, the platform's native newline
+-- conversion is performed.
+--
+-- If you must use a non-UTF-8 locale on an older version of GHC, you
+-- will have to perform the transcoding yourself, e.g. as follows:
+--
+-- > import qualified Data.ByteString.Lazy as B
+-- > import Data.Text.Lazy (Text)
+-- > import Data.Text.Lazy.Encoding (encodeUtf16LE)
+-- >
+-- > putStr_Utf16LE :: Text -> IO ()
+-- > putStr_Utf16LE t = B.putStr (encodeUtf16LE t)
diff --git a/Data/Text/Lazy/Internal.hs b/Data/Text/Lazy/Internal.hs
new file mode 100644
index 0000000..4f07f03
--- /dev/null
+++ b/Data/Text/Lazy/Internal.hs
@@ -0,0 +1,20 @@
+{-# LANGUAGE BangPatterns, DeriveDataTypeable #-}
+-- |
+-- Module : Data.Text.Lazy.Internal
+-- Copyright : (c) 2013 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+--
+-- This module has been renamed to 'Data.Text.Internal.Lazy'. This
+-- name for the module will be removed in the next major release.
+
+module Data.Text.Lazy.Internal
+ {-# DEPRECATED "Use Data.Text.Internal.Lazy instead" #-}
+ (
+ module Data.Text.Internal.Lazy
+ ) where
+
+import Data.Text.Internal.Lazy
diff --git a/Data/Text/Lazy/Read.hs b/Data/Text/Lazy/Read.hs
new file mode 100644
index 0000000..002601b
--- /dev/null
+++ b/Data/Text/Lazy/Read.hs
@@ -0,0 +1,192 @@
+{-# LANGUAGE OverloadedStrings, CPP #-}
+#if __GLASGOW_HASKELL__ >= 704
+{-# LANGUAGE Safe #-}
+#elif __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+
+-- |
+-- Module : Data.Text.Lazy.Read
+-- Copyright : (c) 2010, 2011 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Portability : GHC
+--
+-- Functions used frequently when reading textual data.
+module Data.Text.Lazy.Read
+ (
+ Reader
+ , decimal
+ , hexadecimal
+ , signed
+ , rational
+ , double
+ ) where
+
+import Control.Monad (liftM)
+import Data.Char (isDigit, isHexDigit)
+import Data.Int (Int8, Int16, Int32, Int64)
+import Data.Ratio ((%))
+import Data.Text.Internal.Read
+import Data.Text.Lazy as T
+import Data.Word (Word, Word8, Word16, Word32, Word64)
+
+-- | Read some text. If the read succeeds, return its value and the
+-- remaining text, otherwise an error message.
+type Reader a = IReader Text a
+type Parser = IParser Text
+
+-- | Read a decimal integer. The input must begin with at least one
+-- decimal digit, and is consumed until a non-digit or end of string
+-- is reached.
+--
+-- This function does not handle leading sign characters. If you need
+-- to handle signed input, use @'signed' 'decimal'@.
+--
+-- /Note/: For fixed-width integer types, this function does not
+-- attempt to detect overflow, so a sufficiently long input may give
+-- incorrect results. If you are worried about overflow, use
+-- 'Integer' for your result type.
+decimal :: Integral a => Reader a
+{-# SPECIALIZE decimal :: Reader Int #-}
+{-# SPECIALIZE decimal :: Reader Int8 #-}
+{-# SPECIALIZE decimal :: Reader Int16 #-}
+{-# SPECIALIZE decimal :: Reader Int32 #-}
+{-# SPECIALIZE decimal :: Reader Int64 #-}
+{-# SPECIALIZE decimal :: Reader Integer #-}
+{-# SPECIALIZE decimal :: Reader Data.Word.Word #-}
+{-# SPECIALIZE decimal :: Reader Word8 #-}
+{-# SPECIALIZE decimal :: Reader Word16 #-}
+{-# SPECIALIZE decimal :: Reader Word32 #-}
+{-# SPECIALIZE decimal :: Reader Word64 #-}
+decimal txt
+ | T.null h = Left "input does not start with a digit"
+ | otherwise = Right (T.foldl' go 0 h, t)
+ where (h,t) = T.span isDigit txt
+ go n d = (n * 10 + fromIntegral (digitToInt d))
+
+-- | Read a hexadecimal integer, consisting of an optional leading
+-- @\"0x\"@ followed by at least one hexadecimal digit. Input is
+-- consumed until a non-hex-digit or end of string is reached.
+-- This function is case insensitive.
+--
+-- This function does not handle leading sign characters. If you need
+-- to handle signed input, use @'signed' 'hexadecimal'@.
+--
+-- /Note/: For fixed-width integer types, this function does not
+-- attempt to detect overflow, so a sufficiently long input may give
+-- incorrect results. If you are worried about overflow, use
+-- 'Integer' for your result type.
+hexadecimal :: Integral a => Reader a
+{-# SPECIALIZE hexadecimal :: Reader Int #-}
+{-# SPECIALIZE hexadecimal :: Reader Integer #-}
+hexadecimal txt
+ | h == "0x" || h == "0X" = hex t
+ | otherwise = hex txt
+ where (h,t) = T.splitAt 2 txt
+
+hex :: Integral a => Reader a
+{-# SPECIALIZE hex :: Reader Int #-}
+{-# SPECIALIZE hex :: Reader Int8 #-}
+{-# SPECIALIZE hex :: Reader Int16 #-}
+{-# SPECIALIZE hex :: Reader Int32 #-}
+{-# SPECIALIZE hex :: Reader Int64 #-}
+{-# SPECIALIZE hex :: Reader Integer #-}
+{-# SPECIALIZE hex :: Reader Word #-}
+{-# SPECIALIZE hex :: Reader Word8 #-}
+{-# SPECIALIZE hex :: Reader Word16 #-}
+{-# SPECIALIZE hex :: Reader Word32 #-}
+{-# SPECIALIZE hex :: Reader Word64 #-}
+hex txt
+    | T.null h  = Left "input does not start with a hexadecimal digit"
+    | otherwise = Right (T.foldl' go 0 h, t)
+  where (h,t)  = T.span isHexDigit txt
+        go n d = (n * 16 + fromIntegral (hexDigitToInt d))
+
+-- | Read an optional leading sign character (@\'-\'@ or @\'+\'@) and
+-- apply it to the result of applying the given reader.
+signed :: Num a => Reader a -> Reader a
+{-# INLINE signed #-}
+signed f = runP (signa (P f))
+
+-- | Read a rational number.
+--
+-- This function accepts an optional leading sign character, followed
+-- by at least one decimal digit. The syntax is similar to that accepted
+-- by the 'read' function, with the exception that a trailing @\'.\'@
+-- or @\'e\'@ /not/ followed by a number is not consumed.
+--
+-- Examples:
+--
+-- >rational "3" == Right (3.0, "")
+-- >rational "3.1" == Right (3.1, "")
+-- >rational "3e4" == Right (30000.0, "")
+-- >rational "3.1e4" == Right (31000.0, "")
+-- >rational ".3" == Left "input does not start with a digit"
+-- >rational "e3" == Left "input does not start with a digit"
+--
+-- Examples of differences from 'read':
+--
+-- >rational "3.foo" == Right (3.0, ".foo")
+-- >rational "3e" == Right (3.0, "e")
+rational :: Fractional a => Reader a
+{-# SPECIALIZE rational :: Reader Double #-}
+rational = floaty $ \real frac fracDenom -> fromRational $
+ real % 1 + frac % fracDenom
+
+-- | Read a rational number.
+--
+-- The syntax accepted by this function is the same as for 'rational'.
+--
+-- /Note/: This function is almost ten times faster than 'rational',
+-- but is slightly less accurate.
+--
+-- The 'Double' type supports about 16 decimal places of accuracy.
+-- For 94.2% of numbers, this function and 'rational' give identical
+-- results, but for the remaining 5.8%, this function loses precision
+-- around the 15th decimal place. For 0.001% of numbers, this
+-- function will lose precision at the 13th or 14th decimal place.
+double :: Reader Double
+double = floaty $ \real frac fracDenom ->
+ fromIntegral real +
+ fromIntegral frac / fromIntegral fracDenom
+
+signa :: Num a => Parser a -> Parser a
+{-# SPECIALIZE signa :: Parser Int -> Parser Int #-}
+{-# SPECIALIZE signa :: Parser Int8 -> Parser Int8 #-}
+{-# SPECIALIZE signa :: Parser Int16 -> Parser Int16 #-}
+{-# SPECIALIZE signa :: Parser Int32 -> Parser Int32 #-}
+{-# SPECIALIZE signa :: Parser Int64 -> Parser Int64 #-}
+{-# SPECIALIZE signa :: Parser Integer -> Parser Integer #-}
+signa p = do
+ sign <- perhaps '+' $ char (\c -> c == '-' || c == '+')
+ if sign == '+' then p else negate `liftM` p
+
+char :: (Char -> Bool) -> Parser Char
+char p = P $ \t -> case T.uncons t of
+ Just (c,t') | p c -> Right (c,t')
+ _ -> Left "character does not match"
+
+floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Reader a
+{-# INLINE floaty #-}
+floaty f = runP $ do
+ sign <- perhaps '+' $ char (\c -> c == '-' || c == '+')
+ real <- P decimal
+ T fraction fracDigits <- perhaps (T 0 0) $ do
+ _ <- char (=='.')
+ digits <- P $ \t -> Right (fromIntegral . T.length $ T.takeWhile isDigit t, t)
+ n <- P decimal
+ return $ T n digits
+ let e c = c == 'e' || c == 'E'
+ power <- perhaps 0 (char e >> signa (P decimal) :: Parser Int)
+ let n = if fracDigits == 0
+ then if power == 0
+ then fromIntegral real
+ else fromIntegral real * (10 ^^ power)
+ else if power == 0
+ then f real fraction (10 ^ fracDigits)
+ else f real fraction (10 ^ fracDigits) * (10 ^^ power)
+ return $! if sign == '+'
+ then n
+ else -n
diff --git a/Data/Text/Read.hs b/Data/Text/Read.hs
new file mode 100644
index 0000000..023f8e4
--- /dev/null
+++ b/Data/Text/Read.hs
@@ -0,0 +1,200 @@
+{-# LANGUAGE OverloadedStrings, UnboxedTuples, CPP #-}
+#if __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+
+-- |
+-- Module : Data.Text.Read
+-- Copyright : (c) 2010, 2011 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Portability : GHC
+--
+-- Functions used frequently when reading textual data.
+module Data.Text.Read
+ (
+ Reader
+ , decimal
+ , hexadecimal
+ , signed
+ , rational
+ , double
+ ) where
+
+import Control.Monad (liftM)
+import Data.Char (isDigit, isHexDigit)
+import Data.Int (Int8, Int16, Int32, Int64)
+import Data.Ratio ((%))
+import Data.Text as T
+import Data.Text.Internal.Private (span_)
+import Data.Text.Internal.Read
+import Data.Word (Word, Word8, Word16, Word32, Word64)
+
+-- | Read some text. If the read succeeds, return its value and the
+-- remaining text, otherwise an error message.
+type Reader a = IReader Text a
+type Parser a = IParser Text a
+
+-- | Read a decimal integer. The input must begin with at least one
+-- decimal digit, and is consumed until a non-digit or end of string
+-- is reached.
+--
+-- This function does not handle leading sign characters. If you need
+-- to handle signed input, use @'signed' 'decimal'@.
+--
+-- /Note/: For fixed-width integer types, this function does not
+-- attempt to detect overflow, so a sufficiently long input may give
+-- incorrect results. If you are worried about overflow, use
+-- 'Integer' for your result type.
+decimal :: Integral a => Reader a
+{-# SPECIALIZE decimal :: Reader Int #-}
+{-# SPECIALIZE decimal :: Reader Int8 #-}
+{-# SPECIALIZE decimal :: Reader Int16 #-}
+{-# SPECIALIZE decimal :: Reader Int32 #-}
+{-# SPECIALIZE decimal :: Reader Int64 #-}
+{-# SPECIALIZE decimal :: Reader Integer #-}
+{-# SPECIALIZE decimal :: Reader Data.Word.Word #-}
+{-# SPECIALIZE decimal :: Reader Word8 #-}
+{-# SPECIALIZE decimal :: Reader Word16 #-}
+{-# SPECIALIZE decimal :: Reader Word32 #-}
+{-# SPECIALIZE decimal :: Reader Word64 #-}
+decimal txt
+ | T.null h = Left "input does not start with a digit"
+ | otherwise = Right (T.foldl' go 0 h, t)
+ where (# h,t #) = span_ isDigit txt
+ go n d = (n * 10 + fromIntegral (digitToInt d))
+
+-- | Read a hexadecimal integer, consisting of an optional leading
+-- @\"0x\"@ followed by at least one hexadecimal digit. Input is
+-- consumed until a non-hex-digit or end of string is reached.
+-- This function is case insensitive.
+--
+-- This function does not handle leading sign characters. If you need
+-- to handle signed input, use @'signed' 'hexadecimal'@.
+--
+-- /Note/: For fixed-width integer types, this function does not
+-- attempt to detect overflow, so a sufficiently long input may give
+-- incorrect results. If you are worried about overflow, use
+-- 'Integer' for your result type.
+hexadecimal :: Integral a => Reader a
+{-# SPECIALIZE hexadecimal :: Reader Int #-}
+{-# SPECIALIZE hexadecimal :: Reader Int8 #-}
+{-# SPECIALIZE hexadecimal :: Reader Int16 #-}
+{-# SPECIALIZE hexadecimal :: Reader Int32 #-}
+{-# SPECIALIZE hexadecimal :: Reader Int64 #-}
+{-# SPECIALIZE hexadecimal :: Reader Integer #-}
+{-# SPECIALIZE hexadecimal :: Reader Word #-}
+{-# SPECIALIZE hexadecimal :: Reader Word8 #-}
+{-# SPECIALIZE hexadecimal :: Reader Word16 #-}
+{-# SPECIALIZE hexadecimal :: Reader Word32 #-}
+{-# SPECIALIZE hexadecimal :: Reader Word64 #-}
+hexadecimal txt
+ | h == "0x" || h == "0X" = hex t
+ | otherwise = hex txt
+ where (h,t) = T.splitAt 2 txt
+
+hex :: Integral a => Reader a
+{-# SPECIALIZE hex :: Reader Int #-}
+{-# SPECIALIZE hex :: Reader Int8 #-}
+{-# SPECIALIZE hex :: Reader Int16 #-}
+{-# SPECIALIZE hex :: Reader Int32 #-}
+{-# SPECIALIZE hex :: Reader Int64 #-}
+{-# SPECIALIZE hex :: Reader Integer #-}
+{-# SPECIALIZE hex :: Reader Word #-}
+{-# SPECIALIZE hex :: Reader Word8 #-}
+{-# SPECIALIZE hex :: Reader Word16 #-}
+{-# SPECIALIZE hex :: Reader Word32 #-}
+{-# SPECIALIZE hex :: Reader Word64 #-}
+hex txt
+ | T.null h = Left "input does not start with a hexadecimal digit"
+ | otherwise = Right (T.foldl' go 0 h, t)
+ where (# h,t #) = span_ isHexDigit txt
+ go n d = (n * 16 + fromIntegral (hexDigitToInt d))
+
+-- | Read an optional leading sign character (@\'-\'@ or @\'+\'@) and
+-- apply it to the result of applying the given reader.
+signed :: Num a => Reader a -> Reader a
+{-# INLINE signed #-}
+signed f = runP (signa (P f))
+
+-- | Read a rational number.
+--
+-- This function accepts an optional leading sign character, followed
+-- by at least one decimal digit. The syntax is similar to that accepted
+-- by the 'read' function, with the exception that a trailing @\'.\'@
+-- or @\'e\'@ /not/ followed by a number is not consumed.
+--
+-- Examples (with behaviour identical to 'read'):
+--
+-- >rational "3" == Right (3.0, "")
+-- >rational "3.1" == Right (3.1, "")
+-- >rational "3e4" == Right (30000.0, "")
+-- >rational "3.1e4" == Right (31000.0, "")
+-- >rational ".3" == Left "input does not start with a digit"
+-- >rational "e3" == Left "input does not start with a digit"
+--
+-- Examples of differences from 'read':
+--
+-- >rational "3.foo" == Right (3.0, ".foo")
+-- >rational "3e" == Right (3.0, "e")
+rational :: Fractional a => Reader a
+{-# SPECIALIZE rational :: Reader Double #-}
+rational = floaty $ \real frac fracDenom -> fromRational $
+ real % 1 + frac % fracDenom
+
+-- | Read a rational number.
+--
+-- The syntax accepted by this function is the same as for 'rational'.
+--
+-- /Note/: This function is almost ten times faster than 'rational',
+-- but is slightly less accurate.
+--
+-- The 'Double' type supports about 16 decimal places of accuracy.
+-- For 94.2% of numbers, this function and 'rational' give identical
+-- results, but for the remaining 5.8%, this function loses precision
+-- around the 15th decimal place. For 0.001% of numbers, this
+-- function will lose precision at the 13th or 14th decimal place.
+double :: Reader Double
+double = floaty $ \real frac fracDenom ->
+ fromIntegral real +
+ fromIntegral frac / fromIntegral fracDenom
+
+signa :: Num a => Parser a -> Parser a
+{-# SPECIALIZE signa :: Parser Int -> Parser Int #-}
+{-# SPECIALIZE signa :: Parser Int8 -> Parser Int8 #-}
+{-# SPECIALIZE signa :: Parser Int16 -> Parser Int16 #-}
+{-# SPECIALIZE signa :: Parser Int32 -> Parser Int32 #-}
+{-# SPECIALIZE signa :: Parser Int64 -> Parser Int64 #-}
+{-# SPECIALIZE signa :: Parser Integer -> Parser Integer #-}
+signa p = do
+ sign <- perhaps '+' $ char (\c -> c == '-' || c == '+')
+ if sign == '+' then p else negate `liftM` p
+
+char :: (Char -> Bool) -> Parser Char
+char p = P $ \t -> case T.uncons t of
+ Just (c,t') | p c -> Right (c,t')
+ _ -> Left "character does not match"
+
+floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Reader a
+{-# INLINE floaty #-}
+floaty f = runP $ do
+ sign <- perhaps '+' $ char (\c -> c == '-' || c == '+')
+ real <- P decimal
+ T fraction fracDigits <- perhaps (T 0 0) $ do
+ _ <- char (=='.')
+ digits <- P $ \t -> Right (T.length $ T.takeWhile isDigit t, t)
+ n <- P decimal
+ return $ T n digits
+ let e c = c == 'e' || c == 'E'
+ power <- perhaps 0 (char e >> signa (P decimal) :: Parser Int)
+ let n = if fracDigits == 0
+ then if power == 0
+ then fromIntegral real
+ else fromIntegral real * (10 ^^ power)
+ else if power == 0
+ then f real fraction (10 ^ fracDigits)
+ else f real fraction (10 ^ fracDigits) * (10 ^^ power)
+ return $! if sign == '+'
+ then n
+ else -n
diff --git a/Data/Text/Show.hs b/Data/Text/Show.hs
new file mode 100644
index 0000000..92d8bce
--- /dev/null
+++ b/Data/Text/Show.hs
@@ -0,0 +1,91 @@
+{-# LANGUAGE CPP, MagicHash #-}
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+#if __GLASGOW_HASKELL__ >= 702
+{-# LANGUAGE Trustworthy #-}
+#endif
+
+-- |
+-- Module : Data.Text.Show
+-- Copyright : (c) 2009-2015 Bryan O'Sullivan
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : GHC
+
+module Data.Text.Show
+ (
+ singleton
+ , unpack
+ , unpackCString#
+ ) where
+
+import Control.Monad.ST (ST)
+import Data.Text.Internal (Text(..), empty_, safe)
+import Data.Text.Internal.Encoding.Utf8 (charTailBytes)
+import Data.Text.Internal.Fusion (stream, unstream)
+import Data.Text.Internal.Unsafe.Char (unsafeWrite)
+import GHC.Prim (Addr#)
+import qualified Data.Text.Array as A
+import qualified Data.Text.Internal.Fusion.Common as S
+
+#if __GLASGOW_HASKELL__ >= 702
+import qualified GHC.CString as GHC
+#else
+import qualified GHC.Base as GHC
+#endif
+
+instance Show Text where
+ showsPrec p ps r = showsPrec p (unpack ps) r
+
+-- | /O(n)/ Convert a 'Text' into a 'String'. Subject to fusion.
+unpack :: Text -> String
+unpack = S.unstreamList . stream
+{-# INLINE [1] unpack #-}
+
+-- | /O(n)/ Convert a literal string into a 'Text'. Subject to
+-- fusion.
+--
+-- This is exposed solely for people writing GHC rewrite rules.
+--
+-- @since 1.2.1.1
+unpackCString# :: Addr# -> Text
+unpackCString# addr# = unstream (S.streamCString# addr#)
+{-# NOINLINE unpackCString# #-}
+
+{-# RULES "TEXT literal" [1] forall a.
+ unstream (S.map safe (S.streamList (GHC.unpackCString# a)))
+ = unpackCString# a #-}
+
+{-# RULES "TEXT literal UTF8" [1] forall a.
+ unstream (S.map safe (S.streamList (GHC.unpackCStringUtf8# a)))
+ = unpackCString# a #-}
+
+{-# RULES "TEXT empty literal" [1]
+ unstream (S.map safe (S.streamList []))
+ = empty_ #-}
+
+{-# RULES "TEXT singleton literal" [1] forall a.
+ unstream (S.map safe (S.streamList [a]))
+ = singleton_ a #-}
+
+-- | /O(1)/ Convert a character into a Text. Subject to fusion.
+-- Performs replacement on invalid scalar values.
+singleton :: Char -> Text
+singleton = unstream . S.singleton . safe
+{-# INLINE [1] singleton #-}
+
+{-# RULES "TEXT singleton" forall a.
+ unstream (S.singleton (safe a))
+ = singleton_ a #-}
+
+-- This is intended to reduce inlining bloat.
+singleton_ :: Char -> Text
+singleton_ c = Text (A.run x) 0 len
+ where x :: ST s (A.MArray s)
+ x = do arr <- A.new len
+ _ <- unsafeWrite arr 0 d
+ return arr
+ len = charTailBytes d + 1
+ d = safe c
+{-# NOINLINE singleton_ #-}
diff --git a/Data/Text/Unsafe.hs b/Data/Text/Unsafe.hs
new file mode 100644
index 0000000..0b2f71f
--- /dev/null
+++ b/Data/Text/Unsafe.hs
@@ -0,0 +1,113 @@
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+-- |
+-- Module : Data.Text.Unsafe
+-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com
+-- Portability : portable
+--
+-- A module containing unsafe 'Text' operations, for very very careful
+-- use in heavily tested code.
+module Data.Text.Unsafe
+ (
+ inlineInterleaveST
+ , inlinePerformIO
+ , unsafeDupablePerformIO
+ , Iter(..)
+ , iter
+ , iter_
+ , reverseIter
+ , reverseIter_
+ , unsafeHead
+ , unsafeTail
+ , lengthWord8
+ , takeWord8
+ , dropWord8
+ ) where
+
+#if defined(ASSERTS)
+import Control.Exception (assert)
+#endif
+import qualified Data.Text.Array as A
+import Data.Text.Internal (Text (..))
+import Data.Text.Internal.Encoding.Utf8 (decodeCharIndex,
+ reverseDecodeCharIndex)
+import Data.Text.Internal.Unsafe (inlineInterleaveST,
+ inlinePerformIO)
+import GHC.IO (unsafeDupablePerformIO)
+
+-- | /O(1)/ A variant of 'head' for non-empty 'Text'. 'unsafeHead'
+-- omits the check for the empty case, so there is an obligation on
+-- the programmer to provide a proof that the 'Text' is non-empty.
+unsafeHead :: Text -> Char
+unsafeHead (Text arr off _len) =
+ decodeCharIndex (\c _ -> c) (A.unsafeIndex arr) off
+{-# INLINE unsafeHead #-}
+
+-- | /O(1)/ A variant of 'tail' for non-empty 'Text'. 'unsafeTail'
+-- omits the check for the empty case, so there is an obligation on
+-- the programmer to provide a proof that the 'Text' is non-empty.
+unsafeTail :: Text -> Text
+unsafeTail t@(Text arr off len) =
+#if defined(ASSERTS)
+ assert (d <= len) $
+#endif
+ Text arr (off+d) (len-d)
+ where d = iter_ t 0
+{-# INLINE unsafeTail #-}
+
+data Iter = Iter {-# UNPACK #-} !Char {-# UNPACK #-} !Int
+
+-- | /O(1)/ Iterate (unsafely) one step forwards through a UTF-8
+-- array, returning the current character and the delta to add to give
+-- the next offset to iterate at.
+iter :: Text -> Int -> Iter
+iter (Text arr off _len) i =
+ decodeCharIndex (\c d -> Iter c d) (A.unsafeIndex arr) (off + i)
+{-# INLINE iter #-}
+
+-- | /O(1)/ Iterate one step through a UTF-8 array, returning the
+-- delta to add to give the next offset to iterate at.
+iter_ :: Text -> Int -> Int
+iter_ (Text arr off _len) i =
+ decodeCharIndex (\_ n -> n) (\x -> A.unsafeIndex arr (x + off)) i
+{-# INLINE iter_ #-}
+
+-- | /O(1)/ Iterate one step backwards through a UTF-8 array,
+-- returning the current character and the delta to add (i.e. a
+-- negative number) to give the next offset to iterate at.
+reverseIter :: Text -> Int -> (Char,Int)
+reverseIter (Text arr off _len) i =
+ reverseDecodeCharIndex (\c s -> (c, -s)) idx (off + i)
+ where
+ idx = A.unsafeIndex arr
+{-# INLINE reverseIter #-}
+
+-- | /O(1)/ Iterate one step backwards through a UTF-8 array,
+-- returning the delta to add (i.e. a negative number) to give the
+-- next offset to iterate at.
+--
+-- @since 1.1.1.0
+reverseIter_ :: Text -> Int -> Int
+reverseIter_ (Text arr off _len) i =
+ reverseDecodeCharIndex (\_ n -> -n) (\x -> A.unsafeIndex arr (x + off)) i
+{-# INLINE reverseIter_ #-}
+
+-- | /O(1)/ Return the length of a 'Text' in units of 'Word8'. This
+-- is useful for sizing a target array appropriately before using
+-- 'unsafeCopyToPtr'.
+lengthWord8 :: Text -> Int
+lengthWord8 (Text _arr _off len) = len
+{-# INLINE lengthWord8 #-}
+
+-- | /O(1)/ Unchecked take of 'k' 'Word8's from the front of a 'Text'.
+takeWord8 :: Int -> Text -> Text
+takeWord8 k (Text arr off _len) = Text arr off k
+{-# INLINE takeWord8 #-}
+
+-- | /O(1)/ Unchecked drop of 'k' 'Word8's from the front of a 'Text'.
+dropWord8 :: Int -> Text -> Text
+dropWord8 k (Text arr off len) = Text arr (off+k) (len-k)
+{-# INLINE dropWord8 #-}
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..3c92c1b
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,26 @@
+Copyright (c) 2008-2009, Tom Harper
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Setup.lhs b/Setup.lhs
new file mode 100644
index 0000000..5bde0de
--- /dev/null
+++ b/Setup.lhs
@@ -0,0 +1,3 @@
+#!/usr/bin/env runhaskell
+> import Distribution.Simple
+> main = defaultMain
diff --git a/benchmarks/Setup.hs b/benchmarks/Setup.hs
new file mode 100644
index 0000000..9a994af
--- /dev/null
+++ b/benchmarks/Setup.hs
@@ -0,0 +1,2 @@
+import Distribution.Simple
+main = defaultMain
diff --git a/benchmarks/cbits/time_iconv.c b/benchmarks/cbits/time_iconv.c
new file mode 100644
index 0000000..c563b22
--- /dev/null
+++ b/benchmarks/cbits/time_iconv.c
@@ -0,0 +1,35 @@
+#include <iconv.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+int time_iconv(char *srcbuf, size_t srcbufsize)
+{
+  uint16_t *destbuf = NULL;
+  size_t destbufsize;
+  static uint16_t *origdestbuf;
+  static size_t origdestbufsize;
+  static iconv_t ic = (iconv_t) -1;  /* static: open once, reuse across calls */
+  int ret = 0;
+
+  if (ic == (iconv_t) -1) {
+    ic = iconv_open("UTF-16LE", "UTF-8");
+    if (ic == (iconv_t) -1) {
+      ret = -1;
+      goto done;
+    }
+  }
+
+  destbufsize = srcbufsize * sizeof(uint16_t);
+  if (destbufsize > origdestbufsize) {
+    free(origdestbuf);
+    origdestbuf = destbuf = malloc(origdestbufsize = destbufsize);
+    if (destbuf == NULL) { origdestbufsize = 0; ret = -1; goto done; }
+  } else {
+    destbuf = origdestbuf;
+  }
+  iconv(ic, &srcbuf, &srcbufsize, (char**) &destbuf, &destbufsize);
+
+ done:
+  return ret;
+}
diff --git a/benchmarks/haskell/Benchmarks.hs b/benchmarks/haskell/Benchmarks.hs
new file mode 100644
index 0000000..f074ab4
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks.hs
@@ -0,0 +1,79 @@
+-- | Main module to run the micro benchmarks
+--
+{-# LANGUAGE OverloadedStrings #-}
+module Main
+ ( main
+ ) where
+
+import Criterion.Main (Benchmark, defaultMain, bgroup)
+import System.FilePath ((</>))
+import System.IO (IOMode (WriteMode), openFile, hSetEncoding, utf8)
+
+import qualified Benchmarks.Builder as Builder
+import qualified Benchmarks.DecodeUtf8 as DecodeUtf8
+import qualified Benchmarks.EncodeUtf8 as EncodeUtf8
+import qualified Benchmarks.Equality as Equality
+import qualified Benchmarks.FileRead as FileRead
+import qualified Benchmarks.FoldLines as FoldLines
+import qualified Benchmarks.Mul as Mul
+import qualified Benchmarks.Pure as Pure
+import qualified Benchmarks.ReadNumbers as ReadNumbers
+import qualified Benchmarks.Replace as Replace
+import qualified Benchmarks.Search as Search
+import qualified Benchmarks.Stream as Stream
+import qualified Benchmarks.WordFrequencies as WordFrequencies
+
+import qualified Benchmarks.Programs.BigTable as Programs.BigTable
+import qualified Benchmarks.Programs.Cut as Programs.Cut
+import qualified Benchmarks.Programs.Fold as Programs.Fold
+import qualified Benchmarks.Programs.Sort as Programs.Sort
+import qualified Benchmarks.Programs.StripTags as Programs.StripTags
+import qualified Benchmarks.Programs.Throughput as Programs.Throughput
+
+main :: IO ()
+main = benchmarks >>= defaultMain
+
+benchmarks :: IO [Benchmark]
+benchmarks = do
+ sink <- openFile "/dev/null" WriteMode
+ hSetEncoding sink utf8
+
+ -- Traditional benchmarks
+ bs <- sequence
+ [ Builder.benchmark
+ , DecodeUtf8.benchmark "html" (tf "libya-chinese.html")
+ , DecodeUtf8.benchmark "xml" (tf "yiwiki.xml")
+ , DecodeUtf8.benchmark "ascii" (tf "ascii.txt")
+ , DecodeUtf8.benchmark "russian" (tf "russian.txt")
+ , DecodeUtf8.benchmark "japanese" (tf "japanese.txt")
+ , EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯"
+ , Equality.benchmark (tf "japanese.txt")
+ , FileRead.benchmark (tf "russian.txt")
+ , FoldLines.benchmark (tf "russian.txt")
+ , Mul.benchmark
+ , Pure.benchmark "tiny" (tf "tiny.txt")
+ , Pure.benchmark "ascii" (tf "ascii-small.txt")
+ -- , Pure.benchmark "france" (tf "france.html")
+ , Pure.benchmark "russian" (tf "russian-small.txt")
+ , Pure.benchmark "japanese" (tf "japanese.txt")
+ , ReadNumbers.benchmark (tf "numbers.txt")
+ , Replace.benchmark (tf "russian.txt") "принимая" "своем"
+ , Search.benchmark (tf "russian.txt") "принимая"
+ , Stream.benchmark (tf "russian.txt")
+ , WordFrequencies.benchmark (tf "russian.txt")
+ ]
+
+ -- Program-like benchmarks
+ ps <- bgroup "Programs" `fmap` sequence
+ [ Programs.BigTable.benchmark sink
+ , Programs.Cut.benchmark (tf "russian.txt") sink 20 40
+ , Programs.Fold.benchmark (tf "russian.txt") sink
+ , Programs.Sort.benchmark (tf "russian.txt") sink
+ , Programs.StripTags.benchmark (tf "yiwiki.xml") sink
+ , Programs.Throughput.benchmark (tf "russian.txt") sink
+ ]
+
+ return $ bs ++ [ps]
+ where
+ -- Location of a test file
+ tf = ("../tests/text-test-data" </>)
diff --git a/benchmarks/haskell/Benchmarks/Builder.hs b/benchmarks/haskell/Benchmarks/Builder.hs
new file mode 100644
index 0000000..b6e7bd7
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Builder.hs
@@ -0,0 +1,75 @@
+-- | Testing the internal builder monoid
+--
+-- Tested in this benchmark:
+--
+-- * Concatenating many small strings using a builder
+--
+{-# LANGUAGE OverloadedStrings #-}
+module Benchmarks.Builder
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bgroup, bench, nf)
+import Data.Binary.Builder as B
+import Data.ByteString.Char8 ()
+import Data.Monoid (mconcat, mempty)
+import qualified Blaze.ByteString.Builder as Blaze
+import qualified Blaze.ByteString.Builder.Char.Utf8 as Blaze
+import qualified Data.ByteString as SB
+import qualified Data.ByteString.Lazy as LB
+import qualified Data.Text as T
+import qualified Data.Text.Lazy as LT
+import qualified Data.Text.Lazy.Builder as LTB
+import qualified Data.Text.Lazy.Builder.Int as Int
+import Data.Int (Int64)
+
+benchmark :: IO Benchmark
+benchmark = return $ bgroup "Builder"
+ [ bgroup "Comparison"
+ [ bench "LazyText" $ nf
+ (LT.length . LTB.toLazyText . mconcat . map LTB.fromText) texts
+ , bench "Binary" $ nf
+ (LB.length . B.toLazyByteString . mconcat . map B.fromByteString)
+ byteStrings
+ , bench "Blaze" $ nf
+ (LB.length . Blaze.toLazyByteString . mconcat . map Blaze.fromString)
+ strings
+ ]
+ , bgroup "Int"
+ [ bgroup "Decimal"
+ [ bgroup "Positive" .
+ flip map numbers $ \n ->
+ (bench (show (length (show n))) $ nf (LTB.toLazyText . Int.decimal) n)
+ , bgroup "Negative" .
+ flip map numbers $ \m ->
+ let n = negate m in
+ (bench (show (length (show n))) $ nf (LTB.toLazyText . Int.decimal) n)
+ , bench "Empty" $ nf LTB.toLazyText mempty
+ , bgroup "Show" .
+ flip map numbers $ \n ->
+ (bench (show (length (show n))) $ nf show n)
+ ]
+ ]
+ ]
+ where
+ numbers :: [Int64]
+ numbers = [
+ 6, 14, 500, 9688, 10654, 620735, 5608880, 37010612,
+ 731223504, 5061580596, 24596952933, 711732309084, 2845910093839,
+ 54601756118340, 735159434806159, 3619097625502435, 95777227510267124,
+ 414944309510675693, 8986407456998704019
+ ]
+
+texts :: [T.Text]
+texts = take 200000 $ cycle ["foo", "λx", "由の"]
+{-# NOINLINE texts #-}
+
+-- Note that the non-ascii characters will be chopped
+byteStrings :: [SB.ByteString]
+byteStrings = take 200000 $ cycle ["foo", "λx", "由の"]
+{-# NOINLINE byteStrings #-}
+
+-- Note that the non-ascii characters will be chopped
+strings :: [String]
+strings = take 200000 $ cycle ["foo", "λx", "由の"]
+{-# NOINLINE strings #-}
diff --git a/benchmarks/haskell/Benchmarks/DecodeUtf8.hs b/benchmarks/haskell/Benchmarks/DecodeUtf8.hs
new file mode 100644
index 0000000..8d5f29d
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/DecodeUtf8.hs
@@ -0,0 +1,67 @@
+{-# LANGUAGE ForeignFunctionInterface #-}
+
+-- | Test decoding of UTF-8
+--
+-- Tested in this benchmark:
+--
+-- * Decoding bytes using UTF-8
+--
+-- In some tests:
+--
+-- * Taking the length of the result
+--
+-- * Taking the init of the result
+--
+-- The latter are used for testing stream fusion.
+--
+module Benchmarks.DecodeUtf8
+ ( benchmark
+ ) where
+
+import Foreign.C.Types
+import Data.ByteString.Internal (ByteString(..))
+import Data.ByteString.Lazy.Internal (ByteString(..))
+import Foreign.Ptr (Ptr, plusPtr)
+import Foreign.ForeignPtr (withForeignPtr)
+import Data.Word (Word8)
+import qualified Criterion as C
+import Criterion (Benchmark, bgroup, nf, whnfIO)
+import qualified Codec.Binary.UTF8.Generic as U8
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Encoding as TL
+
+benchmark :: String -> FilePath -> IO Benchmark
+benchmark kind fp = do
+ bs <- B.readFile fp
+ lbs <- BL.readFile fp
+ let bench name = C.bench (name ++ "+" ++ kind)
+ decodeStream (Chunk b0 bs0) = case T.streamDecodeUtf8 b0 of
+ T.Some t0 _ f0 -> t0 : go f0 bs0
+ where go f (Chunk b bs1) = case f b of
+ T.Some t1 _ f1 -> t1 : go f1 bs1
+ go _ _ = []
+ decodeStream _ = []
+ return $ bgroup "DecodeUtf8"
+ [ bench "Strict" $ nf T.decodeUtf8 bs
+ , bench "Stream" $ nf decodeStream lbs
+ , bench "IConv" $ whnfIO $ iconv bs
+ , bench "StrictLength" $ nf (T.length . T.decodeUtf8) bs
+ , bench "StrictInitLength" $ nf (T.length . T.init . T.decodeUtf8) bs
+ , bench "Lazy" $ nf TL.decodeUtf8 lbs
+ , bench "LazyLength" $ nf (TL.length . TL.decodeUtf8) lbs
+ , bench "LazyInitLength" $ nf (TL.length . TL.init . TL.decodeUtf8) lbs
+ , bench "StrictStringUtf8" $ nf U8.toString bs
+ , bench "StrictStringUtf8Length" $ nf (length . U8.toString) bs
+ , bench "LazyStringUtf8" $ nf U8.toString lbs
+ , bench "LazyStringUtf8Length" $ nf (length . U8.toString) lbs
+ ]
+
+iconv :: B.ByteString -> IO CInt
+iconv (PS fp off len) = withForeignPtr fp $ \ptr ->
+ time_iconv (ptr `plusPtr` off) (fromIntegral len)
+
+foreign import ccall unsafe time_iconv :: Ptr Word8 -> CSize -> IO CInt
diff --git a/benchmarks/haskell/Benchmarks/EncodeUtf8.hs b/benchmarks/haskell/Benchmarks/EncodeUtf8.hs
new file mode 100644
index 0000000..758e095
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/EncodeUtf8.hs
@@ -0,0 +1,33 @@
+-- | UTF-8 encode a text
+--
+-- Tested in this benchmark:
+--
+-- * Replicating a string a number of times
+--
+-- * UTF-8 encoding it
+--
+module Benchmarks.EncodeUtf8
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bgroup, bench, whnf)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Encoding as TL
+
+benchmark :: String -> IO Benchmark
+benchmark string = do
+ return $ bgroup "EncodeUtf8"
+ [ bench "Text" $ whnf (B.length . T.encodeUtf8) text
+ , bench "LazyText" $ whnf (BL.length . TL.encodeUtf8) lazyText
+ ]
+ where
+ -- The string in different formats
+ text = T.replicate k $ T.pack string
+ lazyText = TL.replicate (fromIntegral k) $ TL.pack string
+
+ -- Amount
+ k = 100000
diff --git a/benchmarks/haskell/Benchmarks/Equality.hs b/benchmarks/haskell/Benchmarks/Equality.hs
new file mode 100644
index 0000000..33964a2
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Equality.hs
@@ -0,0 +1,38 @@
+-- | Compare a string with a copy of itself that is identical except
+-- for the last character.
+--
+-- Tested in this benchmark:
+--
+-- * Comparison of strings (Eq instance)
+--
+module Benchmarks.Equality
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bgroup, bench, whnf)
+import qualified Data.ByteString.Char8 as B
+import qualified Data.ByteString.Lazy.Char8 as BL
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Encoding as TL
+
+benchmark :: FilePath -> IO Benchmark
+benchmark fp = do
+ b <- B.readFile fp
+ bl1 <- BL.readFile fp
+ -- A lazy bytestring is a list of chunks. When we do not explicitly create two
+ -- different lazy bytestrings at a different address, the bytestring library
+ -- will compare the chunk addresses instead of the chunk contents. This is why
+ -- we read the lazy bytestring twice here.
+ bl2 <- BL.readFile fp
+ l <- readFile fp
+ let t = T.decodeUtf8 b
+ tl = TL.decodeUtf8 bl1
+ return $ bgroup "Equality"
+ [ bench "Text" $ whnf (== T.init t `T.snoc` '\xfffd') t
+ , bench "LazyText" $ whnf (== TL.init tl `TL.snoc` '\xfffd') tl
+ , bench "ByteString" $ whnf (== B.init b `B.snoc` '\xfffd') b
+ , bench "LazyByteString" $ whnf (== BL.init bl2 `BL.snoc` '\xfffd') bl1
+ , bench "String" $ whnf (== init l ++ "\xfffd") l
+ ]
diff --git a/benchmarks/haskell/Benchmarks/FileRead.hs b/benchmarks/haskell/Benchmarks/FileRead.hs
new file mode 100644
index 0000000..443ad5a
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/FileRead.hs
@@ -0,0 +1,33 @@
+-- | Benchmarks simple file reading
+--
+-- Tested in this benchmark:
+--
+-- * Reading a file from the disk
+--
+module Benchmarks.FileRead
+ ( benchmark
+ ) where
+
+import Control.Applicative ((<$>))
+import Criterion (Benchmark, bgroup, bench, whnfIO)
+import qualified Data.ByteString as SB
+import qualified Data.ByteString.Lazy as LB
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy as LT
+import qualified Data.Text.Lazy.Encoding as LT
+import qualified Data.Text.Lazy.IO as LT
+
+benchmark :: FilePath -> IO Benchmark
+benchmark p = return $ bgroup "FileRead"
+ [ bench "String" $ whnfIO $ length <$> readFile p
+ , bench "ByteString" $ whnfIO $ SB.length <$> SB.readFile p
+ , bench "LazyByteString" $ whnfIO $ LB.length <$> LB.readFile p
+ , bench "Text" $ whnfIO $ T.length <$> T.readFile p
+ , bench "LazyText" $ whnfIO $ LT.length <$> LT.readFile p
+ , bench "TextByteString" $ whnfIO $
+ (T.length . T.decodeUtf8) <$> SB.readFile p
+ , bench "LazyTextByteString" $ whnfIO $
+ (LT.length . LT.decodeUtf8) <$> LB.readFile p
+ ]
diff --git a/benchmarks/haskell/Benchmarks/FoldLines.hs b/benchmarks/haskell/Benchmarks/FoldLines.hs
new file mode 100644
index 0000000..4cc598f
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/FoldLines.hs
@@ -0,0 +1,58 @@
+-- | Read a file line-by-line using handles, and perform a fold over the lines.
+-- The fold is used here to calculate the number of lines in the file.
+--
+-- Tested in this benchmark:
+--
+-- * Buffered, line-based IO
+--
+{-# LANGUAGE BangPatterns #-}
+module Benchmarks.FoldLines
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bgroup, bench, whnfIO)
+import System.IO
+import qualified Data.ByteString as B
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+
+benchmark :: FilePath -> IO Benchmark
+benchmark fp = return $ bgroup "ReadLines"
+ [ bench "Text" $ withHandle $ foldLinesT (\n _ -> n + 1) (0 :: Int)
+ , bench "ByteString" $ withHandle $ foldLinesB (\n _ -> n + 1) (0 :: Int)
+ ]
+ where
+ withHandle f = whnfIO $ do
+ h <- openFile fp ReadMode
+ hSetBuffering h (BlockBuffering (Just 16384))
+ x <- f h
+ hClose h
+ return x
+
+-- | Text line fold
+--
+foldLinesT :: (a -> T.Text -> a) -> a -> Handle -> IO a
+foldLinesT f z0 h = go z0
+ where
+ go !z = do
+ eof <- hIsEOF h
+ if eof
+ then return z
+ else do
+ l <- T.hGetLine h
+ let z' = f z l in go z'
+{-# INLINE foldLinesT #-}
+
+-- | ByteString line fold
+--
+foldLinesB :: (a -> B.ByteString -> a) -> a -> Handle -> IO a
+foldLinesB f z0 h = go z0
+ where
+ go !z = do
+ eof <- hIsEOF h
+ if eof
+ then return z
+ else do
+ l <- B.hGetLine h
+ let z' = f z l in go z'
+{-# INLINE foldLinesB #-}
diff --git a/benchmarks/haskell/Benchmarks/Mul.hs b/benchmarks/haskell/Benchmarks/Mul.hs
new file mode 100644
index 0000000..c7dfb1d
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Mul.hs
@@ -0,0 +1,138 @@
+module Benchmarks.Mul (benchmark) where
+
+import Control.Exception (evaluate)
+import Criterion.Main
+import Data.Int (Int32, Int64)
+import Data.Text.Internal (mul32, mul64)
+import qualified Data.Vector.Unboxed as U
+
+oldMul :: Int64 -> Int64 -> Int64
+oldMul m n
+ | n == 0 = 0
+ | m <= maxBound `quot` n = m * n
+ | otherwise = error "overflow"
+
+benchmark :: IO Benchmark
+benchmark = do
+ _ <- evaluate testVector32
+ _ <- evaluate testVector64
+ return $ bgroup "Mul" [
+ bench "oldMul" $ whnf (U.map (uncurry oldMul)) testVector64
+ , bench "mul64" $ whnf (U.map (uncurry mul64)) testVector64
+ , bench "*64" $ whnf (U.map (uncurry (*))) testVector64
+ , bench "mul32" $ whnf (U.map (uncurry mul32)) testVector32
+ , bench "*32" $ whnf (U.map (uncurry (*))) testVector32
+ ]
+
+testVector64 :: U.Vector (Int64,Int64)
+testVector64 = U.fromList [
+ (0,1248868987182846646),(169004623633872,24458),(482549039517835,7614),
+ (372,8157063115504364),(27,107095594861148252),(3,63249878517962420),
+ (4363,255694473572912),(86678474,1732634806),(1572453024,1800489338),
+ (9384523143,77053781),(49024709555,75095046),(7,43457620410239131),
+ (8,8201563008844571),(387719037,1520696708),(189869238220197,1423),
+ (46788016849611,23063),(503077742109974359,0),(104,1502010908706487),
+ (30478140346,207525518),(80961140129236192,14),(4283,368012829143675),
+ (1028719181728108146,6),(318904,5874863049591),(56724427166898,110794),
+ (234539368,31369110449),(2,251729663598178612),(103291548194451219,5),
+ (76013,5345328755566),(1769631,2980846129318),(40898,60598477385754),
+ (0,98931348893227155),(573555872156917492,3),(318821187115,4476566),
+ (11152874213584,243582),(40274276,16636653248),(127,4249988676030597),
+ (103543712111871836,5),(71,16954462148248238),(3963027173504,216570),
+ (13000,503523808916753),(17038308,20018685905),(0,510350226577891549),
+ (175898,3875698895405),(425299191292676,5651),(17223451323664536,50),
+ (61755131,14247665326),(0,1018195131697569303),(36433751497238985,20),
+ (3473607861601050,1837),(1392342328,1733971838),(225770297367,3249655),
+ (14,127545244155254102),(1751488975299136,2634),(3949208,504190668767),
+ (153329,831454434345),(1066212122928663658,2),(351224,2663633539556),
+ (344565,53388869217),(35825609350446863,54),(276011553660081475,10),
+ (1969754174790470349,3),(35,68088438338633),(506710,3247689556438),
+ (11099382291,327739909),(105787303549,32824363),(210366111,14759049409),
+ (688893241579,3102676),(8490,70047474429581),(152085,29923000251880),
+ (5046974599257095,400),(4183167795,263434071),(10089728,502781960687),
+ (44831977765,4725378),(91,8978094664238578),(30990165721,44053350),
+ (1772377,149651820860),(243420621763408572,4),(32,5790357453815138),
+ (27980806337993771,5),(47696295759774,20848),(1745874142313778,1098),
+ (46869334770121,1203),(886995283,1564424789),(40679396544,76002479),
+ (1,672849481568486995),(337656187205,3157069),(816980552858963,6003),
+ (2271434085804831543,1),(0,1934521023868747186),(6266220038281,15825),
+ (4160,107115946987394),(524,246808621791561),(0,1952519482439636339),
+ (128,2865935904539691),(1044,3211982069426297),(16000511542473,88922),
+ (1253596745404082,2226),(27041,56836278958002),(23201,49247489754471),
+ (175906590497,21252392),(185163584757182295,24),(34742225226802197,150),
+ (2363228,250824838408),(216327527109550,45),(24,81574076994520675),
+ (28559899906542,15356),(10890139774837133,511),(2293,707179303654492),
+ (2749366833,40703233),(0,4498229704622845986),(439,4962056468281937),
+ (662,1453820621089921),(16336770612459631,220),(24282989393,74239137),
+ (2724564648490195,3),(743672760,124992589),(4528103,704330948891),
+ (6050483122491561,250),(13322953,13594265152),(181794,22268101450214),
+ (25957941712,75384092),(43352,7322262295009),(32838,52609059549923),
+ (33003585202001564,2),(103019,68430142267402),(129918230800,8742978),
+ (0,2114347379589080688),(2548,905723041545274),(222745067962838382,0),
+ (1671683850790425181,1),(455,4836932776795684),(794227702827214,6620),
+ (212534135175874,1365),(96432431858,29784975),(466626763743380,3484),
+ (29793949,53041519613),(8359,309952753409844),(3908960585331901,26),
+ (45185288970365760,114),(10131829775,68110174),(58039242399640479,83),
+ (628092278238719399,6),(1,196469106875361889),(302336625,16347502444),
+ (148,3748088684181047),(1,1649096568849015456),(1019866864,2349753026),
+ (8211344830,569363306),(65647579546873,34753),(2340190,1692053129069),
+ (64263301,30758930355),(48681618072372209,110),(7074794736,47640197),
+ (249634721521,7991792),(1162917363807215,232),(7446433349,420634045),
+ (63398619383,60709817),(51359004508011,14200),(131788797028647,7072),
+ (52079887791430043,7),(7,136277667582599838),(28582879735696,50327),
+ (1404582800566278,833),(469164435,15017166943),(99567079957578263,49),
+ (1015285971,3625801566),(321504843,4104079293),(5196954,464515406632),
+ (114246832260876,7468),(8149664437,487119673),(12265299,378168974869),
+ (37711995764,30766513),(3971137243,710996152),(483120070302,603162),
+ (103009942,61645547145),(8476344625340,6987),(547948761229739,1446),
+ (42234,18624767306301),(13486714173011,58948),(4,198309153268019840),
+ (9913176974,325539248),(28246225540203,116822),(2882463945582154,18),
+ (959,25504987505398),(3,1504372236378217710),(13505229956793,374987),
+ (751661959,457611342),(27375926,36219151769),(482168869,5301952074),
+ (1,1577425863241520640),(714116235611821,1164),(904492524250310488,0),
+ (5983514941763398,68),(10759472423,23540686),(72539568471529,34919),
+ (4,176090672310337473),(938702842110356453,1),(673652445,3335287382),
+ (3111998893666122,917),(1568013,3168419765469)]
+
+testVector32 :: U.Vector (Int32,Int32)
+testVector32 = U.fromList [
+ (39242,410),(0,100077553),(2206,9538),(509400240,1),(38048,6368),
+ (1789,651480),(2399,157032),(701,170017),(5241456,14),(11212,70449),
+ (1,227804876),(749687254,1),(74559,2954),(1158,147957),(410604456,1),
+ (170851,1561),(92643422,1),(6192,180509),(7,24202210),(3440,241481),
+ (5753677,5),(294327,1622),(252,4454673),(127684121,11),(28315800,30),
+ (340370905,0),(1,667887987),(592782090,1),(49023,27641),(750,290387),
+ (72886,3847),(0,301047933),(3050276,473),(1,788366142),(59457,15813),
+ (637726933,1),(1135,344317),(853616,264),(696816,493),(7038,12046),
+ (125219574,4),(803694088,1),(107081726,1),(39294,21699),(16361,38191),
+ (132561123,12),(1760,23499),(847543,484),(175687349,1),(2963,252678),
+ (6248,224553),(27596,4606),(5422922,121),(1542,485890),(131,583035),
+ (59096,4925),(3637115,132),(0,947225435),(86854,6794),(2984745,339),
+ (760129569,1),(1,68260595),(380835652,2),(430575,2579),(54514,7211),
+ (15550606,3),(9,27367402),(3007053,207),(7060988,60),(28560,27130),
+ (1355,21087),(10880,53059),(14563646,4),(461886361,1),(2,169260724),
+ (241454126,2),(406797,1),(61631630,16),(44473,5943),(63869104,12),
+ (950300,1528),(2113,62333),(120817,9358),(100261456,1),(426764723,1),
+ (119,12723684),(3,53358711),(4448071,18),(1,230278091),(238,232102),
+ (8,57316440),(42437979,10),(6769,19555),(48590,22006),(11500585,79),
+ (2808,97638),(42,26952545),(11,32104194),(23954638,1),(785427272,0),
+ (513,81379),(31333960,37),(897772,1009),(4,25679692),(103027993,12),
+ (104972702,11),(546,443401),(7,65137092),(88574269,3),(872139069,0),
+ (2,97417121),(378802603,0),(141071401,4),(22613,10575),(2191743,118),
+ (470,116119),(7062,38166),(231056,1847),(43901963,9),(2400,70640),
+ (63553,1555),(34,11249573),(815174,1820),(997894011,0),(98881794,2),
+ (5448,43132),(27956,9),(904926,1357),(112608626,3),(124,613021),
+ (282086,1966),(99,10656881),(113799,1501),(433318,2085),(442,948171),
+ (165380,1043),(28,14372905),(14880,50462),(2386,219918),(229,1797565),
+ (1174961,298),(3925,41833),(3903515,299),(15690452,111),(360860521,3),
+ (7440846,81),(2541026,507),(0,492448477),(6869,82469),(245,8322939),
+ (3503496,253),(123495298,0),(150963,2299),(33,4408482),(1,200911107),
+ (305,252121),(13,123369189),(215846,8181),(2440,65387),(776764401,1),
+ (1241172,434),(8,15493155),(81953961,6),(17884993,5),(26,6893822),
+ (0,502035190),(1,582451018),(2,514870139),(227,3625619),(49,12720258),
+ (1456769,207),(94797661,10),(234407,893),(26843,5783),(15688,24547),
+ (4091,86268),(4339448,151),(21360,6294),(397046497,2),(1227,205936),
+ (9966,21959),(160046791,1),(0,159992224),(27,24974797),(19177,29334),
+ (4136148,42),(21179785,53),(61256583,31),(385,344176),(7,11934915),
+ (1,18992566),(3488065,5),(768021,224),(36288474,7),(8624,117561),
+ (8,20341439),(5903,261475),(561,1007618),(1738,392327),(633049,1708)]
diff --git a/benchmarks/haskell/Benchmarks/Programs/BigTable.hs b/benchmarks/haskell/Benchmarks/Programs/BigTable.hs
new file mode 100644
index 0000000..b546fac
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Programs/BigTable.hs
@@ -0,0 +1,42 @@
+-- | Create a large HTML table and dump it to a handle
+--
+-- Tested in this benchmark:
+--
+-- * Creating a large HTML document using a builder
+--
+-- * Writing to a handle
+--
+{-# LANGUAGE OverloadedStrings #-}
+module Benchmarks.Programs.BigTable
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bench, whnfIO)
+import Data.Monoid (mappend, mconcat)
+import Data.Text.Lazy.Builder (Builder, fromText, toLazyText)
+import Data.Text.Lazy.IO (hPutStr)
+import System.IO (Handle)
+import qualified Data.Text as T
+
+benchmark :: Handle -> IO Benchmark
+benchmark sink = return $ bench "BigTable" $ whnfIO $ do
+ hPutStr sink "Content-Type: text/html\n\n<table>"
+ hPutStr sink . toLazyText . makeTable =<< rows
+ hPutStr sink "</table>"
+ where
+ -- We provide the number of rows in IO so the builder value isn't shared
+ -- between the benchmark samples.
+ rows :: IO Int
+ rows = return 20000
+ {-# NOINLINE rows #-}
+
+makeTable :: Int -> Builder
+makeTable n = mconcat $ replicate n $ mconcat $ map makeCol [1 .. 50]
+
+makeCol :: Int -> Builder
+makeCol 1 = fromText "<tr><td>1</td>"
+makeCol 50 = fromText "<td>50</td></tr>"
+makeCol i = fromText "<td>" `mappend` (fromInt i `mappend` fromText "</td>")
+
+fromInt :: Int -> Builder
+fromInt = fromText . T.pack . show
diff --git a/benchmarks/haskell/Benchmarks/Programs/Cut.hs b/benchmarks/haskell/Benchmarks/Programs/Cut.hs
new file mode 100644
index 0000000..5887299
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Programs/Cut.hs
@@ -0,0 +1,98 @@
+-- | Cut into a file, selecting certain columns (e.g. columns 10 to 40)
+--
+-- Tested in this benchmark:
+--
+-- * Reading the file
+--
+-- * Splitting into lines
+--
+-- * Taking a number of characters from the lines
+--
+-- * Joining the lines
+--
+-- * Writing back to a handle
+--
+module Benchmarks.Programs.Cut
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bgroup, bench, whnfIO)
+import System.IO (Handle, hPutStr)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Char8 as BC
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.ByteString.Lazy.Char8 as BLC
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Encoding as TL
+import qualified Data.Text.Lazy.IO as TL
+
+benchmark :: FilePath -> Handle -> Int -> Int -> IO Benchmark
+benchmark p sink from to = return $ bgroup "Cut"
+ [ bench' "String" string
+ , bench' "ByteString" byteString
+ , bench' "LazyByteString" lazyByteString
+ , bench' "Text" text
+ , bench' "LazyText" lazyText
+ , bench' "TextByteString" textByteString
+ , bench' "LazyTextByteString" lazyTextByteString
+ ]
+ where
+ bench' n s = bench n $ whnfIO (s p sink from to)
+
+string :: FilePath -> Handle -> Int -> Int -> IO ()
+string fp sink from to = do
+ s <- readFile fp
+ hPutStr sink $ cut s
+ where
+ cut = unlines . map (take (to - from) . drop from) . lines
+
+byteString :: FilePath -> Handle -> Int -> Int -> IO ()
+byteString fp sink from to = do
+ bs <- B.readFile fp
+ B.hPutStr sink $ cut bs
+ where
+ cut = BC.unlines . map (B.take (to - from) . B.drop from) . BC.lines
+
+lazyByteString :: FilePath -> Handle -> Int -> Int -> IO ()
+lazyByteString fp sink from to = do
+ bs <- BL.readFile fp
+ BL.hPutStr sink $ cut bs
+ where
+ cut = BLC.unlines . map (BL.take (to' - from') . BL.drop from') . BLC.lines
+ from' = fromIntegral from
+ to' = fromIntegral to
+
+text :: FilePath -> Handle -> Int -> Int -> IO ()
+text fp sink from to = do
+ t <- T.readFile fp
+ T.hPutStr sink $ cut t
+ where
+ cut = T.unlines . map (T.take (to - from) . T.drop from) . T.lines
+
+lazyText :: FilePath -> Handle -> Int -> Int -> IO ()
+lazyText fp sink from to = do
+ t <- TL.readFile fp
+ TL.hPutStr sink $ cut t
+ where
+ cut = TL.unlines . map (TL.take (to' - from') . TL.drop from') . TL.lines
+ from' = fromIntegral from
+ to' = fromIntegral to
+
+textByteString :: FilePath -> Handle -> Int -> Int -> IO ()
+textByteString fp sink from to = do
+ t <- T.decodeUtf8 `fmap` B.readFile fp
+ B.hPutStr sink $ T.encodeUtf8 $ cut t
+ where
+ cut = T.unlines . map (T.take (to - from) . T.drop from) . T.lines
+
+lazyTextByteString :: FilePath -> Handle -> Int -> Int -> IO ()
+lazyTextByteString fp sink from to = do
+ t <- TL.decodeUtf8 `fmap` BL.readFile fp
+ BL.hPutStr sink $ TL.encodeUtf8 $ cut t
+ where
+ cut = TL.unlines . map (TL.take (to' - from') . TL.drop from') . TL.lines
+ from' = fromIntegral from
+ to' = fromIntegral to
diff --git a/benchmarks/haskell/Benchmarks/Programs/Fold.hs b/benchmarks/haskell/Benchmarks/Programs/Fold.hs
new file mode 100644
index 0000000..c84616b
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Programs/Fold.hs
@@ -0,0 +1,68 @@
+-- | Benchmark which reformats paragraphs, like the @fmt@ unix utility.
+--
+-- Tested in this benchmark:
+--
+-- * Reading the file
+--
+-- * Splitting into paragraphs
+--
+-- * Reformatting the paragraphs to a certain line width
+--
+-- * Concatenating the results using the text builder
+--
+-- * Writing back to a handle
+--
+{-# LANGUAGE OverloadedStrings #-}
+module Benchmarks.Programs.Fold
+ ( benchmark
+ ) where
+
+import Data.List (foldl')
+import Data.List (intersperse)
+import Data.Monoid (mempty, mappend, mconcat)
+import System.IO (Handle)
+import Criterion (Benchmark, bench, whnfIO)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy.Builder as TLB
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.IO as TL
+
+benchmark :: FilePath -> Handle -> IO Benchmark
+benchmark i o = return $
+ bench "Fold" $ whnfIO $ T.readFile i >>= TL.hPutStr o . fold 80
+
+-- | We represent a paragraph by a word list
+--
+type Paragraph = [T.Text]
+
+-- | Fold a text
+--
+fold :: Int -> T.Text -> TL.Text
+fold maxWidth = TLB.toLazyText . mconcat .
+ intersperse "\n\n" . map (foldParagraph maxWidth) . paragraphs
+
+-- | Fold a paragraph
+--
+foldParagraph :: Int -> Paragraph -> TLB.Builder
+foldParagraph _ [] = mempty
+foldParagraph max' (w : ws) = fst $ foldl' go (TLB.fromText w, T.length w) ws
+ where
+ go (builder, width) word
+ | width + len + 1 <= max' =
+ (builder `mappend` " " `mappend` word', width + len + 1)
+ | otherwise =
+ (builder `mappend` "\n" `mappend` word', len)
+ where
+ word' = TLB.fromText word
+ len = T.length word
+
+-- | Divide a text into paragraphs
+--
+paragraphs :: T.Text -> [Paragraph]
+paragraphs = splitParagraphs . map T.words . T.lines
+ where
+ splitParagraphs ls = case break null ls of
+ ([], []) -> []
+ (p, []) -> [concat p]
+ (p, lr) -> concat p : splitParagraphs (dropWhile null lr)
diff --git a/benchmarks/haskell/Benchmarks/Programs/Sort.hs b/benchmarks/haskell/Benchmarks/Programs/Sort.hs
new file mode 100644
index 0000000..0176bee
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Programs/Sort.hs
@@ -0,0 +1,71 @@
+-- | This benchmark sorts the lines of a file, like the @sort@ unix utility.
+--
+-- Tested in this benchmark:
+--
+-- * Reading the file
+--
+-- * Splitting into lines
+--
+-- * Sorting the lines
+--
+-- * Joining the lines
+--
+-- * Writing back to a handle
+--
+{-# LANGUAGE OverloadedStrings #-}
+module Benchmarks.Programs.Sort
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bgroup, bench, whnfIO)
+import Data.Monoid (mconcat)
+import System.IO (Handle, hPutStr)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Char8 as BC
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.ByteString.Lazy.Char8 as BLC
+import qualified Data.List as L
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Builder as TLB
+import qualified Data.Text.Lazy.Encoding as TL
+import qualified Data.Text.Lazy.IO as TL
+
+benchmark :: FilePath -> Handle -> IO Benchmark
+benchmark i o = return $ bgroup "Sort"
+ [ bench "String" $ whnfIO $ readFile i >>= hPutStr o . string
+ , bench "ByteString" $ whnfIO $ B.readFile i >>= B.hPutStr o . byteString
+ , bench "LazyByteString" $ whnfIO $
+ BL.readFile i >>= BL.hPutStr o . lazyByteString
+ , bench "Text" $ whnfIO $ T.readFile i >>= T.hPutStr o . text
+ , bench "LazyText" $ whnfIO $ TL.readFile i >>= TL.hPutStr o . lazyText
+ , bench "TextByteString" $ whnfIO $ B.readFile i >>=
+ B.hPutStr o . T.encodeUtf8 . text . T.decodeUtf8
+ , bench "LazyTextByteString" $ whnfIO $ BL.readFile i >>=
+ BL.hPutStr o . TL.encodeUtf8 . lazyText . TL.decodeUtf8
+ , bench "TextBuilder" $ whnfIO $ B.readFile i >>=
+ BL.hPutStr o . TL.encodeUtf8 . textBuilder . T.decodeUtf8
+ ]
+
+string :: String -> String
+string = unlines . L.sort . lines
+
+byteString :: B.ByteString -> B.ByteString
+byteString = BC.unlines . L.sort . BC.lines
+
+lazyByteString :: BL.ByteString -> BL.ByteString
+lazyByteString = BLC.unlines . L.sort . BLC.lines
+
+text :: T.Text -> T.Text
+text = T.unlines . L.sort . T.lines
+
+lazyText :: TL.Text -> TL.Text
+lazyText = TL.unlines . L.sort . TL.lines
+
+-- | Text variant using a builder monoid for the final concatenation
+--
+textBuilder :: T.Text -> TL.Text
+textBuilder = TLB.toLazyText . mconcat . L.intersperse (TLB.singleton '\n') .
+ map TLB.fromText . L.sort . T.lines
diff --git a/benchmarks/haskell/Benchmarks/Programs/StripTags.hs b/benchmarks/haskell/Benchmarks/Programs/StripTags.hs
new file mode 100644
index 0000000..7fda0f3
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Programs/StripTags.hs
@@ -0,0 +1,53 @@
+-- | Program to replace HTML tags by whitespace
+--
+-- This program was originally contributed by Petr Prokhorenkov.
+--
+-- Tested in this benchmark:
+--
+-- * Reading the file
+--
+-- * Replacing text between HTML tags (<>) with whitespace
+--
+-- * Writing back to a handle
+--
+{-# OPTIONS_GHC -fspec-constr-count=5 #-}
+module Benchmarks.Programs.StripTags
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bgroup, bench, whnfIO)
+import Data.List (mapAccumL)
+import System.IO (Handle, hPutStr)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Char8 as BC
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.IO as T
+
+benchmark :: FilePath -> Handle -> IO Benchmark
+benchmark i o = return $ bgroup "StripTags"
+ [ bench "String" $ whnfIO $ readFile i >>= hPutStr o . string
+ , bench "ByteString" $ whnfIO $ B.readFile i >>= B.hPutStr o . byteString
+ , bench "Text" $ whnfIO $ T.readFile i >>= T.hPutStr o . text
+ , bench "TextByteString" $ whnfIO $
+ B.readFile i >>= B.hPutStr o . T.encodeUtf8 . text . T.decodeUtf8
+ ]
+
+string :: String -> String
+string = snd . mapAccumL step 0
+
+text :: T.Text -> T.Text
+text = snd . T.mapAccumL step 0
+
+byteString :: B.ByteString -> B.ByteString
+byteString = snd . BC.mapAccumL step 0
+
+step :: Int -> Char -> (Int, Char)
+step d c
+ | d > 0 || d' > 0 = (d', ' ')
+ | otherwise = (d', c)
+ where
+ d' = d + depth c
+ depth '>' = 1
+ depth '<' = -1
+ depth _ = 0
diff --git a/benchmarks/haskell/Benchmarks/Programs/Throughput.hs b/benchmarks/haskell/Benchmarks/Programs/Throughput.hs
new file mode 100644
index 0000000..7499039
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Programs/Throughput.hs
@@ -0,0 +1,41 @@
+-- | This benchmark simply reads and writes a file using the various string
+-- libraries. The point of it is that we can make better estimations on how
+-- much time the other benchmarks spend doing IO.
+--
+-- Note that we expect ByteStrings to be a whole lot faster, since they do not
+-- do any actual encoding/decoding here, while String and Text do have UTF-8
+-- encoding/decoding.
+--
+-- Tested in this benchmark:
+--
+-- * Reading the file
+--
+-- * Decoding/encoding between UTF-8 bytes and text (in the *ByteString variants)
+--
+-- * Writing back to a handle
+--
+module Benchmarks.Programs.Throughput
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bgroup, bench, whnfIO)
+import System.IO (Handle, hPutStr)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy.Encoding as TL
+import qualified Data.Text.Lazy.IO as TL
+
+benchmark :: FilePath -> Handle -> IO Benchmark
+benchmark fp sink = return $ bgroup "Throughput"
+ [ bench "String" $ whnfIO $ readFile fp >>= hPutStr sink
+ , bench "ByteString" $ whnfIO $ B.readFile fp >>= B.hPutStr sink
+ , bench "LazyByteString" $ whnfIO $ BL.readFile fp >>= BL.hPutStr sink
+ , bench "Text" $ whnfIO $ T.readFile fp >>= T.hPutStr sink
+ , bench "LazyText" $ whnfIO $ TL.readFile fp >>= TL.hPutStr sink
+ , bench "TextByteString" $ whnfIO $
+ B.readFile fp >>= B.hPutStr sink . T.encodeUtf8 . T.decodeUtf8
+ , bench "LazyTextByteString" $ whnfIO $
+ BL.readFile fp >>= BL.hPutStr sink . TL.encodeUtf8 . TL.decodeUtf8
+ ]
diff --git a/benchmarks/haskell/Benchmarks/Pure.hs b/benchmarks/haskell/Benchmarks/Pure.hs
new file mode 100644
index 0000000..9ae5117
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Pure.hs
@@ -0,0 +1,486 @@
+-- | Benchmarks various pure functions from the Text library
+--
+-- Tested in this benchmark:
+--
+-- * Most pure functions defined on the string types
+--
+{-# LANGUAGE BangPatterns, CPP, GADTs, MagicHash #-}
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+module Benchmarks.Pure
+ ( benchmark
+ ) where
+
+import Control.DeepSeq (NFData (..))
+import Control.Exception (evaluate)
+import Criterion (Benchmark, bgroup, bench, nf)
+import Data.Char (toLower, toUpper)
+import Data.Monoid (mappend, mempty)
+import GHC.Base (Char (..), Int (..), chr#, ord#, (+#))
+import qualified Data.ByteString.Char8 as BS
+import qualified Data.ByteString.Lazy.Char8 as BL
+import qualified Data.ByteString.UTF8 as UTF8
+import qualified Data.List as L
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Builder as TB
+import qualified Data.Text.Lazy.Encoding as TL
+
+benchmark :: String -> FilePath -> IO Benchmark
+benchmark kind fp = do
+ -- Evaluate stuff before actually running the benchmark, we don't want to
+ -- count it here.
+
+ -- ByteString A
+ bsa <- BS.readFile fp
+
+ -- Text A/B, LazyText A/B
+ ta <- evaluate $ T.decodeUtf8 bsa
+ tb <- evaluate $ T.toUpper ta
+ tla <- evaluate $ TL.fromChunks (T.chunksOf 16376 ta)
+ tlb <- evaluate $ TL.fromChunks (T.chunksOf 16376 tb)
+
+ -- ByteString B, LazyByteString A/B
+ bsb <- evaluate $ T.encodeUtf8 tb
+ bla <- evaluate $ BL.fromChunks (chunksOf 16376 bsa)
+ blb <- evaluate $ BL.fromChunks (chunksOf 16376 bsb)
+
+ -- String A/B
+ sa <- evaluate $ UTF8.toString bsa
+ sb <- evaluate $ T.unpack tb
+
+ -- Lengths
+ bsa_len <- evaluate $ BS.length bsa
+ ta_len <- evaluate $ T.length ta
+ bla_len <- evaluate $ BL.length bla
+ tla_len <- evaluate $ TL.length tla
+ sa_len <- evaluate $ L.length sa
+
+ -- Lines
+ bsl <- evaluate $ BS.lines bsa
+ bll <- evaluate $ BL.lines bla
+ tl <- evaluate $ T.lines ta
+ tll <- evaluate $ TL.lines tla
+ sl <- evaluate $ L.lines sa
+
+ return $ bgroup "Pure"
+ [ bgroup "append"
+ [ benchT $ nf (T.append tb) ta
+ , benchTL $ nf (TL.append tlb) tla
+ , benchBS $ nf (BS.append bsb) bsa
+ , benchBSL $ nf (BL.append blb) bla
+ , benchS $ nf ((++) sb) sa
+ ]
+ , bgroup "concat"
+ [ benchT $ nf T.concat tl
+ , benchTL $ nf TL.concat tll
+ , benchBS $ nf BS.concat bsl
+ , benchBSL $ nf BL.concat bll
+ , benchS $ nf L.concat sl
+ ]
+ , bgroup "cons"
+ [ benchT $ nf (T.cons c) ta
+ , benchTL $ nf (TL.cons c) tla
+ , benchBS $ nf (BS.cons c) bsa
+ , benchBSL $ nf (BL.cons c) bla
+ , benchS $ nf (c:) sa
+ ]
+ , bgroup "concatMap"
+ [ benchT $ nf (T.concatMap (T.replicate 3 . T.singleton)) ta
+ , benchTL $ nf (TL.concatMap (TL.replicate 3 . TL.singleton)) tla
+ , benchBS $ nf (BS.concatMap (BS.replicate 3)) bsa
+ , benchBSL $ nf (BL.concatMap (BL.replicate 3)) bla
+ , benchS $ nf (L.concatMap (L.replicate 3 . (:[]))) sa
+ ]
+ , bgroup "decode"
+ [ benchT $ nf T.decodeUtf8 bsa
+ , benchTL $ nf TL.decodeUtf8 bla
+ , benchBS $ nf BS.unpack bsa
+ , benchBSL $ nf BL.unpack bla
+ , benchS $ nf UTF8.toString bsa
+ ]
+ , bgroup "decode'"
+ [ benchT $ nf T.decodeUtf8' bsa
+ , benchTL $ nf TL.decodeUtf8' bla
+ ]
+ , bgroup "drop"
+ [ benchT $ nf (T.drop (ta_len `div` 3)) ta
+ , benchTL $ nf (TL.drop (tla_len `div` 3)) tla
+ , benchBS $ nf (BS.drop (bsa_len `div` 3)) bsa
+ , benchBSL $ nf (BL.drop (bla_len `div` 3)) bla
+ , benchS $ nf (L.drop (sa_len `div` 3)) sa
+ ]
+ , bgroup "encode"
+ [ benchT $ nf T.encodeUtf8 ta
+ , benchTL $ nf TL.encodeUtf8 tla
+ , benchBS $ nf BS.pack sa
+ , benchBSL $ nf BL.pack sa
+ , benchS $ nf UTF8.fromString sa
+ ]
+ , bgroup "filter"
+ [ benchT $ nf (T.filter p0) ta
+ , benchTL $ nf (TL.filter p0) tla
+ , benchBS $ nf (BS.filter p0) bsa
+ , benchBSL $ nf (BL.filter p0) bla
+ , benchS $ nf (L.filter p0) sa
+ ]
+ , bgroup "filter.filter"
+ [ benchT $ nf (T.filter p1 . T.filter p0) ta
+ , benchTL $ nf (TL.filter p1 . TL.filter p0) tla
+ , benchBS $ nf (BS.filter p1 . BS.filter p0) bsa
+ , benchBSL $ nf (BL.filter p1 . BL.filter p0) bla
+ , benchS $ nf (L.filter p1 . L.filter p0) sa
+ ]
+ , bgroup "foldl'"
+ [ benchT $ nf (T.foldl' len 0) ta
+ , benchTL $ nf (TL.foldl' len 0) tla
+ , benchBS $ nf (BS.foldl' len 0) bsa
+ , benchBSL $ nf (BL.foldl' len 0) bla
+ , benchS $ nf (L.foldl' len 0) sa
+ ]
+ , bgroup "foldr"
+ [ benchT $ nf (L.length . T.foldr (:) []) ta
+ , benchTL $ nf (L.length . TL.foldr (:) []) tla
+ , benchBS $ nf (L.length . BS.foldr (:) []) bsa
+ , benchBSL $ nf (L.length . BL.foldr (:) []) bla
+ , benchS $ nf (L.length . L.foldr (:) []) sa
+ ]
+ , bgroup "head"
+ [ benchT $ nf T.head ta
+ , benchTL $ nf TL.head tla
+ , benchBS $ nf BS.head bsa
+ , benchBSL $ nf BL.head bla
+ , benchS $ nf L.head sa
+ ]
+ , bgroup "init"
+ [ benchT $ nf T.init ta
+ , benchTL $ nf TL.init tla
+ , benchBS $ nf BS.init bsa
+ , benchBSL $ nf BL.init bla
+ , benchS $ nf L.init sa
+ ]
+ , bgroup "intercalate"
+ [ benchT $ nf (T.intercalate tsw) tl
+ , benchTL $ nf (TL.intercalate tlw) tll
+ , benchBS $ nf (BS.intercalate bsw) bsl
+ , benchBSL $ nf (BL.intercalate blw) bll
+ , benchS $ nf (L.intercalate lw) sl
+ ]
+ , bgroup "intersperse"
+ [ benchT $ nf (T.intersperse c) ta
+ , benchTL $ nf (TL.intersperse c) tla
+ , benchBS $ nf (BS.intersperse c) bsa
+ , benchBSL $ nf (BL.intersperse c) bla
+ , benchS $ nf (L.intersperse c) sa
+ ]
+ , bgroup "isInfixOf"
+ [ benchT $ nf (T.isInfixOf tsw) ta
+ , benchTL $ nf (TL.isInfixOf tlw) tla
+ , benchBS $ nf (BS.isInfixOf bsw) bsa
+ -- no isInfixOf for lazy bytestrings
+ , benchS $ nf (L.isInfixOf lw) sa
+ ]
+ , bgroup "last"
+ [ benchT $ nf T.last ta
+ , benchTL $ nf TL.last tla
+ , benchBS $ nf BS.last bsa
+ , benchBSL $ nf BL.last bla
+ , benchS $ nf L.last sa
+ ]
+ , bgroup "map"
+ [ benchT $ nf (T.map f) ta
+ , benchTL $ nf (TL.map f) tla
+ , benchBS $ nf (BS.map f) bsa
+ , benchBSL $ nf (BL.map f) bla
+ , benchS $ nf (L.map f) sa
+ ]
+ , bgroup "mapAccumL"
+ [ benchT $ nf (T.mapAccumL g 0) ta
+ , benchTL $ nf (TL.mapAccumL g 0) tla
+ , benchBS $ nf (BS.mapAccumL g 0) bsa
+ , benchBSL $ nf (BL.mapAccumL g 0) bla
+ , benchS $ nf (L.mapAccumL g 0) sa
+ ]
+ , bgroup "mapAccumR"
+ [ benchT $ nf (T.mapAccumR g 0) ta
+ , benchTL $ nf (TL.mapAccumR g 0) tla
+ , benchBS $ nf (BS.mapAccumR g 0) bsa
+ , benchBSL $ nf (BL.mapAccumR g 0) bla
+ , benchS $ nf (L.mapAccumR g 0) sa
+ ]
+ , bgroup "map.map"
+ [ benchT $ nf (T.map f . T.map f) ta
+ , benchTL $ nf (TL.map f . TL.map f) tla
+ , benchBS $ nf (BS.map f . BS.map f) bsa
+ , benchBSL $ nf (BL.map f . BL.map f) bla
+ , benchS $ nf (L.map f . L.map f) sa
+ ]
+ , bgroup "replicate char"
+ [ benchT $ nf (T.replicate bsa_len) (T.singleton c)
+ , benchTL $ nf (TL.replicate (fromIntegral bsa_len)) (TL.singleton c)
+ , benchBS $ nf (BS.replicate bsa_len) c
+ , benchBSL $ nf (BL.replicate (fromIntegral bsa_len)) c
+ , benchS $ nf (L.replicate bsa_len) c
+ ]
+ , bgroup "replicate string"
+ [ benchT $ nf (T.replicate (bsa_len `div` T.length tsw)) tsw
+ , benchTL $ nf (TL.replicate (fromIntegral bsa_len `div` TL.length tlw)) tlw
+ , benchS $ nf (replicat (bsa_len `div` T.length tsw)) lw
+ ]
+ , bgroup "reverse"
+ [ benchT $ nf T.reverse ta
+ , benchTL $ nf TL.reverse tla
+ , benchBS $ nf BS.reverse bsa
+ , benchBSL $ nf BL.reverse bla
+ , benchS $ nf L.reverse sa
+ ]
+ , bgroup "take"
+ [ benchT $ nf (T.take (ta_len `div` 3)) ta
+ , benchTL $ nf (TL.take (tla_len `div` 3)) tla
+ , benchBS $ nf (BS.take (bsa_len `div` 3)) bsa
+ , benchBSL $ nf (BL.take (bla_len `div` 3)) bla
+ , benchS $ nf (L.take (sa_len `div` 3)) sa
+ ]
+ , bgroup "tail"
+ [ benchT $ nf T.tail ta
+ , benchTL $ nf TL.tail tla
+ , benchBS $ nf BS.tail bsa
+ , benchBSL $ nf BL.tail bla
+ , benchS $ nf L.tail sa
+ ]
+ , bgroup "toLower"
+ [ benchT $ nf T.toLower ta
+ , benchTL $ nf TL.toLower tla
+ , benchBS $ nf (BS.map toLower) bsa
+ , benchBSL $ nf (BL.map toLower) bla
+ , benchS $ nf (L.map toLower) sa
+ ]
+ , bgroup "toUpper"
+ [ benchT $ nf T.toUpper ta
+ , benchTL $ nf TL.toUpper tla
+ , benchBS $ nf (BS.map toUpper) bsa
+ , benchBSL $ nf (BL.map toUpper) bla
+ , benchS $ nf (L.map toUpper) sa
+ ]
+ , bgroup "uncons"
+ [ benchT $ nf T.uncons ta
+ , benchTL $ nf TL.uncons tla
+ , benchBS $ nf BS.uncons bsa
+ , benchBSL $ nf BL.uncons bla
+ , benchS $ nf L.uncons sa
+ ]
+ , bgroup "words"
+ [ benchT $ nf T.words ta
+ , benchTL $ nf TL.words tla
+ , benchBS $ nf BS.words bsa
+ , benchBSL $ nf BL.words bla
+ , benchS $ nf L.words sa
+ ]
+ , bgroup "zipWith"
+ [ benchT $ nf (T.zipWith min tb) ta
+ , benchTL $ nf (TL.zipWith min tlb) tla
+ , benchBS $ nf (BS.zipWith min bsb) bsa
+ , benchBSL $ nf (BL.zipWith min blb) bla
+ , benchS $ nf (L.zipWith min sb) sa
+ ]
+ , bgroup "length"
+ [ bgroup "cons"
+ [ benchT $ nf (T.length . T.cons c) ta
+ , benchTL $ nf (TL.length . TL.cons c) tla
+ , benchBS $ nf (BS.length . BS.cons c) bsa
+ , benchBSL $ nf (BL.length . BL.cons c) bla
+ , benchS $ nf (L.length . (:) c) sa
+ ]
+ , bgroup "decode"
+ [ benchT $ nf (T.length . T.decodeUtf8) bsa
+ , benchTL $ nf (TL.length . TL.decodeUtf8) bla
+ , benchBS $ nf (L.length . BS.unpack) bsa
+ , benchBSL $ nf (L.length . BL.unpack) bla
+ , bench "StringUTF8" $ nf (L.length . UTF8.toString) bsa
+ ]
+ , bgroup "drop"
+ [ benchT $ nf (T.length . T.drop (ta_len `div` 3)) ta
+ , benchTL $ nf (TL.length . TL.drop (tla_len `div` 3)) tla
+ , benchBS $ nf (BS.length . BS.drop (bsa_len `div` 3)) bsa
+ , benchBSL $ nf (BL.length . BL.drop (bla_len `div` 3)) bla
+ , benchS $ nf (L.length . L.drop (sa_len `div` 3)) sa
+ ]
+ , bgroup "filter"
+ [ benchT $ nf (T.length . T.filter p0) ta
+ , benchTL $ nf (TL.length . TL.filter p0) tla
+ , benchBS $ nf (BS.length . BS.filter p0) bsa
+ , benchBSL $ nf (BL.length . BL.filter p0) bla
+ , benchS $ nf (L.length . L.filter p0) sa
+ ]
+ , bgroup "filter.filter"
+ [ benchT $ nf (T.length . T.filter p1 . T.filter p0) ta
+ , benchTL $ nf (TL.length . TL.filter p1 . TL.filter p0) tla
+ , benchBS $ nf (BS.length . BS.filter p1 . BS.filter p0) bsa
+ , benchBSL $ nf (BL.length . BL.filter p1 . BL.filter p0) bla
+ , benchS $ nf (L.length . L.filter p1 . L.filter p0) sa
+ ]
+ , bgroup "init"
+ [ benchT $ nf (T.length . T.init) ta
+ , benchTL $ nf (TL.length . TL.init) tla
+ , benchBS $ nf (BS.length . BS.init) bsa
+ , benchBSL $ nf (BL.length . BL.init) bla
+ , benchS $ nf (L.length . L.init) sa
+ ]
+ , bgroup "intercalate"
+ [ benchT $ nf (T.length . T.intercalate tsw) tl
+ , benchTL $ nf (TL.length . TL.intercalate tlw) tll
+ , benchBS $ nf (BS.length . BS.intercalate bsw) bsl
+ , benchBSL $ nf (BL.length . BL.intercalate blw) bll
+ , benchS $ nf (L.length . L.intercalate lw) sl
+ ]
+ , bgroup "intersperse"
+ [ benchT $ nf (T.length . T.intersperse c) ta
+ , benchTL $ nf (TL.length . TL.intersperse c) tla
+ , benchBS $ nf (BS.length . BS.intersperse c) bsa
+ , benchBSL $ nf (BL.length . BL.intersperse c) bla
+ , benchS $ nf (L.length . L.intersperse c) sa
+ ]
+ , bgroup "map"
+ [ benchT $ nf (T.length . T.map f) ta
+ , benchTL $ nf (TL.length . TL.map f) tla
+ , benchBS $ nf (BS.length . BS.map f) bsa
+ , benchBSL $ nf (BL.length . BL.map f) bla
+ , benchS $ nf (L.length . L.map f) sa
+ ]
+ , bgroup "map.map"
+ [ benchT $ nf (T.length . T.map f . T.map f) ta
+ , benchTL $ nf (TL.length . TL.map f . TL.map f) tla
+ , benchBS $ nf (BS.length . BS.map f . BS.map f) bsa
+ , benchS $ nf (L.length . L.map f . L.map f) sa
+ ]
+ , bgroup "replicate char"
+ [ benchT $ nf (T.length . T.replicate bsa_len) (T.singleton c)
+ , benchTL $ nf (TL.length . TL.replicate (fromIntegral bsa_len)) (TL.singleton c)
+ , benchBS $ nf (BS.length . BS.replicate bsa_len) c
+ , benchBSL $ nf (BL.length . BL.replicate (fromIntegral bsa_len)) c
+ , benchS $ nf (L.length . L.replicate bsa_len) c
+ ]
+ , bgroup "replicate string"
+ [ benchT $ nf (T.length . T.replicate (bsa_len `div` T.length tsw)) tsw
+ , benchTL $ nf (TL.length . TL.replicate (fromIntegral bsa_len `div` TL.length tlw)) tlw
+ , benchS $ nf (L.length . replicat (bsa_len `div` T.length tsw)) lw
+ ]
+ , bgroup "take"
+ [ benchT $ nf (T.length . T.take (ta_len `div` 3)) ta
+ , benchTL $ nf (TL.length . TL.take (tla_len `div` 3)) tla
+ , benchBS $ nf (BS.length . BS.take (bsa_len `div` 3)) bsa
+ , benchBSL $ nf (BL.length . BL.take (bla_len `div` 3)) bla
+ , benchS $ nf (L.length . L.take (sa_len `div` 3)) sa
+ ]
+ , bgroup "tail"
+ [ benchT $ nf (T.length . T.tail) ta
+ , benchTL $ nf (TL.length . TL.tail) tla
+ , benchBS $ nf (BS.length . BS.tail) bsa
+ , benchBSL $ nf (BL.length . BL.tail) bla
+ , benchS $ nf (L.length . L.tail) sa
+ ]
+ , bgroup "toLower"
+ [ benchT $ nf (T.length . T.toLower) ta
+ , benchTL $ nf (TL.length . TL.toLower) tla
+ , benchBS $ nf (BS.length . BS.map toLower) bsa
+ , benchBSL $ nf (BL.length . BL.map toLower) bla
+ , benchS $ nf (L.length . L.map toLower) sa
+ ]
+ , bgroup "toUpper"
+ [ benchT $ nf (T.length . T.toUpper) ta
+ , benchTL $ nf (TL.length . TL.toUpper) tla
+ , benchBS $ nf (BS.length . BS.map toUpper) bsa
+ , benchBSL $ nf (BL.length . BL.map toUpper) bla
+ , benchS $ nf (L.length . L.map toUpper) sa
+ ]
+ , bgroup "words"
+ [ benchT $ nf (L.length . T.words) ta
+ , benchTL $ nf (L.length . TL.words) tla
+ , benchBS $ nf (L.length . BS.words) bsa
+ , benchBSL $ nf (L.length . BL.words) bla
+ , benchS $ nf (L.length . L.words) sa
+ ]
+ , bgroup "zipWith"
+ [ benchT $ nf (T.length . T.zipWith min tb) ta
+ , benchTL $ nf (TL.length . TL.zipWith min tlb) tla
+ , benchBS $ nf (L.length . BS.zipWith min bsb) bsa
+ , benchBSL $ nf (L.length . BL.zipWith min blb) bla
+ , benchS $ nf (L.length . L.zipWith min sb) sa
+ ]
+ ]
+ , bgroup "Builder"
+ [ bench "mappend char" $ nf (TL.length . TB.toLazyText . mappendNChar 'a') 10000
+ , bench "mappend 8 char" $ nf (TL.length . TB.toLazyText . mappend8Char) 'a'
+ , bench "mappend text" $ nf (TL.length . TB.toLazyText . mappendNText short) 10000
+ ]
+ ]
+ where
+ benchS = bench ("String+" ++ kind)
+ benchT = bench ("Text+" ++ kind)
+ benchTL = bench ("LazyText+" ++ kind)
+ benchBS = bench ("ByteString+" ++ kind)
+ benchBSL = bench ("LazyByteString+" ++ kind)
+
+ c = 'й'
+ p0 = (== c)
+ p1 = (/= 'д')
+ lw = "право"
+ bsw = UTF8.fromString lw
+ blw = BL.fromChunks [bsw]
+ tsw = T.pack lw
+ tlw = TL.fromChunks [tsw]
+ f (C# c#) = C# (chr# (ord# c# +# 1#))
+ g (I# i#) (C# c#) = (I# (i# +# 1#), C# (chr# (ord# c# +# i#)))
+ len l _ = l + (1::Int)
+ replicat n = concat . L.replicate n
+ short = T.pack "short"
+
+#if !MIN_VERSION_bytestring(0,10,0)
+instance NFData BS.ByteString
+
+instance NFData BL.ByteString where
+ rnf BL.Empty = ()
+ rnf (BL.Chunk _ ts) = rnf ts
+#endif
+
+data B where
+ B :: NFData a => a -> B
+
+instance NFData B where
+ rnf (B b) = rnf b
+
+-- | Split a bytestring in chunks
+--
+chunksOf :: Int -> BS.ByteString -> [BS.ByteString]
+chunksOf k = go
+ where
+ go t = case BS.splitAt k t of
+ (a,b) | BS.null a -> []
+ | otherwise -> a : go b
+
+-- | Append a character n times
+--
+mappendNChar :: Char -> Int -> TB.Builder
+mappendNChar c n = go 0
+ where
+ go i
+ | i < n = TB.singleton c `mappend` go (i+1)
+ | otherwise = mempty
+
+-- | Gives more opportunity for inlining and elimination of unnecessary
+-- bounds checks.
+--
+mappend8Char :: Char -> TB.Builder
+mappend8Char c = TB.singleton c `mappend` TB.singleton c `mappend`
+ TB.singleton c `mappend` TB.singleton c `mappend`
+ TB.singleton c `mappend` TB.singleton c `mappend`
+ TB.singleton c `mappend` TB.singleton c
+
+-- | Append a text N times
+--
+mappendNText :: T.Text -> Int -> TB.Builder
+mappendNText t n = go 0
+ where
+ go i
+ | i < n = TB.fromText t `mappend` go (i+1)
+ | otherwise = mempty
diff --git a/benchmarks/haskell/Benchmarks/ReadNumbers.hs b/benchmarks/haskell/Benchmarks/ReadNumbers.hs
new file mode 100644
index 0000000..9d78ffb
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/ReadNumbers.hs
@@ -0,0 +1,93 @@
+-- | Read numbers from a file with just a number on each line, find the
+-- minimum of those numbers. The file contains different kinds of numbers:
+--
+-- * Decimals
+--
+-- * Hexadecimals
+--
+-- * Floating point numbers
+--
+-- * Floating point numbers in scientific notation
+--
+-- The different benchmarks will only take into account the values they can
+-- parse.
+--
+-- Tested in this benchmark:
+--
+-- * Lexing/parsing of different numerical types
+--
+module Benchmarks.ReadNumbers
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bgroup, bench, whnf)
+import Data.List (foldl')
+import Numeric (readDec, readFloat, readHex)
+import qualified Data.ByteString.Char8 as B
+import qualified Data.ByteString.Lazy.Char8 as BL
+import qualified Data.ByteString.Lex.Fractional as B
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.IO as TL
+import qualified Data.Text.Lazy.Read as TL
+import qualified Data.Text.Read as T
+
+benchmark :: FilePath -> IO Benchmark
+benchmark fp = do
+ -- Read all files into lines: string, text, lazy text, bytestring, lazy
+ -- bytestring
+ s <- lines `fmap` readFile fp
+ t <- T.lines `fmap` T.readFile fp
+ tl <- TL.lines `fmap` TL.readFile fp
+ b <- B.lines `fmap` B.readFile fp
+ bl <- BL.lines `fmap` BL.readFile fp
+ return $ bgroup "ReadNumbers"
+ [ bench "DecimalString" $ whnf (int . string readDec) s
+ , bench "HexadecimalString" $ whnf (int . string readHex) s
+ , bench "DoubleString" $ whnf (double . string readFloat) s
+
+ , bench "DecimalText" $ whnf (int . text (T.signed T.decimal)) t
+ , bench "HexadecimalText" $ whnf (int . text (T.signed T.hexadecimal)) t
+ , bench "DoubleText" $ whnf (double . text T.double) t
+ , bench "RationalText" $ whnf (double . text T.rational) t
+
+ , bench "DecimalLazyText" $
+ whnf (int . text (TL.signed TL.decimal)) tl
+ , bench "HexadecimalLazyText" $
+ whnf (int . text (TL.signed TL.hexadecimal)) tl
+ , bench "DoubleLazyText" $
+ whnf (double . text TL.double) tl
+ , bench "RationalLazyText" $
+ whnf (double . text TL.rational) tl
+
+ , bench "DecimalByteString" $ whnf (int . byteString B.readInt) b
+ , bench "DoubleByteString" $ whnf (double . byteString B.readDecimal) b
+
+ , bench "DecimalLazyByteString" $
+ whnf (int . byteString BL.readInt) bl
+ ]
+ where
+ -- Used for fixing types
+ int :: Int -> Int
+ int = id
+ double :: Double -> Double
+ double = id
+
+string :: (Ord a, Num a) => (t -> [(a, t)]) -> [t] -> a
+string reader = foldl' go 1000000
+ where
+ go z t = case reader t of [(n, _)] -> min n z
+ _ -> z
+
+text :: (Ord a, Num a) => (t -> Either String (a,t)) -> [t] -> a
+text reader = foldl' go 1000000
+ where
+ go z t = case reader t of Left _ -> z
+ Right (n, _) -> min n z
+
+byteString :: (Ord a, Num a) => (t -> Maybe (a,t)) -> [t] -> a
+byteString reader = foldl' go 1000000
+ where
+ go z t = case reader t of Nothing -> z
+ Just (n, _) -> min n z
diff --git a/benchmarks/haskell/Benchmarks/Replace.hs b/benchmarks/haskell/Benchmarks/Replace.hs
new file mode 100644
index 0000000..f2dcf32
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Replace.hs
@@ -0,0 +1,43 @@
+{-# LANGUAGE BangPatterns #-}
+-- | Replace a string by another string
+--
+-- Tested in this benchmark:
+--
+-- * Search and replace of a pattern in a text
+--
+module Benchmarks.Replace
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bgroup, bench, nf)
+import qualified Data.ByteString.Char8 as B
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.ByteString.Lazy.Search as BL
+import qualified Data.ByteString.Search as B
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Encoding as TL
+import qualified Data.Text.Lazy.IO as TL
+
+benchmark :: FilePath -> String -> String -> IO Benchmark
+benchmark fp pat sub = do
+ tl <- TL.readFile fp
+ bl <- BL.readFile fp
+ let !t = TL.toStrict tl
+ !b = T.encodeUtf8 t
+ return $ bgroup "Replace" [
+ bench "Text" $ nf (T.length . T.replace tpat tsub) t
+ , bench "ByteString" $ nf (BL.length . B.replace bpat bsub) b
+ , bench "LazyText" $ nf (TL.length . TL.replace tlpat tlsub) tl
+ , bench "LazyByteString" $ nf (BL.length . BL.replace blpat blsub) bl
+ ]
+ where
+ tpat = T.pack pat
+ tsub = T.pack sub
+ tlpat = TL.pack pat
+ tlsub = TL.pack sub
+ bpat = T.encodeUtf8 tpat
+ bsub = T.encodeUtf8 tsub
+ blpat = B.concat $ BL.toChunks $ TL.encodeUtf8 tlpat
+ blsub = B.concat $ BL.toChunks $ TL.encodeUtf8 tlsub
diff --git a/benchmarks/haskell/Benchmarks/Search.hs b/benchmarks/haskell/Benchmarks/Search.hs
new file mode 100644
index 0000000..93f1e05
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Search.hs
@@ -0,0 +1,48 @@
+-- | Search for a pattern in a file, find the number of occurrences
+--
+-- Tested in this benchmark:
+--
+-- * Searching all occurrences of a pattern using library routines
+--
+module Benchmarks.Search
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bench, bgroup, whnf)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.ByteString.Lazy.Search as BL
+import qualified Data.ByteString.Search as B
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.IO as TL
+
+benchmark :: FilePath -> T.Text -> IO Benchmark
+benchmark fp needleT = do
+ b <- B.readFile fp
+ bl <- BL.readFile fp
+ t <- T.readFile fp
+ tl <- TL.readFile fp
+ return $ bgroup "FileIndices"
+ [ bench "ByteString" $ whnf (byteString needleB) b
+ , bench "LazyByteString" $ whnf (lazyByteString needleB) bl
+ , bench "Text" $ whnf (text needleT) t
+ , bench "LazyText" $ whnf (lazyText needleTL) tl
+ ]
+ where
+ needleB = T.encodeUtf8 needleT
+ needleTL = TL.fromChunks [needleT]
+
+byteString :: B.ByteString -> B.ByteString -> Int
+byteString needle = length . B.indices needle
+
+lazyByteString :: B.ByteString -> BL.ByteString -> Int
+lazyByteString needle = length . BL.indices needle
+
+text :: T.Text -> T.Text -> Int
+text = T.count
+
+lazyText :: TL.Text -> TL.Text -> Int
+lazyText needle = fromIntegral . TL.count needle
diff --git a/benchmarks/haskell/Benchmarks/Stream.hs b/benchmarks/haskell/Benchmarks/Stream.hs
new file mode 100644
index 0000000..232c8b5
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/Stream.hs
@@ -0,0 +1,104 @@
+-- | This module contains a number of benchmarks for the different streaming
+-- functions
+--
+-- Tested in this benchmark:
+--
+-- * Most streaming functions
+--
+{-# LANGUAGE BangPatterns #-}
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+module Benchmarks.Stream
+ ( benchmark
+ ) where
+
+import Control.DeepSeq (NFData (..))
+import Criterion (Benchmark, bgroup, bench, nf)
+import Data.Text.Internal.Fusion.Types (Step (..), Stream (..))
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.Encoding.Error as E
+import qualified Data.Text.Internal.Encoding.Fusion as T
+import qualified Data.Text.Internal.Encoding.Fusion.Common as F
+import qualified Data.Text.Internal.Fusion as F
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy.Encoding as TL
+import qualified Data.Text.Internal.Lazy.Encoding.Fusion as TL
+import qualified Data.Text.Internal.Lazy.Fusion as FL
+import qualified Data.Text.Lazy.IO as TL
+
+instance NFData a => NFData (Stream a) where
+ -- Currently, this implementation does not force evaluation of the size hint
+ rnf (Stream next s0 _) = go s0
+ where
+ go !s = case next s of
+ Done -> ()
+ Skip s' -> go s'
+ Yield x s' -> rnf x `seq` go s'
+
+benchmark :: FilePath -> IO Benchmark
+benchmark fp = do
+ -- Different formats
+ t <- T.readFile fp
+ let !utf8 = T.encodeUtf8 t
+ !utf16le = T.encodeUtf16LE t
+ !utf16be = T.encodeUtf16BE t
+ !utf32le = T.encodeUtf32LE t
+ !utf32be = T.encodeUtf32BE t
+
+ -- Once again for the lazy variants
+ tl <- TL.readFile fp
+ let !utf8L = TL.encodeUtf8 tl
+ !utf16leL = TL.encodeUtf16LE tl
+ !utf16beL = TL.encodeUtf16BE tl
+ !utf32leL = TL.encodeUtf32LE tl
+ !utf32beL = TL.encodeUtf32BE tl
+
+ -- For the functions which operate on streams
+ let !s = F.stream t
+
+ return $ bgroup "Stream"
+
+ -- Fusion
+ [ bgroup "stream" $
+ [ bench "Text" $ nf F.stream t
+ , bench "LazyText" $ nf FL.stream tl
+ ]
+ -- must perform exactly the same as stream above due to
+ -- stream/unstream (i.e. stream after unstream) fusion
+ , bgroup "stream-fusion" $
+ [ bench "Text" $ nf (F.stream . F.unstream . F.stream) t
+ , bench "LazyText" $ nf (FL.stream . FL.unstream . FL.stream) tl
+ ]
+ -- measure the overhead of unstream after stream
+ , bgroup "stream-unstream" $
+ [ bench "Text" $ nf (F.unstream . F.stream) t
+ , bench "LazyText" $ nf (FL.unstream . FL.stream) tl
+ ]
+
+ -- Encoding.Fusion
+ , bgroup "streamUtf8"
+ [ bench "Text" $ nf (T.streamUtf8 E.lenientDecode) utf8
+ , bench "LazyText" $ nf (TL.streamUtf8 E.lenientDecode) utf8L
+ ]
+ , bgroup "streamUtf16LE"
+ [ bench "Text" $ nf (T.streamUtf16LE E.lenientDecode) utf16le
+ , bench "LazyText" $ nf (TL.streamUtf16LE E.lenientDecode) utf16leL
+ ]
+ , bgroup "streamUtf16BE"
+ [ bench "Text" $ nf (T.streamUtf16BE E.lenientDecode) utf16be
+ , bench "LazyText" $ nf (TL.streamUtf16BE E.lenientDecode) utf16beL
+ ]
+ , bgroup "streamUtf32LE"
+ [ bench "Text" $ nf (T.streamUtf32LE E.lenientDecode) utf32le
+ , bench "LazyText" $ nf (TL.streamUtf32LE E.lenientDecode) utf32leL
+ ]
+ , bgroup "streamUtf32BE"
+ [ bench "Text" $ nf (T.streamUtf32BE E.lenientDecode) utf32be
+ , bench "LazyText" $ nf (TL.streamUtf32BE E.lenientDecode) utf32beL
+ ]
+
+ -- Encoding.Fusion.Common
+ , bench "restreamUtf16LE" $ nf F.restreamUtf16LE s
+ , bench "restreamUtf16BE" $ nf F.restreamUtf16BE s
+ , bench "restreamUtf32LE" $ nf F.restreamUtf32LE s
+ , bench "restreamUtf32BE" $ nf F.restreamUtf32BE s
+ ]
diff --git a/benchmarks/haskell/Benchmarks/WordFrequencies.hs b/benchmarks/haskell/Benchmarks/WordFrequencies.hs
new file mode 100644
index 0000000..698103d
--- /dev/null
+++ b/benchmarks/haskell/Benchmarks/WordFrequencies.hs
@@ -0,0 +1,36 @@
+-- | A word frequency count using the different string types
+--
+-- Tested in this benchmark:
+--
+-- * Splitting into words
+--
+-- * Converting to lowercase
+--
+-- * Comparing: Eq/Ord instances
+--
+module Benchmarks.WordFrequencies
+ ( benchmark
+ ) where
+
+import Criterion (Benchmark, bench, bgroup, whnf)
+import Data.Char (toLower)
+import Data.List (foldl')
+import Data.Map (Map)
+import qualified Data.ByteString.Char8 as B
+import qualified Data.Map as M
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+
+benchmark :: FilePath -> IO Benchmark
+benchmark fp = do
+ s <- readFile fp
+ b <- B.readFile fp
+ t <- T.readFile fp
+ return $ bgroup "WordFrequencies"
+ [ bench "String" $ whnf (frequencies . words . map toLower) s
+ , bench "ByteString" $ whnf (frequencies . B.words . B.map toLower) b
+ , bench "Text" $ whnf (frequencies . T.words . T.toLower) t
+ ]
+
+frequencies :: Ord a => [a] -> Map a Int
+frequencies = foldl' (\m k -> M.insertWith (+) k 1 m) M.empty
diff --git a/benchmarks/haskell/Multilang.hs b/benchmarks/haskell/Multilang.hs
new file mode 100644
index 0000000..a861afa
--- /dev/null
+++ b/benchmarks/haskell/Multilang.hs
@@ -0,0 +1,32 @@
+{-# LANGUAGE BangPatterns, OverloadedStrings, RankNTypes #-}
+
+module Main (
+ main
+ ) where
+
+import Control.Monad (forM_)
+import qualified Data.ByteString as B
+import qualified Data.Text as Text
+import Data.Text.Encoding (decodeUtf8)
+import Data.Text (Text)
+import System.IO (hFlush, stdout)
+import Timer (timer)
+
+type BM = Text -> ()
+
+bm :: forall a. (Text -> a) -> BM
+bm f t = f t `seq` ()
+
+benchmarks :: [(String, Text.Text -> ())]
+benchmarks = [
+ ("find_first", bm $ Text.isInfixOf "en:Benin")
+ , ("find_index", bm $ Text.findIndex (=='c'))
+ ]
+
+main :: IO ()
+main = do
+ !contents <- decodeUtf8 `fmap` B.readFile "../tests/text-test-data/yiwiki.xml"
+ forM_ benchmarks $ \(name, bmark) -> do
+ putStr $ name ++ " "
+ hFlush stdout
+ putStrLn =<< (timer 100 contents bmark)
diff --git a/benchmarks/haskell/Timer.hs b/benchmarks/haskell/Timer.hs
new file mode 100644
index 0000000..ac09616
--- /dev/null
+++ b/benchmarks/haskell/Timer.hs
@@ -0,0 +1,30 @@
+{-# LANGUAGE BangPatterns #-}
+
+module Timer (timer) where
+
+import Control.Exception (evaluate)
+import Data.Time.Clock.POSIX (getPOSIXTime)
+import GHC.Float (FFFormat(..), formatRealFloat)
+
+ickyRound :: Int -> Double -> String
+ickyRound k = formatRealFloat FFFixed (Just k)
+
+timer :: Int -> a -> (a -> b) -> IO String
+timer count a0 f = do
+ let loop !k !fastest
+ | k <= 0 = return fastest
+ | otherwise = do
+ start <- getPOSIXTime
+ let inner a i
+ | i <= 0 = return ()
+ | otherwise = evaluate (f a) >> inner a (i-1)
+ inner a0 count
+ end <- getPOSIXTime
+ let elapsed = end - start
+ loop (k-1) (min fastest (elapsed / fromIntegral count))
+ t <- loop (3::Int) 1e300
+ let log10 x = log x / log 10
+ ft = realToFrac t
+ prec = round (log10 (fromIntegral count) - log10 ft)
+ return $! ickyRound prec ft
+{-# NOINLINE timer #-}
diff --git a/benchmarks/python/cut.py b/benchmarks/python/cut.py
new file mode 100644
index 0000000..fbfc7b7
--- /dev/null
+++ b/benchmarks/python/cut.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+
+import utils, sys, codecs
+
+def cut(filename, l, r):
+ content = open(filename, encoding='utf-8')
+ for line in content:
+ print(line[l:r])
+
+for f in sys.argv[1:]:
+ t = utils.benchmark(lambda: cut(f, 20, 40))
+ sys.stderr.write('{0}: {1}\n'.format(f, t))
diff --git a/benchmarks/python/multilang.py b/benchmarks/python/multilang.py
new file mode 100755
index 0000000..f286854
--- /dev/null
+++ b/benchmarks/python/multilang.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+
+import math
+import sys
+import time
+
+def find_first():
+ cf = contents.find
+ return timer(lambda: cf("en:Benin"))
+
+def timer(f, count=100):
+ a = 1e300
+ def g():
+ return
+ for i in xrange(3):
+ start = time.time()
+ for j in xrange(count):
+ g()
+ a = min(a, (time.time() - start) / count)
+
+ b = 1e300
+ for i in xrange(3):
+ start = time.time()
+ for j in xrange(count):
+ f()
+ b = min(b, (time.time() - start) / count)
+
+ return round(b - a, int(round(math.log(count, 10) - math.log(b - a, 10))))
+
+contents = open('../../tests/text-test-data/yiwiki.xml', 'r').read()
+contents = contents.decode('utf-8')
+
+benchmarks = (
+ find_first,
+ )
+
+to_run = sys.argv[1:]
+bms = []
+if to_run:
+ for r in to_run:
+ for b in benchmarks:
+ if b.__name__.startswith(r):
+ bms.append(b)
+else:
+ bms = benchmarks
+
+for b in bms:
+ sys.stdout.write(b.__name__ + ' ')
+ sys.stdout.flush()
+ print b()
diff --git a/benchmarks/python/sort.py b/benchmarks/python/sort.py
new file mode 100644
index 0000000..2c8b350
--- /dev/null
+++ b/benchmarks/python/sort.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+
+import utils, sys, codecs
+
+def sort(filename):
+ content = open(filename, encoding='utf-8').read()
+ lines = content.splitlines()
+ lines.sort()
+ print('\n'.join(lines))
+
+for f in sys.argv[1:]:
+ t = utils.benchmark(lambda: sort(f))
+ sys.stderr.write('{0}: {1}\n'.format(f, t))
diff --git a/benchmarks/python/strip_tags.py b/benchmarks/python/strip_tags.py
new file mode 100644
index 0000000..8f144ba
--- /dev/null
+++ b/benchmarks/python/strip_tags.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+import utils, sys
+
+def strip_tags(filename):
+ string = open(filename, encoding='utf-8').read()
+
+ d = 0
+ out = []
+
+ for c in string:
+ if c == '<': d += 1
+
+ if d > 0:
+ out += ' '
+ else:
+ out += c
+
+ if c == '>': d -= 1
+
+ print(''.join(out))
+
+for f in sys.argv[1:]:
+ t = utils.benchmark(lambda: strip_tags(f))
+ sys.stderr.write('{0}: {1}\n'.format(f, t))
diff --git a/benchmarks/python/utils.py b/benchmarks/python/utils.py
new file mode 100755
index 0000000..5651e9b
--- /dev/null
+++ b/benchmarks/python/utils.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+
+import sys, time
+
+def benchmark_once(f):
+ start = time.time()
+ f()
+ end = time.time()
+ return end - start
+
+def benchmark(f):
+ runs = 100
+ total = 0.0
+ for i in range(runs):
+ result = benchmark_once(f)
+ sys.stderr.write('Run {0}: {1}\n'.format(i, result))
+ total += result
+ return total / runs
diff --git a/benchmarks/ruby/cut.rb b/benchmarks/ruby/cut.rb
new file mode 100644
index 0000000..d207d4d
--- /dev/null
+++ b/benchmarks/ruby/cut.rb
@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+
+require './utils.rb'
+
+def cut(filename, l, r)
+ File.open(filename, 'r:utf-8') do |file|
+ file.each_line do |line|
+ puts line[l, r - l]
+ end
+ end
+end
+
+ARGV.each do |f|
+ t = benchmark { cut(f, 20, 40) }
+ STDERR.puts "#{f}: #{t}"
+end
diff --git a/benchmarks/ruby/fold.rb b/benchmarks/ruby/fold.rb
new file mode 100644
index 0000000..5b6f50b
--- /dev/null
+++ b/benchmarks/ruby/fold.rb
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+
+require './utils.rb'
+
+def fold(filename, max_width)
+ File.open(filename, 'r:utf-8') do |file|
+ # Words in this paragraph
+ paragraph = []
+
+ file.each_line do |line|
+ # If we encounter an empty line, we reformat and dump the current
+ # paragraph
+ if line.strip.empty?
+ puts fold_paragraph(paragraph, max_width)
+ puts
+ paragraph = []
+ # Otherwise, we append the words found in the line to the paragraph
+ else
+ paragraph.concat line.split
+ end
+ end
+
+ # Last paragraph
+ puts fold_paragraph(paragraph, max_width) unless paragraph.empty?
+ end
+end
+
+# Fold a single paragraph to the desired width
+def fold_paragraph(paragraph, max_width)
+ # Gradually build our output
+ str, *rest = paragraph
+ width = str.length
+
+ rest.each do |word|
+ if width + word.length + 1 <= max_width
+ str << ' ' << word
+ width += word.length + 1
+ else
+ str << "\n" << word
+ width = word.length
+ end
+ end
+
+ str
+end
+
+ARGV.each do |f|
+ t = benchmark { fold(f, 80) }
+ STDERR.puts "#{f}: #{t}"
+end
diff --git a/benchmarks/ruby/sort.rb b/benchmarks/ruby/sort.rb
new file mode 100644
index 0000000..45099f0
--- /dev/null
+++ b/benchmarks/ruby/sort.rb
@@ -0,0 +1,15 @@
+#!/usr/bin/env ruby
+
+require './utils.rb'
+
+def sort(filename)
+ File.open(filename, 'r:utf-8') do |file|
+ content = file.read
+ puts content.lines.sort.join
+ end
+end
+
+ARGV.each do |f|
+ t = benchmark { sort(f) }
+ STDERR.puts "#{f}: #{t}"
+end
diff --git a/benchmarks/ruby/strip_tags.rb b/benchmarks/ruby/strip_tags.rb
new file mode 100644
index 0000000..44f6914
--- /dev/null
+++ b/benchmarks/ruby/strip_tags.rb
@@ -0,0 +1,22 @@
+#!/usr/bin/env ruby
+
+require './utils.rb'
+
+def strip_tags(filename)
+ File.open(filename, 'r:utf-8') do |file|
+ str = file.read
+
+ d = 0
+
+ str.each_char do |c|
+ d += 1 if c == '<'
+ putc(if d > 0 then ' ' else c end)
+ d -= 1 if c == '>'
+ end
+ end
+end
+
+ARGV.each do |f|
+ t = benchmark { strip_tags(f) }
+ STDERR.puts "#{f}: #{t}"
+end
diff --git a/benchmarks/ruby/utils.rb b/benchmarks/ruby/utils.rb
new file mode 100644
index 0000000..bf7254c
--- /dev/null
+++ b/benchmarks/ruby/utils.rb
@@ -0,0 +1,14 @@
+require 'benchmark'
+
+def benchmark(&block)
+ runs = 100
+ total = 0
+
+ runs.times do |i|
+ result = Benchmark.measure(&block).total
+ $stderr.puts "Run #{i}: #{result}"
+ total += result
+ end
+
+ total / runs
+end
diff --git a/benchmarks/text-benchmarks.cabal b/benchmarks/text-benchmarks.cabal
new file mode 100644
index 0000000..e5542f2
--- /dev/null
+++ b/benchmarks/text-benchmarks.cabal
@@ -0,0 +1,139 @@
+name: text-benchmarks
+version: 0.0.0.0
+synopsis: Benchmarks for the text package
+description: Benchmarks for the text package
+homepage: https://bitbucket.org/bos/text
+license: BSD2
+license-file: ../LICENSE
+author: Jasper Van der Jeugt <jaspervdj@gmail.com>,
+ Bryan O'Sullivan <bos@serpentine.com>,
+ Tom Harper <rtomharper@googlemail.com>,
+ Duncan Coutts <duncan@haskell.org>
+maintainer: jaspervdj@gmail.com
+category: Text
+build-type: Simple
+cabal-version: >=1.8
+
+flag bytestring-builder
+ description: Depend on the bytestring-builder package for backwards compatibility.
+ default: False
+ manual: False
+
+flag llvm
+ description: use LLVM
+ default: False
+ manual: True
+
+executable text-benchmarks
+ ghc-options: -Wall -O2 -rtsopts
+ if flag(llvm)
+ ghc-options: -fllvm
+ cpp-options: -DHAVE_DEEPSEQ -DINTEGER_GMP
+ build-depends: array,
+ base == 4.*,
+ binary,
+ blaze-builder,
+ bytestring-lexing >= 0.5.0,
+ containers,
+ criterion >= 0.10.0.0,
+ deepseq,
+ directory,
+ filepath,
+ ghc-prim,
+ integer-gmp,
+ stringsearch,
+ utf8-string,
+ vector
+
+ if flag(bytestring-builder)
+ build-depends: bytestring >= 0.9 && < 0.10.4,
+ bytestring-builder >= 0.10.4
+ else
+ build-depends: bytestring >= 0.10.4
+
+ -- modules for benchmark proper
+ c-sources: cbits/time_iconv.c
+ hs-source-dirs: haskell
+ main-is: Benchmarks.hs
+ other-modules:
+ Benchmarks.Builder
+ Benchmarks.Concat
+ Benchmarks.DecodeUtf8
+ Benchmarks.EncodeUtf8
+ Benchmarks.Equality
+ Benchmarks.FileRead
+ Benchmarks.FoldLines
+ Benchmarks.Mul
+ Benchmarks.Programs.BigTable
+ Benchmarks.Programs.Cut
+ Benchmarks.Programs.Fold
+ Benchmarks.Programs.Sort
+ Benchmarks.Programs.StripTags
+ Benchmarks.Programs.Throughput
+ Benchmarks.Pure
+ Benchmarks.ReadNumbers
+ Benchmarks.Replace
+ Benchmarks.Search
+ Benchmarks.Stream
+ Benchmarks.WordFrequencies
+
+ -- Source code for IUT (implementation under test)
+ -- "borrowed" from parent folder
+ include-dirs: ../include
+ c-sources: ../cbits/cbits.c
+ hs-source-dirs: ..
+ other-modules:
+ Data.Text
+ Data.Text.Array
+ Data.Text.Encoding
+ Data.Text.Encoding.Error
+ Data.Text.Foreign
+ Data.Text.IO
+ Data.Text.Internal
+ Data.Text.Internal.Builder
+ Data.Text.Internal.Builder.Functions
+ Data.Text.Internal.Builder.Int.Digits
+ Data.Text.Internal.Builder.RealFloat.Functions
+ Data.Text.Internal.Encoding.Fusion
+ Data.Text.Internal.Encoding.Fusion.Common
+ Data.Text.Internal.Encoding.Utf16
+ Data.Text.Internal.Encoding.Utf32
+ Data.Text.Internal.Encoding.Utf8
+ Data.Text.Internal.Functions
+ Data.Text.Internal.Fusion
+ Data.Text.Internal.Fusion.CaseMapping
+ Data.Text.Internal.Fusion.Common
+ Data.Text.Internal.Fusion.Size
+ Data.Text.Internal.Fusion.Types
+ Data.Text.Internal.IO
+ Data.Text.Internal.Lazy
+ Data.Text.Internal.Lazy.Encoding.Fusion
+ Data.Text.Internal.Lazy.Fusion
+ Data.Text.Internal.Lazy.Search
+ Data.Text.Internal.Private
+ Data.Text.Internal.Read
+ Data.Text.Internal.Search
+ Data.Text.Internal.Unsafe
+ Data.Text.Internal.Unsafe.Char
+ Data.Text.Internal.Unsafe.Shift
+ Data.Text.Lazy
+ Data.Text.Lazy.Builder
+ Data.Text.Lazy.Builder.Int
+ Data.Text.Lazy.Builder.RealFloat
+ Data.Text.Lazy.Encoding
+ Data.Text.Lazy.IO
+ Data.Text.Lazy.Internal
+ Data.Text.Lazy.Read
+ Data.Text.Read
+ Data.Text.Unsafe
+ Data.Text.Show
+
+executable text-multilang
+ hs-source-dirs: haskell
+ main-is: Multilang.hs
+ other-modules: Timer
+ ghc-options: -Wall -O2
+ build-depends: base == 4.*,
+ bytestring,
+ text,
+ time
diff --git a/cbits/cbits.c b/cbits/cbits.c
new file mode 100644
index 0000000..717b766
--- /dev/null
+++ b/cbits/cbits.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
+ *
+ * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
+ *
+ * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "text_cbits.h"
+
+int _hs_text_utf_8_memcmp(const void *a, size_t aoff, const void *b, size_t boff,
+ size_t n)
+{
+ return memcmp(a + aoff, b + boff, n);
+}
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+static const uint8_t utf8d[] = {
+ /*
+ * The first part of the table maps bytes to character classes that
+ * to reduce the size of the transition table and create bitmasks.
+ */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+ /*
+ * The second part is a transition table that maps a combination of
+ * a state of the automaton and a character class to a state.
+ */
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+ 12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+static inline uint32_t
+decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
+ uint32_t type = utf8d[byte];
+
+ *codep = (*state != UTF8_ACCEPT) ?
+ (byte & 0x3fu) | (*codep << 6) :
+ (0xff >> type) & (byte);
+
+ return *state = utf8d[256 + *state + type];
+}
+
+/*
+ * A best-effort decoder. Runs until it hits either end of input or
+ * the start of an invalid byte sequence.
+ *
+ * At exit, we update *destoff with the next offset to write to, *src
+ * with the next source location past the last one successfully
+ * decoded, and return the next source location to read from.
+ *
+ * Moreover, we expose the internal decoder state (state0 and
+ * codepoint0), allowing one to restart the decoder after it
+ * terminates (say, due to a partial codepoint).
+ *
+ * In particular, there are a few possible outcomes,
+ *
+ * 1) We decoded the buffer entirely:
+ * In this case we return srcend
+ * state0 == UTF8_ACCEPT
+ *
+ * 2) We met an invalid encoding
+ * In this case we return the address of the first invalid byte
+ * state0 == UTF8_REJECT
+ *
+ * 3) We reached the end of the buffer while decoding a codepoint
+ * In this case we return a pointer to the first byte of the partial codepoint
+ * state0 != UTF8_ACCEPT, UTF8_REJECT
+ *
+ */
+#if defined(__GNUC__) || defined(__clang__)
+static inline uint8_t const *
+_hs_text_utf_8_decode_utf8_int(uint8_t *const dest, size_t *destoff,
+ const uint8_t **src, const uint8_t *srcend,
+ uint32_t *codepoint0, uint32_t *state0)
+ __attribute((always_inline));
+#endif
+
+static inline uint8_t const *
+_hs_text_utf_8_decode_utf8_int(uint8_t *const dest, size_t *destoff,
+ const uint8_t **src, const uint8_t *srcend,
+ uint32_t *codepoint0, uint32_t *state0)
+{
+ uint8_t *d = dest + *destoff;
+ const uint8_t *s = *src, *last = *src;
+ uint32_t state = *state0;
+ uint32_t codepoint = *codepoint0;
+ uint8_t c;
+
+ while (s < srcend) {
+#if defined(__i386__) || defined(__x86_64__)
+ /*
+ * This code will only work on a little-endian system that
+ * supports unaligned loads.
+ *
+ * It gives a substantial speed win on data that is purely or
+ * partly ASCII (e.g. HTML), at only a slight cost on purely
+ * non-ASCII text.
+ */
+
+ if (state == UTF8_ACCEPT) {
+ while (s < srcend - 4) {
+ codepoint = *((uint32_t *) s);
+ if ((codepoint & 0x80808080) != 0) {
+ break;
+ }
+ *((uint32_t *)d) = codepoint;
+ s += 4;
+ d += 4;
+ }
+ last = s;
+ }
+#endif
+ c = *s++;
+ switch (decode(&state, &codepoint, c)) {
+ case UTF8_ACCEPT:
+ last = s;
+ /* fallthrough */
+ default:
+ *d++ = c;
+ break;
+ case UTF8_REJECT:
+ goto done;
+ }
+ }
+done:
+
+ *destoff = d - dest;
+ *codepoint0 = codepoint;
+ *state0 = state;
+ *src = last;
+
+ return s;
+}
+
+uint8_t const *
+_hs_text_utf_8_decode_utf8_state(uint8_t *const dest, size_t *destoff,
+ const uint8_t **src,
+ const uint8_t *srcend,
+ uint32_t *codepoint0, uint32_t *state0)
+{
+ uint8_t const *ret = _hs_text_utf_8_decode_utf8_int(dest, destoff, src, srcend,
+ codepoint0, state0);
+ if (*state0 == UTF8_REJECT)
+ ret -=1;
+ return ret;
+}
+
+/*
+ * Helper to decode buffer and discard final decoder state
+ */
+const uint8_t *
+_hs_text_utf_8_decode_utf8(uint8_t *const dest, size_t *destoff,
+ const uint8_t *src, const uint8_t *const srcend)
+{
+ uint32_t codepoint;
+ uint32_t state = UTF8_ACCEPT;
+ uint8_t const *ret = _hs_text_utf_8_decode_utf8_int(dest, destoff, &src, srcend,
+ &codepoint, &state);
+ /* Back up if we have an incomplete or invalid encoding */
+ if (state != UTF8_ACCEPT)
+ ret -= 1;
+ return ret;
+}
diff --git a/changelog.md b/changelog.md
new file mode 100644
index 0000000..5795fc0
--- /dev/null
+++ b/changelog.md
@@ -0,0 +1,3 @@
+### 1.2.3.0
+
+First released version of `text-utf8` package matching the API of `text-1.2.3.0`.
diff --git a/include/text_cbits.h b/include/text_cbits.h
new file mode 100644
index 0000000..3523efe
--- /dev/null
+++ b/include/text_cbits.h
@@ -0,0 +1,11 @@
+/*
+ * Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>.
+ */
+
+#ifndef _text_cbits_h
+#define _text_cbits_h
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+#endif
diff --git a/scripts/ApiCompare.hs b/scripts/ApiCompare.hs
new file mode 100644
index 0000000..516487b
--- /dev/null
+++ b/scripts/ApiCompare.hs
@@ -0,0 +1,28 @@
+-- This script compares the strict and lazy Text APIs to ensure that
+-- they're reasonably in sync.
+
+{-# LANGUAGE OverloadedStrings #-}
+
+import qualified Data.Set as S
+import qualified Data.Text as T
+import System.Process
+
+main = do
+ let tidy pkg = (S.fromList . filter (T.isInfixOf "::") . T.lines .
+ T.replace "GHC.Int.Int64" "Int" .
+ T.replace "\n " "" .
+ T.replace (T.append (T.pack pkg) ".") "" . T.pack) `fmap`
+ readProcess "ghci" [] (":browse " ++ pkg)
+ let diff a b = mapM_ (putStrLn . (" "++) . T.unpack) . S.toList $
+ S.difference a b
+ text <- tidy "Data.Text"
+ lazy <- tidy "Data.Text.Lazy"
+ list <- tidy "Data.List"
+ putStrLn "Text \\ List:"
+ diff text list
+ putStrLn ""
+ putStrLn "Text \\ Lazy:"
+ diff text lazy
+ putStrLn ""
+ putStrLn "Lazy \\ Text:"
+ diff lazy text
diff --git a/scripts/Arsec.hs b/scripts/Arsec.hs
new file mode 100644
index 0000000..93416e4
--- /dev/null
+++ b/scripts/Arsec.hs
@@ -0,0 +1,44 @@
+module Arsec
+ (
+ Comment
+ , comment
+ , semi
+ , showC
+ , unichar
+ , unichars
+ , module Control.Applicative
+ , module Control.Monad
+ , module Data.Char
+ , module Text.ParserCombinators.Parsec.Char
+ , module Text.ParserCombinators.Parsec.Combinator
+ , module Text.ParserCombinators.Parsec.Error
+ , module Text.ParserCombinators.Parsec.Prim
+ ) where
+
+import Control.Monad
+import Control.Applicative
+import Data.Char
+import Numeric
+import Text.ParserCombinators.Parsec.Char hiding (lower, upper)
+import Text.ParserCombinators.Parsec.Combinator hiding (optional)
+import Text.ParserCombinators.Parsec.Error
+import Text.ParserCombinators.Parsec.Prim hiding ((<|>), many)
+
+type Comment = String
+
+unichar :: Parser Char
+unichar = chr . fst . head . readHex <$> many1 hexDigit
+
+unichars :: Parser [Char]
+unichars = manyTill (unichar <* spaces) semi
+
+semi :: Parser ()
+semi = char ';' *> spaces *> pure ()
+
+comment :: Parser Comment
+comment = (char '#' *> manyTill anyToken (char '\n')) <|> string "\n"
+
+showC :: Char -> String
+showC c = "'\\x" ++ d ++ "'"
+ where h = showHex (ord c) ""
+ d = replicate (4 - length h) '0' ++ h
diff --git a/scripts/CaseFolding.hs b/scripts/CaseFolding.hs
new file mode 100644
index 0000000..11d180c
--- /dev/null
+++ b/scripts/CaseFolding.hs
@@ -0,0 +1,46 @@
+-- This script processes the following source file:
+--
+-- http://unicode.org/Public/UNIDATA/CaseFolding.txt
+
+module CaseFolding
+ (
+ CaseFolding(..)
+ , Fold(..)
+ , parseCF
+ , mapCF
+ ) where
+
+import Arsec
+
+data Fold = Fold {
+ code :: Char
+ , status :: Char
+ , mapping :: [Char]
+ , name :: String
+ } deriving (Eq, Ord, Show)
+
+data CaseFolding = CF { cfComments :: [Comment], cfFolding :: [Fold] }
+ deriving (Show)
+
+entries :: Parser CaseFolding
+entries = CF <$> many comment <*> many (entry <* many comment)
+ where
+ entry = Fold <$> unichar <* semi
+ <*> oneOf "CFST" <* semi
+ <*> unichars
+ <*> (string "# " *> manyTill anyToken (char '\n'))
+
+parseCF :: FilePath -> IO (Either ParseError CaseFolding)
+parseCF name = parse entries name <$> readFile name
+
+mapCF :: CaseFolding -> [String]
+mapCF (CF _ ms) = typ ++ (map nice . filter p $ ms) ++ [last]
+ where
+ typ = ["foldMapping :: forall s. Char -> s -> Step (CC s) Char"
+ ,"{-# NOINLINE foldMapping #-}"]
+ last = "foldMapping c s = Yield (toLower c) (CC s '\\0' '\\0')"
+ nice c = "-- " ++ name c ++ "\n" ++
+ "foldMapping " ++ showC (code c) ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")"
+ where [x,y,z] = (map showC . take 3) (mapping c ++ repeat '\0')
+ p f = status f `elem` "CF" &&
+ mapping f /= [toLower (code f)]
diff --git a/scripts/CaseMapping.hs b/scripts/CaseMapping.hs
new file mode 100644
index 0000000..e6350ab
--- /dev/null
+++ b/scripts/CaseMapping.hs
@@ -0,0 +1,38 @@
+import System.Environment
+import System.IO
+
+import Arsec
+import CaseFolding
+import SpecialCasing
+
+main = do
+ args <- getArgs
+ let oname = case args of
+ [] -> "../Data/Text/Internal/Fusion/CaseMapping.hs"
+ [o] -> o
+ psc <- parseSC "SpecialCasing.txt"
+ pcf <- parseCF "CaseFolding.txt"
+ scs <- case psc of
+ Left err -> print err >> return undefined
+ Right ms -> return ms
+ cfs <- case pcf of
+ Left err -> print err >> return undefined
+ Right ms -> return ms
+ h <- openFile oname WriteMode
+ let comments = map ("--" ++) $
+ take 2 (cfComments cfs) ++ take 2 (scComments scs)
+ mapM_ (hPutStrLn h) $
+ ["{-# LANGUAGE Rank2Types #-}"
+ ,"-- AUTOMATICALLY GENERATED - DO NOT EDIT"
+ ,"-- Generated by scripts/CaseMapping.hs"] ++
+ comments ++
+ [""
+ ,"module Data.Text.Internal.Fusion.CaseMapping where"
+ ,"import Data.Char"
+ ,"import Data.Text.Internal.Fusion.Types"
+ ,""]
+ mapM_ (hPutStrLn h) (mapSC "upper" upper toUpper scs)
+ mapM_ (hPutStrLn h) (mapSC "lower" lower toLower scs)
+ mapM_ (hPutStrLn h) (mapSC "title" title toTitle scs)
+ mapM_ (hPutStrLn h) (mapCF cfs)
+ hClose h
diff --git a/scripts/SpecialCasing.hs b/scripts/SpecialCasing.hs
new file mode 100644
index 0000000..03be3d3
--- /dev/null
+++ b/scripts/SpecialCasing.hs
@@ -0,0 +1,56 @@
+-- This script processes the following source file:
+--
+-- http://unicode.org/Public/UNIDATA/SpecialCasing.txt
+
+module SpecialCasing
+ (
+ SpecialCasing(..)
+ , Case(..)
+ , parseSC
+ , mapSC
+ ) where
+
+import Arsec
+
+data SpecialCasing = SC { scComments :: [Comment], scCasing :: [Case] }
+ deriving (Show)
+
+data Case = Case {
+ code :: Char
+ , lower :: [Char]
+ , title :: [Char]
+ , upper :: [Char]
+ , conditions :: String
+ , name :: String
+ } deriving (Eq, Ord, Show)
+
+entries :: Parser SpecialCasing
+entries = SC <$> many comment <*> many (entry <* many comment)
+ where
+ entry = Case <$> unichar <* semi
+ <*> unichars
+ <*> unichars
+ <*> unichars
+ <*> manyTill anyToken (string "# ")
+ <*> manyTill anyToken (char '\n')
+
+parseSC :: FilePath -> IO (Either ParseError SpecialCasing)
+parseSC name = parse entries name <$> readFile name
+
+mapSC :: String -> (Case -> String) -> (Char -> Char) -> SpecialCasing
+ -> [String]
+mapSC which access twiddle (SC _ ms) =
+ typ ++ (map nice . filter p $ ms) ++ [last]
+ where
+ typ = [which ++ "Mapping :: forall s. Char -> s -> Step (CC s) Char"
+ ,"{-# NOINLINE " ++ which ++ "Mapping #-}"]
+ last = which ++ "Mapping c s = Yield (to" ++ ucFirst which ++ " c) (CC s '\\0' '\\0')"
+ nice c = "-- " ++ name c ++ "\n" ++
+ which ++ "Mapping " ++ showC (code c) ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")"
+ where [x,y,z] = (map showC . take 3) (access c ++ repeat '\0')
+ p c = [k] /= a && a /= [twiddle k] && null (conditions c)
+ where a = access c
+ k = code c
+
+ucFirst (c:cs) = toUpper c : cs
+ucFirst [] = []
diff --git a/tests-and-benchmarks.markdown b/tests-and-benchmarks.markdown
new file mode 100644
index 0000000..3e1e664
--- /dev/null
+++ b/tests-and-benchmarks.markdown
@@ -0,0 +1,68 @@
+Tests and benchmarks
+====================
+
+Prerequisites
+-------------
+
+To run the tests and benchmarks, you will need the test data, which
+you can clone from one of the following locations:
+
+* Mercurial master repository:
+ [bitbucket.org/bos/text-test-data](https://bitbucket.org/bos/text-test-data)
+
+* Git mirror repository:
+ [github.com/bos/text-test-data](https://github.com/bos/text-test-data)
+
+You can clone either repository into the `tests` subdirectory using
+
+ cd tests/
+ make text-test-data # to clone from mercurial, OR
+ make VCS=git text-test-data # to clone from git
+
+Many tests and benchmarks will fail if the test files are missing.
+
+Functional tests
+----------------
+
+The functional tests are located in the `tests` subdirectory. An overview of
+what's in that directory:
+
+ Makefile Has targets for common tasks
+ Tests Source files of the testing code
+ scripts Various utility scripts
+ text-tests.cabal Cabal file that compiles all benchmarks
+
+The `text-tests.cabal` builds:
+
+- A copy of the text library, sharing the source code, but exposing all internal
+ modules, for testing purposes
+- The different test suites
+
+To compile, run all tests, and generate a coverage report, simply use `make`.
+
+Benchmarks
+----------
+
+The benchmarks are located in the `benchmarks` subdirectory. An overview of
+what's in that directory:
+
+ Makefile Has targets for common tasks
+ haskell Source files of the haskell benchmarks
+ python Python implementations of some benchmarks
+ ruby Ruby implementations of some benchmarks
+ text-benchmarks.cabal Cabal file which compiles all benchmarks
+
+To compile the benchmarks, navigate to the `benchmarks` subdirectory and run
+`cabal configure && cabal build`. Then, you can run the benchmarks using:
+
+ ./dist/build/text-benchmarks/text-benchmarks
+
+Or if you have a recent enough `cabal`, you can build and run the
+benchmarks via
+
+ cabal new-run exe:text-benchmarks -- --help
+
+However, since there are quite a lot of benchmarks, you usually don't want to
+run them all. Instead, use the `-l` flag to get a list of benchmarks
+and run the ones you want to inspect. If you want to configure the benchmarks
+further, the exact parameters can be changed in `Benchmarks.hs`.
diff --git a/tests/.ghci b/tests/.ghci
new file mode 100644
index 0000000..251552a
--- /dev/null
+++ b/tests/.ghci
@@ -0,0 +1 @@
+:set -DHAVE_DEEPSEQ -isrc -i../..
diff --git a/tests/LiteralRuleTest.hs b/tests/LiteralRuleTest.hs
new file mode 100644
index 0000000..bcc4a87
--- /dev/null
+++ b/tests/LiteralRuleTest.hs
@@ -0,0 +1,21 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module LiteralRuleTest where
+
+import Data.Text (Text)
+
+-- This should produce 8 firings of the "TEXT literal" rule
+strings :: [Text]
+strings = [ "abstime", "aclitem", "bit", "bool", "box", "bpchar", "bytea", "char" ]
+
+-- This should produce 7 firings of the "TEXT literal UTF8" rule
+utf8Strings :: [Text]
+utf8Strings = [ "\0abstime", "\0aclitem", "\xfefe bit", "\0bool", "\0box", "\0bpchar", "\0bytea" ]
+
+-- This should produce 4 firings of the "TEXT empty literal" rule
+empties :: [Text]
+empties = [ "", "", "", "" ]
+
+-- This should produce 5 firings of the "TEXT singleton literal" rule
+--singletons :: [Text]
+--singletons = [ "a", "b", "c", "d", "e" ]
diff --git a/tests/Makefile b/tests/Makefile
new file mode 100644
index 0000000..4317a8a
--- /dev/null
+++ b/tests/Makefile
@@ -0,0 +1,45 @@
+VCS = hg
+count = 1000
+
+all: coverage literal-rule-test
+
+literal-rule-test:
+ ./literal-rule-test.sh
+
+coverage: build coverage/hpc_index.html
+
+build: text-test-data
+ cabal configure -fhpc
+ cabal build
+
+text-test-data:
+ifeq ($(VCS),git)
+ git clone https://github.com/bos/text-test-data.git
+else
+ hg clone https://bitbucket.org/bos/text-test-data
+endif
+ $(MAKE) -C text-test-data
+
+coverage/text-tests.tix:
+ -mkdir -p coverage
+ ./dist/build/text-tests/text-tests -a $(count)
+ mv text-tests.tix $@
+
+coverage/text-tests-stdio.tix:
+ -mkdir -p coverage
+ ./scripts/cover-stdio.sh ./dist/build/text-tests-stdio/text-tests-stdio
+ mv text-tests-stdio.tix $@
+
+coverage/coverage.tix: coverage/text-tests.tix coverage/text-tests-stdio.tix
+ hpc combine --output=$@ \
+ --exclude=Main \
+ coverage/text-tests.tix \
+ coverage/text-tests-stdio.tix
+
+coverage/hpc_index.html: coverage/coverage.tix
+ hpc markup --destdir=coverage coverage/coverage.tix
+
+clean:
+ rm -rf dist coverage .hpc
+
+.PHONY: all build clean coverage literal-rule-test
diff --git a/tests/Tests.hs b/tests/Tests.hs
new file mode 100644
index 0000000..fb97ff4
--- /dev/null
+++ b/tests/Tests.hs
@@ -0,0 +1,13 @@
+-- | Provides a simple main function which runs all the tests
+--
+module Main
+ ( main
+ ) where
+
+import Test.Framework (defaultMain)
+
+import qualified Tests.Properties as Properties
+import qualified Tests.Regressions as Regressions
+
+main :: IO ()
+main = defaultMain [Properties.tests, Regressions.tests]
diff --git a/tests/Tests/IO.hs b/tests/Tests/IO.hs
new file mode 100644
index 0000000..e67d82e
--- /dev/null
+++ b/tests/Tests/IO.hs
@@ -0,0 +1,34 @@
+-- | Program which exposes some haskell functions as an executable. The results
+-- and coverage of this module are meant to be checked using a shell script.
+--
+module Main
+ (
+ main
+ ) where
+
+import System.Environment (getArgs)
+import System.Exit (exitFailure)
+import System.IO (hPutStrLn, stderr)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.IO as TL
+
+main :: IO ()
+main = do
+ args <- getArgs
+ case args of
+ ["T.readFile", name] -> T.putStr =<< T.readFile name
+ ["T.writeFile", name, t] -> T.writeFile name (T.pack t)
+ ["T.appendFile", name, t] -> T.appendFile name (T.pack t)
+ ["T.interact"] -> T.interact id
+ ["T.getContents"] -> T.putStr =<< T.getContents
+ ["T.getLine"] -> T.putStrLn =<< T.getLine
+
+ ["TL.readFile", name] -> TL.putStr =<< TL.readFile name
+ ["TL.writeFile", name, t] -> TL.writeFile name (TL.pack t)
+ ["TL.appendFile", name, t] -> TL.appendFile name (TL.pack t)
+ ["TL.interact"] -> TL.interact id
+ ["TL.getContents"] -> TL.putStr =<< TL.getContents
+ ["TL.getLine"] -> TL.putStrLn =<< TL.getLine
+ _ -> hPutStrLn stderr "invalid directive!" >> exitFailure
diff --git a/tests/Tests/Properties.hs b/tests/Tests/Properties.hs
new file mode 100644
index 0000000..984ce6a
--- /dev/null
+++ b/tests/Tests/Properties.hs
@@ -0,0 +1,1400 @@
+-- | QuickCheck properties for the text library.
+
+{-# LANGUAGE BangPatterns, FlexibleInstances, OverloadedStrings,
+ ScopedTypeVariables, TypeSynonymInstances #-}
+{-# OPTIONS_GHC -fno-enable-rewrite-rules -fno-warn-missing-signatures #-}
+module Tests.Properties
+ (
+ tests
+ ) where
+
+import Control.Applicative ((<$>), (<*>))
+import Control.Arrow ((***), first, second)
+import Data.Bits ((.&.))
+import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isLetter, isUpper, ord)
+import Data.Int (Int8, Int16, Int32, Int64)
+import Data.Monoid (Monoid(..))
+import Data.String (IsString(fromString))
+import Data.Text.Encoding.Error
+import Data.Text.Foreign
+import Data.Text.Internal.Encoding.Utf8
+import Data.Text.Internal.Fusion.Size
+import Data.Text.Internal.Search (indices)
+import Data.Text.Lazy.Read as TL
+import Data.Text.Read as T
+import Data.Word (Word, Word8, Word16, Word32, Word64)
+import Data.Maybe (mapMaybe)
+import Numeric (showEFloat, showFFloat, showGFloat, showHex)
+import Prelude hiding (replicate)
+import Test.Framework (Test, testGroup)
+import Test.Framework.Providers.QuickCheck2 (testProperty)
+import Test.QuickCheck hiding ((.&.))
+import Test.QuickCheck.Monadic
+import Test.QuickCheck.Property (Property(..))
+import Test.QuickCheck.Unicode (char)
+import Tests.QuickCheckUtils
+import Tests.Utils
+import Text.Show.Functions ()
+import qualified Control.Exception as Exception
+import qualified Data.Bits as Bits (shiftL, shiftR)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Char as C
+import qualified Data.List as L
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as E
+import qualified Data.Text.IO as T
+import qualified Data.Text.Internal.Fusion as S
+import qualified Data.Text.Internal.Fusion.Common as S
+import qualified Data.Text.Internal.Lazy.Fusion as SL
+import qualified Data.Text.Internal.Lazy.Search as S (indices)
+import qualified Data.Text.Internal.Unsafe.Shift as U
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Builder as TB
+import qualified Data.Text.Lazy.Builder.Int as TB
+import qualified Data.Text.Lazy.Builder.RealFloat as TB
+import qualified Data.Text.Lazy.Encoding as EL
+import qualified Data.Text.Lazy.IO as TL
+import qualified System.IO as IO
+import qualified Tests.Properties.Mul as Mul
+import qualified Tests.SlowFunctions as Slow
+
+t_pack_unpack = (T.unpack . T.pack) `eq` id
+tl_pack_unpack = (TL.unpack . TL.pack) `eq` id
+t_stream_unstream = (S.unstream . S.stream) `eq` id
+tl_stream_unstream = (SL.unstream . SL.stream) `eq` id
+t_reverse_stream t = (S.reverse . S.reverseStream) t === t
+t_singleton c = [c] === (T.unpack . T.singleton) c
+tl_singleton c = [c] === (TL.unpack . TL.singleton) c
+tl_unstreamChunks x = f 11 x === f 1000 x
+ where f n = SL.unstreamChunks n . S.streamList
+tl_chunk_unchunk = (TL.fromChunks . TL.toChunks) `eq` id
+tl_from_to_strict = (TL.fromStrict . TL.toStrict) `eq` id
+
+-- Note: this silently truncates code-points > 255 to 8-bit due to 'B.pack'
+encodeL1 :: T.Text -> B.ByteString
+encodeL1 = B.pack . map (fromIntegral . fromEnum) . T.unpack
+encodeLazyL1 :: TL.Text -> BL.ByteString
+encodeLazyL1 = BL.fromChunks . map encodeL1 . TL.toChunks
+
+t_ascii t = E.decodeASCII (E.encodeUtf8 a) === a
+ where a = T.map (\c -> chr (ord c `mod` 128)) t
+tl_ascii t = EL.decodeASCII (EL.encodeUtf8 a) === a
+ where a = TL.map (\c -> chr (ord c `mod` 128)) t
+t_latin1 t = E.decodeLatin1 (encodeL1 a) === a
+ where a = T.map (\c -> chr (ord c `mod` 256)) t
+tl_latin1 t = EL.decodeLatin1 (encodeLazyL1 a) === a
+ where a = TL.map (\c -> chr (ord c `mod` 256)) t
+t_utf8 = forAll genUnicode $ (E.decodeUtf8 . E.encodeUtf8) `eq` id
+t_utf8' = forAll genUnicode $ (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
+tl_utf8 = forAll genUnicode $ (EL.decodeUtf8 . EL.encodeUtf8) `eq` id
+tl_utf8' = forAll genUnicode $ (EL.decodeUtf8' . EL.encodeUtf8) `eq` (id . Right)
+t_utf16LE = forAll genUnicode $ (E.decodeUtf16LE . E.encodeUtf16LE) `eq` id
+tl_utf16LE = forAll genUnicode $ (EL.decodeUtf16LE . EL.encodeUtf16LE) `eq` id
+t_utf16BE = forAll genUnicode $ (E.decodeUtf16BE . E.encodeUtf16BE) `eq` id
+tl_utf16BE = forAll genUnicode $ (EL.decodeUtf16BE . EL.encodeUtf16BE) `eq` id
+t_utf32LE = forAll genUnicode $ (E.decodeUtf32LE . E.encodeUtf32LE) `eq` id
+tl_utf32LE = forAll genUnicode $ (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id
+t_utf32BE = forAll genUnicode $ (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id
+tl_utf32BE = forAll genUnicode $ (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id
+
+t_utf8_incr = forAll genUnicode $ \s (Positive n) -> (recode n `eq` id) s
+ where recode n = T.concat . map fst . feedChunksOf n E.streamDecodeUtf8 .
+ E.encodeUtf8
+
+feedChunksOf :: Int -> (B.ByteString -> E.Decoding) -> B.ByteString
+ -> [(T.Text, B.ByteString)]
+feedChunksOf n f bs
+ | B.null bs = []
+ | otherwise = let (x,y) = B.splitAt n bs
+ E.Some t b f' = f x
+ in (t,b) : feedChunksOf n f' y
+
+t_utf8_undecoded = forAll genUnicode $ \t ->
+ let b = E.encodeUtf8 t
+ ls = concatMap (leftover . E.encodeUtf8 . T.singleton) . T.unpack $ t
+ leftover = (++ [B.empty]) . init . tail . B.inits
+ in (map snd . feedChunksOf 1 E.streamDecodeUtf8) b === ls
+
+data Badness = Solo | Leading | Trailing
+ deriving (Eq, Show)
+
+instance Arbitrary Badness where
+ arbitrary = elements [Solo, Leading, Trailing]
+
+t_utf8_err :: Badness -> DecodeErr -> Property
+t_utf8_err bad de = do
+ let gen = case bad of
+ Solo -> genInvalidUTF8
+ Leading -> B.append <$> genInvalidUTF8 <*> genUTF8
+ Trailing -> B.append <$> genUTF8 <*> genInvalidUTF8
+ genUTF8 = E.encodeUtf8 <$> genUnicode
+ forAll gen $ \bs -> MkProperty $ do
+ onErr <- genDecodeErr de
+ unProperty . monadicIO $ do
+ l <- run $ let len = T.length (E.decodeUtf8With onErr bs)
+ in (len `seq` return (Right len)) `Exception.catch`
+ (\(e::UnicodeException) -> return (Left e))
+ assert $ case l of
+ Left err -> length (show err) >= 0
+ Right _ -> de /= Strict
+
+t_utf8_err' :: B.ByteString -> Property
+t_utf8_err' bs = monadicIO . assert $ case E.decodeUtf8' bs of
+ Left err -> length (show err) >= 0
+ Right t -> T.length t >= 0
+
+genInvalidUTF8 :: Gen B.ByteString
+genInvalidUTF8 = B.pack <$> oneof [
+ -- invalid leading byte of a 2-byte sequence
+ (:) <$> choose (0xC0, 0xC1) <*> upTo 1 contByte
+ -- invalid leading byte of a 4-byte sequence
+ , (:) <$> choose (0xF5, 0xFF) <*> upTo 3 contByte
+ -- 4-byte sequence greater than U+10FFFF
+ , do k <- choose (0x11, 0x13)
+ let w0 = 0xF0 + (k `Bits.shiftR` 2)
+ w1 = 0x80 + ((k .&. 3) `Bits.shiftL` 4)
+ ([w0,w1]++) <$> vectorOf 2 contByte
+ -- continuation bytes without a start byte
+ , listOf1 contByte
+ -- short 2-byte sequence
+ , (:[]) <$> choose (0xC2, 0xDF)
+ -- short 3-byte sequence
+ , (:) <$> choose (0xE0, 0xEF) <*> upTo 1 contByte
+ -- short 4-byte sequence
+ , (:) <$> choose (0xF0, 0xF4) <*> upTo 2 contByte
+ -- overlong encoding
+ , do k <- choose (0,0xFFFF)
+ let c = chr k
+ case k of
+ _ | k < 0x80 -> oneof [ let (w,x) = ord2 c in return [w,x]
+ , let (w,x,y) = ord3 c in return [w,x,y]
+ , let (w,x,y,z) = ord4 c in return [w,x,y,z] ]
+ | k < 0x7FF -> oneof [ let (w,x,y) = ord3 c in return [w,x,y]
+ , let (w,x,y,z) = ord4 c in return [w,x,y,z] ]
+ | otherwise -> let (w,x,y,z) = ord4 c in return [w,x,y,z]
+ ]
+ where
+ contByte = (0x80 +) <$> choose (0, 0x3f)
+ upTo n gen = do
+ k <- choose (0,n)
+ vectorOf k gen
+
+s_Eq s = (s==) `eq` ((S.streamList s==) . S.streamList)
+ where _types = s :: String
+sf_Eq p s =
+ ((L.filter p s==) . L.filter p) `eq`
+ (((S.filter p $ S.streamList s)==) . S.filter p . S.streamList)
+t_Eq s = (s==) `eq` ((T.pack s==) . T.pack)
+tl_Eq s = (s==) `eq` ((TL.pack s==) . TL.pack)
+s_Ord s = (compare s) `eq` (compare (S.streamList s) . S.streamList)
+ where _types = s :: String
+sf_Ord p s =
+ ((compare $ L.filter p s) . L.filter p) `eq`
+ (compare (S.filter p $ S.streamList s) . S.filter p . S.streamList)
+t_Ord s = (compare s) `eq` (compare (T.pack s) . T.pack)
+tl_Ord s = (compare s) `eq` (compare (TL.pack s) . TL.pack)
+t_Read = id `eq` (T.unpack . read . show)
+tl_Read = id `eq` (TL.unpack . read . show)
+t_Show = show `eq` (show . T.pack)
+tl_Show = show `eq` (show . TL.pack)
+t_mappend s = mappend s`eqP` (unpackS . mappend (T.pack s))
+tl_mappend s = mappend s`eqP` (unpackS . mappend (TL.pack s))
+t_mconcat = unsquare $
+ mconcat `eq` (unpackS . mconcat . L.map T.pack)
+tl_mconcat = unsquare $
+ mconcat `eq` (unpackS . mconcat . L.map TL.pack)
+t_mempty = mempty === (unpackS (mempty :: T.Text))
+tl_mempty = mempty === (unpackS (mempty :: TL.Text))
+t_IsString = fromString `eqP` (T.unpack . fromString)
+tl_IsString = fromString `eqP` (TL.unpack . fromString)
+
+s_cons x = (x:) `eqP` (unpackS . S.cons x)
+s_cons_s x = (x:) `eqP` (unpackS . S.unstream . S.cons x)
+sf_cons p x = ((x:) . L.filter p) `eqP` (unpackS . S.cons x . S.filter p)
+t_cons x = (x:) `eqP` (unpackS . T.cons x)
+tl_cons x = (x:) `eqP` (unpackS . TL.cons x)
+s_snoc x = (++ [x]) `eqP` (unpackS . (flip S.snoc) x)
+t_snoc x = (++ [x]) `eqP` (unpackS . (flip T.snoc) x)
+tl_snoc x = (++ [x]) `eqP` (unpackS . (flip TL.snoc) x)
+s_append s = (s++) `eqP` (unpackS . S.append (S.streamList s))
+s_append_s s = (s++) `eqP`
+ (unpackS . S.unstream . S.append (S.streamList s))
+sf_append p s = (L.filter p s++) `eqP`
+ (unpackS . S.append (S.filter p $ S.streamList s))
+t_append s = (s++) `eqP` (unpackS . T.append (packS s))
+
+uncons (x:xs) = Just (x,xs)
+uncons _ = Nothing
+
+s_uncons = uncons `eqP` (fmap (second unpackS) . S.uncons)
+sf_uncons p = (uncons . L.filter p) `eqP`
+ (fmap (second unpackS) . S.uncons . S.filter p)
+t_uncons = uncons `eqP` (fmap (second unpackS) . T.uncons)
+tl_uncons = uncons `eqP` (fmap (second unpackS) . TL.uncons)
+
+unsnoc xs@(_:_) = Just (init xs, last xs)
+unsnoc [] = Nothing
+
+t_unsnoc = unsnoc `eqP` (fmap (first unpackS) . T.unsnoc)
+tl_unsnoc = unsnoc `eqP` (fmap (first unpackS) . TL.unsnoc)
+
+s_head = head `eqP` S.head
+sf_head p = (head . L.filter p) `eqP` (S.head . S.filter p)
+t_head = head `eqP` T.head
+tl_head = head `eqP` TL.head
+s_last = last `eqP` S.last
+sf_last p = (last . L.filter p) `eqP` (S.last . S.filter p)
+t_last = last `eqP` T.last
+tl_last = last `eqP` TL.last
+s_tail = tail `eqP` (unpackS . S.tail)
+s_tail_s = tail `eqP` (unpackS . S.unstream . S.tail)
+sf_tail p = (tail . L.filter p) `eqP` (unpackS . S.tail . S.filter p)
+t_tail = tail `eqP` (unpackS . T.tail)
+tl_tail = tail `eqP` (unpackS . TL.tail)
+s_init = init `eqP` (unpackS . S.init)
+s_init_s = init `eqP` (unpackS . S.unstream . S.init)
+sf_init p = (init . L.filter p) `eqP` (unpackS . S.init . S.filter p)
+t_init = init `eqP` (unpackS . T.init)
+tl_init = init `eqP` (unpackS . TL.init)
+s_null = null `eqP` S.null
+sf_null p = (null . L.filter p) `eqP` (S.null . S.filter p)
+t_null = null `eqP` T.null
+tl_null = null `eqP` TL.null
+s_length = length `eqP` S.length
+sf_length p = (length . L.filter p) `eqP` (S.length . S.filter p)
+sl_length = (fromIntegral . length) `eqP` SL.length
+t_length = length `eqP` T.length
+tl_length = L.genericLength `eqP` TL.length
+t_compareLength t = (compare (T.length t)) `eq` T.compareLength t
+tl_compareLength t= (compare (TL.length t)) `eq` TL.compareLength t
+
+s_map f = map f `eqP` (unpackS . S.map f)
+s_map_s f = map f `eqP` (unpackS . S.unstream . S.map f)
+sf_map p f = (map f . L.filter p) `eqP` (unpackS . S.map f . S.filter p)
+t_map f = map f `eqP` (unpackS . T.map f)
+tl_map f = map f `eqP` (unpackS . TL.map f)
+s_intercalate c = unsquare $
+ L.intercalate c `eq`
+ (unpackS . S.intercalate (packS c) . map packS)
+t_intercalate c = unsquare $
+ L.intercalate c `eq`
+ (unpackS . T.intercalate (packS c) . map packS)
+tl_intercalate c = unsquare $
+ L.intercalate c `eq`
+ (unpackS . TL.intercalate (TL.pack c) . map TL.pack)
+s_intersperse c = L.intersperse c `eqP`
+ (unpackS . S.intersperse c)
+s_intersperse_s c = L.intersperse c `eqP`
+ (unpackS . S.unstream . S.intersperse c)
+sf_intersperse p c= (L.intersperse c . L.filter p) `eqP`
+ (unpackS . S.intersperse c . S.filter p)
+t_intersperse c = unsquare $
+ L.intersperse c `eqP` (unpackS . T.intersperse c)
+tl_intersperse c = unsquare $
+ L.intersperse c `eqP` (unpackS . TL.intersperse c)
+t_transpose = unsquare $
+ L.transpose `eq` (map unpackS . T.transpose . map packS)
+tl_transpose = unsquare $
+ L.transpose `eq` (map unpackS . TL.transpose . map TL.pack)
+t_reverse = L.reverse `eqP` (unpackS . T.reverse)
+tl_reverse = L.reverse `eqP` (unpackS . TL.reverse)
+t_reverse_short n = L.reverse `eqP` (unpackS . S.reverse . shorten n . S.stream)
+
+t_replace s d = (L.intercalate d . splitOn s) `eqP`
+ (unpackS . T.replace (T.pack s) (T.pack d))
+tl_replace s d = (L.intercalate d . splitOn s) `eqP`
+ (unpackS . TL.replace (TL.pack s) (TL.pack d))
+
+splitOn :: (Eq a) => [a] -> [a] -> [[a]]
+splitOn pat src0
+ | l == 0 = error "splitOn: empty"
+ | otherwise = go src0
+ where
+ l = length pat
+ go src = search 0 src
+ where
+ search _ [] = [src]
+ search !n s@(_:s')
+ | pat `L.isPrefixOf` s = take n src : go (drop l s)
+ | otherwise = search (n+1) s'
+
+s_toCaseFold_length xs = S.length (S.toCaseFold s) >= length xs
+ where s = S.streamList xs
+sf_toCaseFold_length p xs =
+ (S.length . S.toCaseFold . S.filter p $ s) >= (length . L.filter p $ xs)
+ where s = S.streamList xs
+t_toCaseFold_length t = T.length (T.toCaseFold t) >= T.length t
+tl_toCaseFold_length t = TL.length (TL.toCaseFold t) >= TL.length t
+t_toLower_length t = T.length (T.toLower t) >= T.length t
+t_toLower_lower t = p (T.toLower t) >= p t
+ where p = T.length . T.filter isLower
+tl_toLower_lower t = p (TL.toLower t) >= p t
+ where p = TL.length . TL.filter isLower
+t_toUpper_length t = T.length (T.toUpper t) >= T.length t
+t_toUpper_upper t = p (T.toUpper t) >= p t
+ where p = T.length . T.filter isUpper
+tl_toUpper_upper t = p (TL.toUpper t) >= p t
+ where p = TL.length . TL.filter isUpper
+t_toTitle_title t = all (<= 1) (caps w)
+ where caps = fmap (T.length . T.filter isUpper) . T.words . T.toTitle
+ -- TIL: there exist uppercase-only letters
+ w = T.filter (\c -> if C.isUpper c then C.toLower c /= c else True) t
+t_toTitle_1stNotLower = and . notLow . T.toTitle . T.filter stable
+ where notLow = mapMaybe (fmap (not . isLower) . (T.find isLetter)) . T.words
+ -- Surprise! The Spanish/Portuguese ordinal indicators changed
+ -- from category Ll (letter, lowercase) to Lo (letter, other)
+ -- in Unicode 7.0
+ -- Oh, and there exist lowercase-only letters (see previous test)
+ stable c = if isLower c
+ then C.toUpper c /= c
+ else c /= '\170' && c /= '\186'
+
+justifyLeft k c xs = xs ++ L.replicate (k - length xs) c
+justifyRight m n xs = L.replicate (m - length xs) n ++ xs
+center k c xs
+ | len >= k = xs
+ | otherwise = L.replicate l c ++ xs ++ L.replicate r c
+ where len = length xs
+ d = k - len
+ r = d `div` 2
+ l = d - r
+
+s_justifyLeft k c = justifyLeft j c `eqP` (unpackS . S.justifyLeftI j c)
+ where j = fromIntegral (k :: Word8)
+s_justifyLeft_s k c = justifyLeft j c `eqP`
+ (unpackS . S.unstream . S.justifyLeftI j c)
+ where j = fromIntegral (k :: Word8)
+sf_justifyLeft p k c = (justifyLeft j c . L.filter p) `eqP`
+ (unpackS . S.justifyLeftI j c . S.filter p)
+ where j = fromIntegral (k :: Word8)
+t_justifyLeft k c = justifyLeft j c `eqP` (unpackS . T.justifyLeft j c)
+ where j = fromIntegral (k :: Word8)
+tl_justifyLeft k c = justifyLeft j c `eqP`
+ (unpackS . TL.justifyLeft (fromIntegral j) c)
+ where j = fromIntegral (k :: Word8)
+t_justifyRight k c = justifyRight j c `eqP` (unpackS . T.justifyRight j c)
+ where j = fromIntegral (k :: Word8)
+tl_justifyRight k c = justifyRight j c `eqP`
+ (unpackS . TL.justifyRight (fromIntegral j) c)
+ where j = fromIntegral (k :: Word8)
+t_center k c = center j c `eqP` (unpackS . T.center j c)
+ where j = fromIntegral (k :: Word8)
+tl_center k c = center j c `eqP` (unpackS . TL.center (fromIntegral j) c)
+ where j = fromIntegral (k :: Word8)
+
+sf_foldl p f z = (L.foldl f z . L.filter p) `eqP` (S.foldl f z . S.filter p)
+ where _types = f :: Char -> Char -> Char
+t_foldl f z = L.foldl f z `eqP` (T.foldl f z)
+ where _types = f :: Char -> Char -> Char
+tl_foldl f z = L.foldl f z `eqP` (TL.foldl f z)
+ where _types = f :: Char -> Char -> Char
+sf_foldl' p f z = (L.foldl' f z . L.filter p) `eqP`
+ (S.foldl' f z . S.filter p)
+ where _types = f :: Char -> Char -> Char
+t_foldl' f z = L.foldl' f z `eqP` T.foldl' f z
+ where _types = f :: Char -> Char -> Char
+tl_foldl' f z = L.foldl' f z `eqP` TL.foldl' f z
+ where _types = f :: Char -> Char -> Char
+sf_foldl1 p f = (L.foldl1 f . L.filter p) `eqP` (S.foldl1 f . S.filter p)
+t_foldl1 f = L.foldl1 f `eqP` T.foldl1 f
+tl_foldl1 f = L.foldl1 f `eqP` TL.foldl1 f
+sf_foldl1' p f = (L.foldl1' f . L.filter p) `eqP` (S.foldl1' f . S.filter p)
+t_foldl1' f = L.foldl1' f `eqP` T.foldl1' f
+tl_foldl1' f = L.foldl1' f `eqP` TL.foldl1' f
+sf_foldr p f z = (L.foldr f z . L.filter p) `eqP` (S.foldr f z . S.filter p)
+ where _types = f :: Char -> Char -> Char
+t_foldr f z = L.foldr f z `eqP` T.foldr f z
+ where _types = f :: Char -> Char -> Char
+tl_foldr f z = unsquare $
+ L.foldr f z `eqP` TL.foldr f z
+ where _types = f :: Char -> Char -> Char
+sf_foldr1 p f = unsquare $
+ (L.foldr1 f . L.filter p) `eqP` (S.foldr1 f . S.filter p)
+t_foldr1 f = L.foldr1 f `eqP` T.foldr1 f
+tl_foldr1 f = unsquare $
+ L.foldr1 f `eqP` TL.foldr1 f
+
+s_concat_s = unsquare $
+ L.concat `eq` (unpackS . S.unstream . S.concat . map packS)
+sf_concat p = unsquare $
+ (L.concat . map (L.filter p)) `eq`
+ (unpackS . S.concat . map (S.filter p . packS))
+t_concat = unsquare $
+ L.concat `eq` (unpackS . T.concat . map packS)
+tl_concat = unsquare $
+ L.concat `eq` (unpackS . TL.concat . map TL.pack)
+sf_concatMap p f = unsquare $ (L.concatMap f . L.filter p) `eqP`
+ (unpackS . S.concatMap (packS . f) . S.filter p)
+t_concatMap f = unsquare $
+ L.concatMap f `eqP` (unpackS . T.concatMap (packS . f))
+tl_concatMap f = unsquare $
+ L.concatMap f `eqP` (unpackS . TL.concatMap (TL.pack . f))
+sf_any q p = (L.any p . L.filter q) `eqP` (S.any p . S.filter q)
+t_any p = L.any p `eqP` T.any p
+tl_any p = L.any p `eqP` TL.any p
+sf_all q p = (L.all p . L.filter q) `eqP` (S.all p . S.filter q)
+t_all p = L.all p `eqP` T.all p
+tl_all p = L.all p `eqP` TL.all p
+sf_maximum p = (L.maximum . L.filter p) `eqP` (S.maximum . S.filter p)
+t_maximum = L.maximum `eqP` T.maximum
+tl_maximum = L.maximum `eqP` TL.maximum
+sf_minimum p = (L.minimum . L.filter p) `eqP` (S.minimum . S.filter p)
+t_minimum = L.minimum `eqP` T.minimum
+tl_minimum = L.minimum `eqP` TL.minimum
+
+sf_scanl p f z = (L.scanl f z . L.filter p) `eqP`
+ (unpackS . S.scanl f z . S.filter p)
+t_scanl f z = L.scanl f z `eqP` (unpackS . T.scanl f z)
+tl_scanl f z = L.scanl f z `eqP` (unpackS . TL.scanl f z)
+t_scanl1 f = L.scanl1 f `eqP` (unpackS . T.scanl1 f)
+tl_scanl1 f = L.scanl1 f `eqP` (unpackS . TL.scanl1 f)
+t_scanr f z = L.scanr f z `eqP` (unpackS . T.scanr f z)
+tl_scanr f z = L.scanr f z `eqP` (unpackS . TL.scanr f z)
+t_scanr1 f = L.scanr1 f `eqP` (unpackS . T.scanr1 f)
+tl_scanr1 f = L.scanr1 f `eqP` (unpackS . TL.scanr1 f)
+
+t_mapAccumL f z = L.mapAccumL f z `eqP` (second unpackS . T.mapAccumL f z)
+ where _types = f :: Int -> Char -> (Int,Char)
+tl_mapAccumL f z = L.mapAccumL f z `eqP` (second unpackS . TL.mapAccumL f z)
+ where _types = f :: Int -> Char -> (Int,Char)
+t_mapAccumR f z = L.mapAccumR f z `eqP` (second unpackS . T.mapAccumR f z)
+ where _types = f :: Int -> Char -> (Int,Char)
+tl_mapAccumR f z = L.mapAccumR f z `eqP` (second unpackS . TL.mapAccumR f z)
+ where _types = f :: Int -> Char -> (Int,Char)
+
+tl_repeat n = (L.take m . L.repeat) `eq`
+ (unpackS . TL.take (fromIntegral m) . TL.repeat)
+ where m = fromIntegral (n :: Word8)
+
+replicate n l = concat (L.replicate n l)
+
+s_replicate n = replicate m `eq`
+ (unpackS . S.replicateI (fromIntegral m) . packS)
+ where m = fromIntegral (n :: Word8)
+t_replicate n = replicate m `eq` (unpackS . T.replicate m . packS)
+ where m = fromIntegral (n :: Word8)
+tl_replicate n = replicate m `eq`
+ (unpackS . TL.replicate (fromIntegral m) . packS)
+ where m = fromIntegral (n :: Word8)
+
+tl_cycle n = (L.take m . L.cycle) `eq`
+ (unpackS . TL.take (fromIntegral m) . TL.cycle . packS)
+ where m = fromIntegral (n :: Word8)
+
+tl_iterate f n = (L.take m . L.iterate f) `eq`
+ (unpackS . TL.take (fromIntegral m) . TL.iterate f)
+ where m = fromIntegral (n :: Word8)
+
+unf :: Int -> Char -> Maybe (Char, Char)
+unf n c | fromEnum c * 100 > n = Nothing
+ | otherwise = Just (c, succ c)
+
+t_unfoldr n = L.unfoldr (unf m) `eq` (unpackS . T.unfoldr (unf m))
+ where m = fromIntegral (n :: Word16)
+tl_unfoldr n = L.unfoldr (unf m) `eq` (unpackS . TL.unfoldr (unf m))
+ where m = fromIntegral (n :: Word16)
+t_unfoldrN n m = (L.take i . L.unfoldr (unf j)) `eq`
+ (unpackS . T.unfoldrN i (unf j))
+ where i = fromIntegral (n :: Word16)
+ j = fromIntegral (m :: Word16)
+tl_unfoldrN n m = (L.take i . L.unfoldr (unf j)) `eq`
+ (unpackS . TL.unfoldrN (fromIntegral i) (unf j))
+ where i = fromIntegral (n :: Word16)
+ j = fromIntegral (m :: Word16)
+
+unpack2 :: (Stringy s) => (s,s) -> (String,String)
+unpack2 = unpackS *** unpackS
+
+s_take n = L.take n `eqP` (unpackS . S.take n)
+s_take_s m = L.take n `eqP` (unpackS . S.unstream . S.take n)
+ where n = small m
+sf_take p n = (L.take n . L.filter p) `eqP`
+ (unpackS . S.take n . S.filter p)
+t_take n = L.take n `eqP` (unpackS . T.take n)
+t_takeEnd n = (L.reverse . L.take n . L.reverse) `eqP`
+ (unpackS . T.takeEnd n)
+tl_take n = L.take n `eqP` (unpackS . TL.take (fromIntegral n))
+tl_takeEnd n = (L.reverse . L.take (fromIntegral n) . L.reverse) `eqP`
+ (unpackS . TL.takeEnd n)
+s_drop n = L.drop n `eqP` (unpackS . S.drop n)
+s_drop_s m = L.drop n `eqP` (unpackS . S.unstream . S.drop n)
+ where n = small m
+sf_drop p n = (L.drop n . L.filter p) `eqP`
+ (unpackS . S.drop n . S.filter p)
+t_drop n = L.drop n `eqP` (unpackS . T.drop n)
+t_dropEnd n = (L.reverse . L.drop n . L.reverse) `eqP`
+ (unpackS . T.dropEnd n)
+tl_drop n = L.drop n `eqP` (unpackS . TL.drop (fromIntegral n))
+tl_dropEnd n = (L.reverse . L.drop n . L.reverse) `eqP`
+ (unpackS . TL.dropEnd (fromIntegral n))
+s_take_drop m = (L.take n . L.drop n) `eqP` (unpackS . S.take n . S.drop n)
+ where n = small m
+s_take_drop_s m = (L.take n . L.drop n) `eqP`
+ (unpackS . S.unstream . S.take n . S.drop n)
+ where n = small m
+s_takeWhile p = L.takeWhile p `eqP` (unpackS . S.takeWhile p)
+s_takeWhile_s p = L.takeWhile p `eqP` (unpackS . S.unstream . S.takeWhile p)
+sf_takeWhile q p = (L.takeWhile p . L.filter q) `eqP`
+ (unpackS . S.takeWhile p . S.filter q)
+noMatch = do
+ c <- char
+ d <- suchThat char (/= c)
+ return (c,d)
+t_takeWhile p = L.takeWhile p `eqP` (unpackS . T.takeWhile p)
+tl_takeWhile p = L.takeWhile p `eqP` (unpackS . TL.takeWhile p)
+t_takeWhileEnd p = (L.reverse . L.takeWhile p . L.reverse) `eqP`
+ (unpackS . T.takeWhileEnd p)
+t_takeWhileEnd_null t = forAll noMatch $ \(c,d) -> T.null $
+ T.takeWhileEnd (==d) (T.snoc t c)
+tl_takeWhileEnd p = (L.reverse . L.takeWhile p . L.reverse) `eqP`
+ (unpackS . TL.takeWhileEnd p)
+tl_takeWhileEnd_null t = forAll noMatch $ \(c,d) -> TL.null $
+ TL.takeWhileEnd (==d) (TL.snoc t c)
+s_dropWhile p = L.dropWhile p `eqP` (unpackS . S.dropWhile p)
+s_dropWhile_s p = L.dropWhile p `eqP` (unpackS . S.unstream . S.dropWhile p)
+sf_dropWhile q p = (L.dropWhile p . L.filter q) `eqP`
+ (unpackS . S.dropWhile p . S.filter q)
+t_dropWhile p = L.dropWhile p `eqP` (unpackS . T.dropWhile p)
+tl_dropWhile p = L.dropWhile p `eqP` (unpackS . TL.dropWhile p)
+t_dropWhileEnd p = (L.reverse . L.dropWhile p . L.reverse) `eqP`
+ (unpackS . T.dropWhileEnd p)
+tl_dropWhileEnd p = (L.reverse . L.dropWhile p . L.reverse) `eqP`
+ (unpackS . TL.dropWhileEnd p)
+t_dropAround p = (L.dropWhile p . L.reverse . L.dropWhile p . L.reverse)
+ `eqP` (unpackS . T.dropAround p)
+tl_dropAround p = (L.dropWhile p . L.reverse . L.dropWhile p . L.reverse)
+ `eqP` (unpackS . TL.dropAround p)
+t_stripStart = T.dropWhile isSpace `eq` T.stripStart
+tl_stripStart = TL.dropWhile isSpace `eq` TL.stripStart
+t_stripEnd = T.dropWhileEnd isSpace `eq` T.stripEnd
+tl_stripEnd = TL.dropWhileEnd isSpace `eq` TL.stripEnd
+t_strip = T.dropAround isSpace `eq` T.strip
+tl_strip = TL.dropAround isSpace `eq` TL.strip
+t_splitAt n = L.splitAt n `eqP` (unpack2 . T.splitAt n)
+tl_splitAt n = L.splitAt n `eqP` (unpack2 . TL.splitAt (fromIntegral n))
+t_span p = L.span p `eqP` (unpack2 . T.span p)
+tl_span p = L.span p `eqP` (unpack2 . TL.span p)
+
+t_breakOn_id s = squid `eq` (uncurry T.append . T.breakOn s)
+ where squid t | T.null s = error "empty"
+ | otherwise = t
+tl_breakOn_id s = squid `eq` (uncurry TL.append . TL.breakOn s)
+ where squid t | TL.null s = error "empty"
+ | otherwise = t
+t_breakOn_start (NotEmpty s) t =
+ let (k,m) = T.breakOn s t
+ in k `T.isPrefixOf` t && (T.null m || s `T.isPrefixOf` m)
+tl_breakOn_start (NotEmpty s) t =
+ let (k,m) = TL.breakOn s t
+ in k `TL.isPrefixOf` t && (TL.null m || s `TL.isPrefixOf` m)
+t_breakOnEnd_end (NotEmpty s) t =
+ let (m,k) = T.breakOnEnd s t
+ in k `T.isSuffixOf` t && (T.null m || s `T.isSuffixOf` m)
+tl_breakOnEnd_end (NotEmpty s) t =
+ let (m,k) = TL.breakOnEnd s t
+ in k `TL.isSuffixOf` t && (TL.null m || s `TL.isSuffixOf` m)
+t_break p = L.break p `eqP` (unpack2 . T.break p)
+tl_break p = L.break p `eqP` (unpack2 . TL.break p)
+t_group = L.group `eqP` (map unpackS . T.group)
+tl_group = L.group `eqP` (map unpackS . TL.group)
+t_groupBy p = L.groupBy p `eqP` (map unpackS . T.groupBy p)
+tl_groupBy p = L.groupBy p `eqP` (map unpackS . TL.groupBy p)
+t_inits = L.inits `eqP` (map unpackS . T.inits)
+tl_inits = L.inits `eqP` (map unpackS . TL.inits)
+t_tails = L.tails `eqP` (map unpackS . T.tails)
+tl_tails = unsquare $
+ L.tails `eqP` (map unpackS . TL.tails)
+t_findAppendId = unsquare $ \(NotEmpty s) ts ->
+ let t = T.intercalate s ts
+ in all (==t) $ map (uncurry T.append) (T.breakOnAll s t)
+tl_findAppendId = unsquare $ \(NotEmpty s) ts ->
+ let t = TL.intercalate s ts
+ in all (==t) $ map (uncurry TL.append) (TL.breakOnAll s t)
+t_findContains = unsquare $ \(NotEmpty s) ->
+ all (T.isPrefixOf s . snd) . T.breakOnAll s . T.intercalate s
+tl_findContains = unsquare $ \(NotEmpty s) -> all (TL.isPrefixOf s . snd) .
+ TL.breakOnAll s . TL.intercalate s
+sl_filterCount c = (L.genericLength . L.filter (==c)) `eqP` SL.countChar c
+t_findCount s = (L.length . T.breakOnAll s) `eq` T.count s
+tl_findCount s = (L.genericLength . TL.breakOnAll s) `eq` TL.count s
+
+t_splitOn_split s = unsquare $
+ (T.splitOn s `eq` Slow.splitOn s) . T.intercalate s
+tl_splitOn_split s = unsquare $
+ ((TL.splitOn (TL.fromStrict s) . TL.fromStrict) `eq`
+ (map TL.fromStrict . T.splitOn s)) . T.intercalate s
+t_splitOn_i (NotEmpty t) = id `eq` (T.intercalate t . T.splitOn t)
+tl_splitOn_i (NotEmpty t) = id `eq` (TL.intercalate t . TL.splitOn t)
+
+t_split p = split p `eqP` (map unpackS . T.split p)
+t_split_count c = (L.length . T.split (==c)) `eq`
+ ((1+) . T.count (T.singleton c))
+t_split_splitOn c = T.split (==c) `eq` T.splitOn (T.singleton c)
+tl_split p = split p `eqP` (map unpackS . TL.split p)
+
+split :: (a -> Bool) -> [a] -> [[a]]
+split _ [] = [[]]
+split p xs = loop xs
+ where loop s | null s' = [l]
+ | otherwise = l : loop (tail s')
+ where (l, s') = break p s
+
+t_chunksOf_same_lengths k = all ((==k) . T.length) . ini . T.chunksOf k
+ where ini [] = []
+ ini xs = init xs
+
+t_chunksOf_length k t = len == T.length t || (k <= 0 && len == 0)
+ where len = L.sum . L.map T.length $ T.chunksOf k t
+
+tl_chunksOf k = T.chunksOf k `eq` (map (T.concat . TL.toChunks) .
+ TL.chunksOf (fromIntegral k) . TL.fromStrict)
+
+t_lines = L.lines `eqP` (map unpackS . T.lines)
+tl_lines = L.lines `eqP` (map unpackS . TL.lines)
+{-
+t_lines' = lines' `eqP` (map unpackS . T.lines')
+ where lines' "" = []
+ lines' s = let (l, s') = break eol s
+ in l : case s' of
+ [] -> []
+ ('\r':'\n':s'') -> lines' s''
+ (_:s'') -> lines' s''
+ eol c = c == '\r' || c == '\n'
+-}
+t_words = L.words `eqP` (map unpackS . T.words)
+
+tl_words = L.words `eqP` (map unpackS . TL.words)
+t_unlines = unsquare $
+ L.unlines `eq` (unpackS . T.unlines . map packS)
+tl_unlines = unsquare $
+ L.unlines `eq` (unpackS . TL.unlines . map packS)
+t_unwords = unsquare $
+ L.unwords `eq` (unpackS . T.unwords . map packS)
+tl_unwords = unsquare $
+ L.unwords `eq` (unpackS . TL.unwords . map packS)
+
+s_isPrefixOf s = L.isPrefixOf s `eqP`
+ (S.isPrefixOf (S.stream $ packS s) . S.stream)
+sf_isPrefixOf p s = (L.isPrefixOf s . L.filter p) `eqP`
+ (S.isPrefixOf (S.stream $ packS s) . S.filter p . S.stream)
+t_isPrefixOf s = L.isPrefixOf s`eqP` T.isPrefixOf (packS s)
+tl_isPrefixOf s = L.isPrefixOf s`eqP` TL.isPrefixOf (packS s)
+t_isSuffixOf s = L.isSuffixOf s`eqP` T.isSuffixOf (packS s)
+tl_isSuffixOf s = L.isSuffixOf s`eqP` TL.isSuffixOf (packS s)
+t_isInfixOf s = L.isInfixOf s `eqP` T.isInfixOf (packS s)
+tl_isInfixOf s = L.isInfixOf s `eqP` TL.isInfixOf (packS s)
+
+t_stripPrefix s = (fmap packS . L.stripPrefix s) `eqP` T.stripPrefix (packS s)
+tl_stripPrefix s = (fmap packS . L.stripPrefix s) `eqP` TL.stripPrefix (packS s)
+
+stripSuffix p t = reverse `fmap` L.stripPrefix (reverse p) (reverse t)
+
+t_stripSuffix s = (fmap packS . stripSuffix s) `eqP` T.stripSuffix (packS s)
+tl_stripSuffix s = (fmap packS . stripSuffix s) `eqP` TL.stripSuffix (packS s)
+
+commonPrefixes a0@(_:_) b0@(_:_) = Just (go a0 b0 [])
+ where go (a:as) (b:bs) ps
+ | a == b = go as bs (a:ps)
+ go as bs ps = (reverse ps,as,bs)
+commonPrefixes _ _ = Nothing
+
+t_commonPrefixes a b (NonEmpty p)
+ = commonPrefixes pa pb ==
+ repack `fmap` T.commonPrefixes (packS pa) (packS pb)
+ where repack (x,y,z) = (unpackS x,unpackS y,unpackS z)
+ pa = p ++ a
+ pb = p ++ b
+
+tl_commonPrefixes a b (NonEmpty p)
+ = commonPrefixes pa pb ==
+ repack `fmap` TL.commonPrefixes (packS pa) (packS pb)
+ where repack (x,y,z) = (unpackS x,unpackS y,unpackS z)
+ pa = p ++ a
+ pb = p ++ b
+
+sf_elem p c = (L.elem c . L.filter p) `eqP` (S.elem c . S.filter p)
+sf_filter q p = (L.filter p . L.filter q) `eqP`
+ (unpackS . S.filter p . S.filter q)
+t_filter p = L.filter p `eqP` (unpackS . T.filter p)
+tl_filter p = L.filter p `eqP` (unpackS . TL.filter p)
+sf_findBy q p = (L.find p . L.filter q) `eqP` (S.findBy p . S.filter q)
+t_find p = L.find p `eqP` T.find p
+tl_find p = L.find p `eqP` TL.find p
+t_partition p = L.partition p `eqP` (unpack2 . T.partition p)
+tl_partition p = L.partition p `eqP` (unpack2 . TL.partition p)
+
+sf_index p s = forAll (choose (-l,l*2))
+ ((L.filter p s L.!!) `eq` S.index (S.filter p $ packS s))
+ where l = L.length s
+t_index s = forAll (choose (-l,l*2)) ((s L.!!) `eq` T.index (packS s))
+ where l = L.length s
+
+tl_index s = forAll (choose (-l,l*2))
+ ((s L.!!) `eq` (TL.index (packS s) . fromIntegral))
+ where l = L.length s
+
+t_findIndex p = L.findIndex p `eqP` T.findIndex p
+t_count (NotEmpty t) = (subtract 1 . L.length . T.splitOn t) `eq` T.count t
+tl_count (NotEmpty t) = (subtract 1 . L.genericLength . TL.splitOn t) `eq`
+ TL.count t
+t_zip s = L.zip s `eqP` T.zip (packS s)
+tl_zip s = L.zip s `eqP` TL.zip (packS s)
+sf_zipWith p c s = (L.zipWith c (L.filter p s) . L.filter p) `eqP`
+ (unpackS . S.zipWith c (S.filter p $ packS s) . S.filter p)
+t_zipWith c s = L.zipWith c s `eqP` (unpackS . T.zipWith c (packS s))
+tl_zipWith c s = L.zipWith c s `eqP` (unpackS . TL.zipWith c (packS s))
+
+t_indices (NotEmpty s) = Slow.indices s `eq` indices s
+tl_indices (NotEmpty s) = lazyIndices s `eq` S.indices s
+ where lazyIndices ss t = map fromIntegral $ Slow.indices (conc ss) (conc t)
+ conc = T.concat . TL.toChunks
+t_indices_occurs = unsquare $ \(NotEmpty t) ts ->
+ let s = T.intercalate t ts
+ in Slow.indices t s === indices t s
+
+-- Bit shifts.
+shiftL w = forAll (choose (0,width-1)) $ \k -> Bits.shiftL w k == U.shiftL w k
+ where width = round (log (fromIntegral m) / log 2 :: Double)
+ (m,_) = (maxBound, m == w)
+shiftR w = forAll (choose (0,width-1)) $ \k -> Bits.shiftR w k == U.shiftR w k
+ where width = round (log (fromIntegral m) / log 2 :: Double)
+ (m,_) = (maxBound, m == w)
+
+shiftL_Int = shiftL :: Int -> Property
+shiftL_Word16 = shiftL :: Word16 -> Property
+shiftL_Word32 = shiftL :: Word32 -> Property
+shiftR_Int = shiftR :: Int -> Property
+shiftR_Word16 = shiftR :: Word16 -> Property
+shiftR_Word32 = shiftR :: Word32 -> Property
+
+-- Builder.
+
+tb_singleton = id `eqP`
+ (unpackS . TB.toLazyText . mconcat . map TB.singleton)
+tb_fromText = L.concat `eq` (unpackS . TB.toLazyText . mconcat .
+ map (TB.fromText . packS))
+tb_associative s1 s2 s3 =
+ TB.toLazyText (b1 `mappend` (b2 `mappend` b3)) ==
+ TB.toLazyText ((b1 `mappend` b2) `mappend` b3)
+ where b1 = TB.fromText (packS s1)
+ b2 = TB.fromText (packS s2)
+ b3 = TB.fromText (packS s3)
+
+-- Numeric builder stuff.
+
+tb_decimal :: (Integral a, Show a) => a -> Bool
+tb_decimal = (TB.toLazyText . TB.decimal) `eq` (TL.pack . show)
+
+tb_decimal_integer (a::Integer) = tb_decimal a
+tb_decimal_integer_big (Big a) = tb_decimal a
+tb_decimal_int (a::Int) = tb_decimal a
+tb_decimal_int8 (a::Int8) = tb_decimal a
+tb_decimal_int16 (a::Int16) = tb_decimal a
+tb_decimal_int32 (a::Int32) = tb_decimal a
+tb_decimal_int64 (a::Int64) = tb_decimal a
+tb_decimal_word (a::Word) = tb_decimal a
+tb_decimal_word8 (a::Word8) = tb_decimal a
+tb_decimal_word16 (a::Word16) = tb_decimal a
+tb_decimal_word32 (a::Word32) = tb_decimal a
+tb_decimal_word64 (a::Word64) = tb_decimal a
+
+tb_decimal_big_int (BigBounded (a::Int)) = tb_decimal a
+tb_decimal_big_int64 (BigBounded (a::Int64)) = tb_decimal a
+tb_decimal_big_word (BigBounded (a::Word)) = tb_decimal a
+tb_decimal_big_word64 (BigBounded (a::Word64)) = tb_decimal a
+
+tb_hex :: (Integral a, Show a) => a -> Bool
+tb_hex = (TB.toLazyText . TB.hexadecimal) `eq` (TL.pack . flip showHex "")
+
+tb_hexadecimal_integer (a::Integer) = tb_hex a
+tb_hexadecimal_int (a::Int) = tb_hex a
+tb_hexadecimal_int8 (a::Int8) = tb_hex a
+tb_hexadecimal_int16 (a::Int16) = tb_hex a
+tb_hexadecimal_int32 (a::Int32) = tb_hex a
+tb_hexadecimal_int64 (a::Int64) = tb_hex a
+tb_hexadecimal_word (a::Word) = tb_hex a
+tb_hexadecimal_word8 (a::Word8) = tb_hex a
+tb_hexadecimal_word16 (a::Word16) = tb_hex a
+tb_hexadecimal_word32 (a::Word32) = tb_hex a
+tb_hexadecimal_word64 (a::Word64) = tb_hex a
+
+tb_realfloat :: (RealFloat a, Show a) => a -> Bool
+tb_realfloat = (TB.toLazyText . TB.realFloat) `eq` (TL.pack . show)
+
+tb_realfloat_float (a::Float) = tb_realfloat a
+tb_realfloat_double (a::Double) = tb_realfloat a
+
+showFloat :: (RealFloat a) => TB.FPFormat -> Maybe Int -> a -> ShowS
+showFloat TB.Exponent = showEFloat
+showFloat TB.Fixed = showFFloat
+showFloat TB.Generic = showGFloat
+
+tb_formatRealFloat :: (RealFloat a, Show a) =>
+ a -> TB.FPFormat -> Precision a -> Property
+tb_formatRealFloat a fmt prec =
+ TB.formatRealFloat fmt p a ===
+ TB.fromString (showFloat fmt p a "")
+ where p = precision a prec
+
+tb_formatRealFloat_float (a::Float) = tb_formatRealFloat a
+tb_formatRealFloat_double (a::Double) = tb_formatRealFloat a
+
+-- Reading.
+
+t_decimal (n::Int) s =
+ T.signed T.decimal (T.pack (show n) `T.append` t) === Right (n,t)
+ where t = T.dropWhile isDigit s
+tl_decimal (n::Int) s =
+ TL.signed TL.decimal (TL.pack (show n) `TL.append` t) === Right (n,t)
+ where t = TL.dropWhile isDigit s
+t_hexadecimal m s ox =
+ T.hexadecimal (T.concat [p, T.pack (showHex n ""), t]) === Right (n,t)
+ where t = T.dropWhile isHexDigit s
+ p = if ox then "0x" else ""
+ n = getPositive m :: Int
+tl_hexadecimal m s ox =
+ TL.hexadecimal (TL.concat [p, TL.pack (showHex n ""), t]) === Right (n,t)
+ where t = TL.dropWhile isHexDigit s
+ p = if ox then "0x" else ""
+ n = getPositive m :: Int
+
+isFloaty c = c `elem` ("+-.0123456789eE" :: String)
+
+t_read_rational p tol (n::Double) s =
+ case p (T.pack (show n) `T.append` t) of
+ Left _err -> False
+ Right (n',t') -> t == t' && abs (n-n') <= tol
+ where t = T.dropWhile isFloaty s
+
+tl_read_rational p tol (n::Double) s =
+ case p (TL.pack (show n) `TL.append` t) of
+ Left _err -> False
+ Right (n',t') -> t == t' && abs (n-n') <= tol
+ where t = TL.dropWhile isFloaty s
+
+t_double = t_read_rational T.double 1e-13
+tl_double = tl_read_rational TL.double 1e-13
+t_rational = t_read_rational T.rational 1e-16
+tl_rational = tl_read_rational TL.rational 1e-16
+
+-- Input and output.
+
+t_put_get = write_read T.unlines T.filter put get
+ where put h = withRedirect h IO.stdout . T.putStr
+ get h = withRedirect h IO.stdin T.getContents
+tl_put_get = write_read TL.unlines TL.filter put get
+ where put h = withRedirect h IO.stdout . TL.putStr
+ get h = withRedirect h IO.stdin TL.getContents
+t_write_read = write_read T.unlines T.filter T.hPutStr T.hGetContents
+tl_write_read = write_read TL.unlines TL.filter TL.hPutStr TL.hGetContents
+
+t_write_read_line e m b t = write_read head T.filter T.hPutStrLn
+ T.hGetLine e m b [t]
+tl_write_read_line e m b t = write_read head TL.filter TL.hPutStrLn
+ TL.hGetLine e m b [t]
+
+-- Low-level.
+
+t_dropWord8 m t = dropWord8 m t `T.isSuffixOf` t
+t_takeWord8 m t = takeWord8 m t `T.isPrefixOf` t
+t_take_drop_8 m t = T.append (takeWord8 n t) (dropWord8 n t) == t
+ where n = small m
+t_use_from t = monadicIO $ assert . (==t) =<< run (useAsPtr t fromPtr)
+
+t_copy t = T.copy t === t
+
+-- Regression tests.
+s_filter_eq s = S.filter p t == S.streamList (filter p s)
+ where p = (/= S.last t)
+ t = S.streamList s
+
+-- Make a stream appear shorter than it really is, to ensure that
+-- functions that consume inaccurately sized streams behave
+-- themselves.
+shorten :: Int -> S.Stream a -> S.Stream a
+shorten n t@(S.Stream arr off len)
+ | n > 0 = S.Stream arr off (smaller (exactSize n) len)
+ | otherwise = t
+
+tests :: Test
+tests =
+ testGroup "Properties" [
+ testGroup "creation/elimination" [
+ testProperty "t_pack_unpack" t_pack_unpack,
+ testProperty "tl_pack_unpack" tl_pack_unpack,
+ testProperty "t_stream_unstream" t_stream_unstream,
+ testProperty "tl_stream_unstream" tl_stream_unstream,
+ testProperty "t_reverse_stream" t_reverse_stream,
+ testProperty "t_singleton" t_singleton,
+ testProperty "tl_singleton" tl_singleton,
+ testProperty "tl_unstreamChunks" tl_unstreamChunks,
+ testProperty "tl_chunk_unchunk" tl_chunk_unchunk,
+ testProperty "tl_from_to_strict" tl_from_to_strict
+ ],
+
+ testGroup "transcoding" [
+ testProperty "t_ascii" t_ascii,
+ testProperty "tl_ascii" tl_ascii,
+ testProperty "t_latin1" t_latin1,
+ testProperty "tl_latin1" tl_latin1,
+ testProperty "t_utf8" t_utf8,
+ testProperty "t_utf8'" t_utf8',
+ testProperty "t_utf8_incr" t_utf8_incr,
+ testProperty "t_utf8_undecoded" t_utf8_undecoded,
+ testProperty "tl_utf8" tl_utf8,
+ testProperty "tl_utf8'" tl_utf8',
+ testProperty "t_utf16LE" t_utf16LE,
+ testProperty "tl_utf16LE" tl_utf16LE,
+ testProperty "t_utf16BE" t_utf16BE,
+ testProperty "tl_utf16BE" tl_utf16BE,
+ testProperty "t_utf32LE" t_utf32LE,
+ testProperty "tl_utf32LE" tl_utf32LE,
+ testProperty "t_utf32BE" t_utf32BE,
+ testProperty "tl_utf32BE" tl_utf32BE,
+ testGroup "errors" [
+ testProperty "t_utf8_err" t_utf8_err,
+ testProperty "t_utf8_err'" t_utf8_err'
+ ]
+ ],
+
+ testGroup "instances" [
+ testProperty "s_Eq" s_Eq,
+ testProperty "sf_Eq" sf_Eq,
+ testProperty "t_Eq" t_Eq,
+ testProperty "tl_Eq" tl_Eq,
+ testProperty "s_Ord" s_Ord,
+ testProperty "sf_Ord" sf_Ord,
+ testProperty "t_Ord" t_Ord,
+ testProperty "tl_Ord" tl_Ord,
+ testProperty "t_Read" t_Read,
+ testProperty "tl_Read" tl_Read,
+ testProperty "t_Show" t_Show,
+ testProperty "tl_Show" tl_Show,
+ testProperty "t_mappend" t_mappend,
+ testProperty "tl_mappend" tl_mappend,
+ testProperty "t_mconcat" t_mconcat,
+ testProperty "tl_mconcat" tl_mconcat,
+ testProperty "t_mempty" t_mempty,
+ testProperty "tl_mempty" tl_mempty,
+ testProperty "t_IsString" t_IsString,
+ testProperty "tl_IsString" tl_IsString
+ ],
+
+ testGroup "basics" [
+ testProperty "s_cons" s_cons,
+ testProperty "s_cons_s" s_cons_s,
+ testProperty "sf_cons" sf_cons,
+ testProperty "t_cons" t_cons,
+ testProperty "tl_cons" tl_cons,
+ testProperty "s_snoc" s_snoc,
+ testProperty "t_snoc" t_snoc,
+ testProperty "tl_snoc" tl_snoc,
+ testProperty "s_append" s_append,
+ testProperty "s_append_s" s_append_s,
+ testProperty "sf_append" sf_append,
+ testProperty "t_append" t_append,
+ testProperty "s_uncons" s_uncons,
+ testProperty "sf_uncons" sf_uncons,
+ testProperty "t_uncons" t_uncons,
+ testProperty "tl_uncons" tl_uncons,
+ testProperty "t_unsnoc" t_unsnoc,
+ testProperty "tl_unsnoc" tl_unsnoc,
+ testProperty "s_head" s_head,
+ testProperty "sf_head" sf_head,
+ testProperty "t_head" t_head,
+ testProperty "tl_head" tl_head,
+ testProperty "s_last" s_last,
+ testProperty "sf_last" sf_last,
+ testProperty "t_last" t_last,
+ testProperty "tl_last" tl_last,
+ testProperty "s_tail" s_tail,
+ testProperty "s_tail_s" s_tail_s,
+ testProperty "sf_tail" sf_tail,
+ testProperty "t_tail" t_tail,
+ testProperty "tl_tail" tl_tail,
+ testProperty "s_init" s_init,
+ testProperty "s_init_s" s_init_s,
+ testProperty "sf_init" sf_init,
+ testProperty "t_init" t_init,
+ testProperty "tl_init" tl_init,
+ testProperty "s_null" s_null,
+ testProperty "sf_null" sf_null,
+ testProperty "t_null" t_null,
+ testProperty "tl_null" tl_null,
+ testProperty "s_length" s_length,
+ testProperty "sf_length" sf_length,
+ testProperty "sl_length" sl_length,
+ testProperty "t_length" t_length,
+ testProperty "tl_length" tl_length,
+ testProperty "t_compareLength" t_compareLength,
+ testProperty "tl_compareLength" tl_compareLength
+ ],
+
+ testGroup "transformations" [
+ testProperty "s_map" s_map,
+ testProperty "s_map_s" s_map_s,
+ testProperty "sf_map" sf_map,
+ testProperty "t_map" t_map,
+ testProperty "tl_map" tl_map,
+ testProperty "s_intercalate" s_intercalate,
+ testProperty "t_intercalate" t_intercalate,
+ testProperty "tl_intercalate" tl_intercalate,
+ testProperty "s_intersperse" s_intersperse,
+ testProperty "s_intersperse_s" s_intersperse_s,
+ testProperty "sf_intersperse" sf_intersperse,
+ testProperty "t_intersperse" t_intersperse,
+ testProperty "tl_intersperse" tl_intersperse,
+ testProperty "t_transpose" t_transpose,
+ testProperty "tl_transpose" tl_transpose,
+ testProperty "t_reverse" t_reverse,
+ testProperty "tl_reverse" tl_reverse,
+ testProperty "t_reverse_short" t_reverse_short,
+ testProperty "t_replace" t_replace,
+ testProperty "tl_replace" tl_replace,
+
+ testGroup "case conversion" [
+ testProperty "s_toCaseFold_length" s_toCaseFold_length,
+ testProperty "sf_toCaseFold_length" sf_toCaseFold_length,
+ testProperty "t_toCaseFold_length" t_toCaseFold_length,
+ testProperty "tl_toCaseFold_length" tl_toCaseFold_length,
+ testProperty "t_toLower_length" t_toLower_length,
+ testProperty "t_toLower_lower" t_toLower_lower,
+ testProperty "tl_toLower_lower" tl_toLower_lower,
+ testProperty "t_toUpper_length" t_toUpper_length,
+ testProperty "t_toUpper_upper" t_toUpper_upper,
+ testProperty "tl_toUpper_upper" tl_toUpper_upper,
+ testProperty "t_toTitle_title" t_toTitle_title,
+ testProperty "t_toTitle_1stNotLower" t_toTitle_1stNotLower
+ ],
+
+ testGroup "justification" [
+ testProperty "s_justifyLeft" s_justifyLeft,
+ testProperty "s_justifyLeft_s" s_justifyLeft_s,
+ testProperty "sf_justifyLeft" sf_justifyLeft,
+ testProperty "t_justifyLeft" t_justifyLeft,
+ testProperty "tl_justifyLeft" tl_justifyLeft,
+ testProperty "t_justifyRight" t_justifyRight,
+ testProperty "tl_justifyRight" tl_justifyRight,
+ testProperty "t_center" t_center,
+ testProperty "tl_center" tl_center
+ ]
+ ],
+
+ testGroup "folds" [
+ testProperty "sf_foldl" sf_foldl,
+ testProperty "t_foldl" t_foldl,
+ testProperty "tl_foldl" tl_foldl,
+ testProperty "sf_foldl'" sf_foldl',
+ testProperty "t_foldl'" t_foldl',
+ testProperty "tl_foldl'" tl_foldl',
+ testProperty "sf_foldl1" sf_foldl1,
+ testProperty "t_foldl1" t_foldl1,
+ testProperty "tl_foldl1" tl_foldl1,
+ testProperty "t_foldl1'" t_foldl1',
+ testProperty "sf_foldl1'" sf_foldl1',
+ testProperty "tl_foldl1'" tl_foldl1',
+ testProperty "sf_foldr" sf_foldr,
+ testProperty "t_foldr" t_foldr,
+ testProperty "tl_foldr" tl_foldr,
+ testProperty "sf_foldr1" sf_foldr1,
+ testProperty "t_foldr1" t_foldr1,
+ testProperty "tl_foldr1" tl_foldr1,
+
+ testGroup "special" [
+ testProperty "s_concat_s" s_concat_s,
+ testProperty "sf_concat" sf_concat,
+ testProperty "t_concat" t_concat,
+ testProperty "tl_concat" tl_concat,
+ testProperty "sf_concatMap" sf_concatMap,
+ testProperty "t_concatMap" t_concatMap,
+ testProperty "tl_concatMap" tl_concatMap,
+ testProperty "sf_any" sf_any,
+ testProperty "t_any" t_any,
+ testProperty "tl_any" tl_any,
+ testProperty "sf_all" sf_all,
+ testProperty "t_all" t_all,
+ testProperty "tl_all" tl_all,
+ testProperty "sf_maximum" sf_maximum,
+ testProperty "t_maximum" t_maximum,
+ testProperty "tl_maximum" tl_maximum,
+ testProperty "sf_minimum" sf_minimum,
+ testProperty "t_minimum" t_minimum,
+ testProperty "tl_minimum" tl_minimum
+ ]
+ ],
+
+ testGroup "construction" [
+ testGroup "scans" [
+ testProperty "sf_scanl" sf_scanl,
+ testProperty "t_scanl" t_scanl,
+ testProperty "tl_scanl" tl_scanl,
+ testProperty "t_scanl1" t_scanl1,
+ testProperty "tl_scanl1" tl_scanl1,
+ testProperty "t_scanr" t_scanr,
+ testProperty "tl_scanr" tl_scanr,
+ testProperty "t_scanr1" t_scanr1,
+ testProperty "tl_scanr1" tl_scanr1
+ ],
+
+ testGroup "mapAccum" [
+ testProperty "t_mapAccumL" t_mapAccumL,
+ testProperty "tl_mapAccumL" tl_mapAccumL,
+ testProperty "t_mapAccumR" t_mapAccumR,
+ testProperty "tl_mapAccumR" tl_mapAccumR
+ ],
+
+ testGroup "unfolds" [
+ testProperty "tl_repeat" tl_repeat,
+ testProperty "s_replicate" s_replicate,
+ testProperty "t_replicate" t_replicate,
+ testProperty "tl_replicate" tl_replicate,
+ testProperty "tl_cycle" tl_cycle,
+ testProperty "tl_iterate" tl_iterate,
+ testProperty "t_unfoldr" t_unfoldr,
+ testProperty "tl_unfoldr" tl_unfoldr,
+ testProperty "t_unfoldrN" t_unfoldrN,
+ testProperty "tl_unfoldrN" tl_unfoldrN
+ ]
+ ],
+
+ testGroup "substrings" [
+ testGroup "breaking" [
+ testProperty "s_take" s_take,
+ testProperty "s_take_s" s_take_s,
+ testProperty "sf_take" sf_take,
+ testProperty "t_take" t_take,
+ testProperty "t_takeEnd" t_takeEnd,
+ testProperty "tl_take" tl_take,
+ testProperty "tl_takeEnd" tl_takeEnd,
+ testProperty "s_drop" s_drop,
+ testProperty "s_drop_s" s_drop_s,
+ testProperty "sf_drop" sf_drop,
+ testProperty "t_drop" t_drop,
+ testProperty "t_dropEnd" t_dropEnd,
+ testProperty "tl_drop" tl_drop,
+ testProperty "tl_dropEnd" tl_dropEnd,
+ testProperty "s_take_drop" s_take_drop,
+ testProperty "s_take_drop_s" s_take_drop_s,
+ testProperty "s_takeWhile" s_takeWhile,
+ testProperty "s_takeWhile_s" s_takeWhile_s,
+ testProperty "sf_takeWhile" sf_takeWhile,
+ testProperty "t_takeWhile" t_takeWhile,
+ testProperty "tl_takeWhile" tl_takeWhile,
+ testProperty "t_takeWhileEnd" t_takeWhileEnd,
+ testProperty "t_takeWhileEnd_null" t_takeWhileEnd_null,
+ testProperty "tl_takeWhileEnd" tl_takeWhileEnd,
+ testProperty "tl_takeWhileEnd_null" tl_takeWhileEnd_null,
+ testProperty "sf_dropWhile" sf_dropWhile,
+ testProperty "s_dropWhile" s_dropWhile,
+ testProperty "s_dropWhile_s" s_dropWhile_s,
+ testProperty "t_dropWhile" t_dropWhile,
+ testProperty "tl_dropWhile" tl_dropWhile,
+ testProperty "t_dropWhileEnd" t_dropWhileEnd,
+ testProperty "tl_dropWhileEnd" tl_dropWhileEnd,
+ testProperty "t_dropAround" t_dropAround,
+ testProperty "tl_dropAround" tl_dropAround,
+ testProperty "t_stripStart" t_stripStart,
+ testProperty "tl_stripStart" tl_stripStart,
+ testProperty "t_stripEnd" t_stripEnd,
+ testProperty "tl_stripEnd" tl_stripEnd,
+ testProperty "t_strip" t_strip,
+ testProperty "tl_strip" tl_strip,
+ testProperty "t_splitAt" t_splitAt,
+ testProperty "tl_splitAt" tl_splitAt,
+ testProperty "t_span" t_span,
+ testProperty "tl_span" tl_span,
+ testProperty "t_breakOn_id" t_breakOn_id,
+ testProperty "tl_breakOn_id" tl_breakOn_id,
+ testProperty "t_breakOn_start" t_breakOn_start,
+ testProperty "tl_breakOn_start" tl_breakOn_start,
+ testProperty "t_breakOnEnd_end" t_breakOnEnd_end,
+ testProperty "tl_breakOnEnd_end" tl_breakOnEnd_end,
+ testProperty "t_break" t_break,
+ testProperty "tl_break" tl_break,
+ testProperty "t_group" t_group,
+ testProperty "tl_group" tl_group,
+ testProperty "t_groupBy" t_groupBy,
+ testProperty "tl_groupBy" tl_groupBy,
+ testProperty "t_inits" t_inits,
+ testProperty "tl_inits" tl_inits,
+ testProperty "t_tails" t_tails,
+ testProperty "tl_tails" tl_tails
+ ],
+
+ testGroup "breaking many" [
+ testProperty "t_findAppendId" t_findAppendId,
+ testProperty "tl_findAppendId" tl_findAppendId,
+ testProperty "t_findContains" t_findContains,
+ testProperty "tl_findContains" tl_findContains,
+ testProperty "sl_filterCount" sl_filterCount,
+ testProperty "t_findCount" t_findCount,
+ testProperty "tl_findCount" tl_findCount,
+ testProperty "t_splitOn_split" t_splitOn_split,
+ testProperty "tl_splitOn_split" tl_splitOn_split,
+ testProperty "t_splitOn_i" t_splitOn_i,
+ testProperty "tl_splitOn_i" tl_splitOn_i,
+ testProperty "t_split" t_split,
+ testProperty "t_split_count" t_split_count,
+ testProperty "t_split_splitOn" t_split_splitOn,
+ testProperty "tl_split" tl_split,
+ testProperty "t_chunksOf_same_lengths" t_chunksOf_same_lengths,
+ testProperty "t_chunksOf_length" t_chunksOf_length,
+ testProperty "tl_chunksOf" tl_chunksOf
+ ],
+
+ testGroup "lines and words" [
+ testProperty "t_lines" t_lines,
+ testProperty "tl_lines" tl_lines,
+ --testProperty "t_lines'" t_lines',
+ testProperty "t_words" t_words,
+ testProperty "tl_words" tl_words,
+ testProperty "t_unlines" t_unlines,
+ testProperty "tl_unlines" tl_unlines,
+ testProperty "t_unwords" t_unwords,
+ testProperty "tl_unwords" tl_unwords
+ ]
+ ],
+
+ testGroup "predicates" [
+ testProperty "s_isPrefixOf" s_isPrefixOf,
+ testProperty "sf_isPrefixOf" sf_isPrefixOf,
+ testProperty "t_isPrefixOf" t_isPrefixOf,
+ testProperty "tl_isPrefixOf" tl_isPrefixOf,
+ testProperty "t_isSuffixOf" t_isSuffixOf,
+ testProperty "tl_isSuffixOf" tl_isSuffixOf,
+ testProperty "t_isInfixOf" t_isInfixOf,
+ testProperty "tl_isInfixOf" tl_isInfixOf,
+
+ testGroup "view" [
+ testProperty "t_stripPrefix" t_stripPrefix,
+ testProperty "tl_stripPrefix" tl_stripPrefix,
+ testProperty "t_stripSuffix" t_stripSuffix,
+ testProperty "tl_stripSuffix" tl_stripSuffix,
+ testProperty "t_commonPrefixes" t_commonPrefixes,
+ testProperty "tl_commonPrefixes" tl_commonPrefixes
+ ]
+ ],
+
+ testGroup "searching" [
+ testProperty "sf_elem" sf_elem,
+ testProperty "sf_filter" sf_filter,
+ testProperty "t_filter" t_filter,
+ testProperty "tl_filter" tl_filter,
+ testProperty "sf_findBy" sf_findBy,
+ testProperty "t_find" t_find,
+ testProperty "tl_find" tl_find,
+ testProperty "t_partition" t_partition,
+ testProperty "tl_partition" tl_partition
+ ],
+
+ testGroup "indexing" [
+ testProperty "sf_index" sf_index,
+ testProperty "t_index" t_index,
+ testProperty "tl_index" tl_index,
+ testProperty "t_findIndex" t_findIndex,
+ testProperty "t_count" t_count,
+ testProperty "tl_count" tl_count,
+ testProperty "t_indices" t_indices,
+ testProperty "tl_indices" tl_indices,
+ testProperty "t_indices_occurs" t_indices_occurs
+ ],
+
+ testGroup "zips" [
+ testProperty "t_zip" t_zip,
+ testProperty "tl_zip" tl_zip,
+ testProperty "sf_zipWith" sf_zipWith,
+ testProperty "t_zipWith" t_zipWith,
+ testProperty "tl_zipWith" tl_zipWith
+ ],
+
+ testGroup "regressions" [
+ testProperty "s_filter_eq" s_filter_eq
+ ],
+
+ testGroup "shifts" [
+ testProperty "shiftL_Int" shiftL_Int,
+ testProperty "shiftL_Word16" shiftL_Word16,
+ testProperty "shiftL_Word32" shiftL_Word32,
+ testProperty "shiftR_Int" shiftR_Int,
+ testProperty "shiftR_Word16" shiftR_Word16,
+ testProperty "shiftR_Word32" shiftR_Word32
+ ],
+
+ testGroup "builder" [
+ testProperty "tb_associative" tb_associative,
+ testGroup "decimal" [
+ testProperty "tb_decimal_int" tb_decimal_int,
+ testProperty "tb_decimal_int8" tb_decimal_int8,
+ testProperty "tb_decimal_int16" tb_decimal_int16,
+ testProperty "tb_decimal_int32" tb_decimal_int32,
+ testProperty "tb_decimal_int64" tb_decimal_int64,
+ testProperty "tb_decimal_integer" tb_decimal_integer,
+ testProperty "tb_decimal_integer_big" tb_decimal_integer_big,
+ testProperty "tb_decimal_word" tb_decimal_word,
+ testProperty "tb_decimal_word8" tb_decimal_word8,
+ testProperty "tb_decimal_word16" tb_decimal_word16,
+ testProperty "tb_decimal_word32" tb_decimal_word32,
+ testProperty "tb_decimal_word64" tb_decimal_word64,
+ testProperty "tb_decimal_big_int" tb_decimal_big_int,
+ testProperty "tb_decimal_big_word" tb_decimal_big_word,
+ testProperty "tb_decimal_big_int64" tb_decimal_big_int64,
+ testProperty "tb_decimal_big_word64" tb_decimal_big_word64
+ ],
+ testGroup "hexadecimal" [
+ testProperty "tb_hexadecimal_int" tb_hexadecimal_int,
+ testProperty "tb_hexadecimal_int8" tb_hexadecimal_int8,
+ testProperty "tb_hexadecimal_int16" tb_hexadecimal_int16,
+ testProperty "tb_hexadecimal_int32" tb_hexadecimal_int32,
+ testProperty "tb_hexadecimal_int64" tb_hexadecimal_int64,
+ testProperty "tb_hexadecimal_integer" tb_hexadecimal_integer,
+ testProperty "tb_hexadecimal_word" tb_hexadecimal_word,
+ testProperty "tb_hexadecimal_word8" tb_hexadecimal_word8,
+ testProperty "tb_hexadecimal_word16" tb_hexadecimal_word16,
+ testProperty "tb_hexadecimal_word32" tb_hexadecimal_word32,
+ testProperty "tb_hexadecimal_word64" tb_hexadecimal_word64
+ ],
+ testGroup "realfloat" [
+ testProperty "tb_realfloat_double" tb_realfloat_double,
+ testProperty "tb_realfloat_float" tb_realfloat_float,
+ testProperty "tb_formatRealFloat_float" tb_formatRealFloat_float,
+ testProperty "tb_formatRealFloat_double" tb_formatRealFloat_double
+ ],
+ testProperty "tb_fromText" tb_fromText,
+ testProperty "tb_singleton" tb_singleton
+ ],
+
+ testGroup "read" [
+ testProperty "t_decimal" t_decimal,
+ testProperty "tl_decimal" tl_decimal,
+ testProperty "t_hexadecimal" t_hexadecimal,
+ testProperty "tl_hexadecimal" tl_hexadecimal,
+ testProperty "t_double" t_double,
+ testProperty "tl_double" tl_double,
+ testProperty "t_rational" t_rational,
+ testProperty "tl_rational" tl_rational
+ ],
+
+ {-
+ testGroup "input-output" [
+ testProperty "t_write_read" t_write_read,
+ testProperty "tl_write_read" tl_write_read,
+ testProperty "t_write_read_line" t_write_read_line,
+ testProperty "tl_write_read_line" tl_write_read_line
+ -- These tests are subject to I/O race conditions when run under
+ -- test-framework-quickcheck2.
+ -- testProperty "t_put_get" t_put_get
+ -- testProperty "tl_put_get" tl_put_get
+ ],
+ -}
+
+ testGroup "lowlevel" [
+ testProperty "t_dropWord8" t_dropWord8,
+ testProperty "t_takeWord8" t_takeWord8,
+ testProperty "t_take_drop_8" t_take_drop_8,
+ testProperty "t_use_from" t_use_from,
+ testProperty "t_copy" t_copy
+ ],
+
+ testGroup "mul" Mul.tests
+ ]
diff --git a/tests/Tests/Properties/Mul.hs b/tests/Tests/Properties/Mul.hs
new file mode 100644
index 0000000..14877e5
--- /dev/null
+++ b/tests/Tests/Properties/Mul.hs
@@ -0,0 +1,40 @@
+{-# LANGUAGE ScopedTypeVariables #-}
+
+module Tests.Properties.Mul (tests) where
+
+import Control.Applicative ((<$>), pure)
+import Control.Exception as E (SomeException, catch, evaluate)
+import Data.Int (Int32, Int64)
+import Data.Text.Internal (mul, mul32, mul64)
+import System.IO.Unsafe (unsafePerformIO)
+import Test.Framework (Test)
+import Test.Framework.Providers.QuickCheck2 (testProperty)
+import Test.QuickCheck hiding ((.&.))
+
+mulRef :: (Integral a, Bounded a) => a -> a -> Maybe a
+mulRef a b
+ | ab < bot || ab > top = Nothing
+ | otherwise = Just (fromIntegral ab)
+ where ab = fromIntegral a * fromIntegral b
+ top = fromIntegral (maxBound `asTypeOf` a) :: Integer
+ bot = fromIntegral (minBound `asTypeOf` a) :: Integer
+
+eval :: (a -> b -> c) -> a -> b -> Maybe c
+eval f a b = unsafePerformIO $
+ (Just <$> evaluate (f a b)) `E.catch` (\(_::SomeException) -> pure Nothing)
+
+t_mul32 :: Int32 -> Int32 -> Property
+t_mul32 a b = mulRef a b === eval mul32 a b
+
+t_mul64 :: Int64 -> Int64 -> Property
+t_mul64 a b = mulRef a b === eval mul64 a b
+
+t_mul :: Int -> Int -> Property
+t_mul a b = mulRef a b === eval mul a b
+
+tests :: [Test]
+tests = [
+ testProperty "t_mul" t_mul
+ , testProperty "t_mul32" t_mul32
+ , testProperty "t_mul64" t_mul64
+ ]
diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs
new file mode 100644
index 0000000..4360484
--- /dev/null
+++ b/tests/Tests/QuickCheckUtils.hs
@@ -0,0 +1,368 @@
+-- | This module provides quickcheck utilities, e.g. arbitrary and show
+-- instances, and comparison functions, so we can focus on the actual properties
+-- in the 'Tests.Properties' module.
+--
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE TypeSynonymInstances #-}
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+-- workaround panic in GHC 8.4.2
+#if __GLASGOW_HASKELL__ == 804
+{-# OPTIONS_GHC -O0 #-}
+#endif
+
+module Tests.QuickCheckUtils
+ (
+ genUnicode
+ , unsquare
+ , smallArbitrary
+
+ , BigBounded(..)
+ , BigInt(..)
+ , NotEmpty(..)
+
+ , Small(..)
+ , small
+
+ , Precision(..)
+ , precision
+
+ , integralRandomR
+
+ , DecodeErr(..)
+ , genDecodeErr
+
+ , Stringy(..)
+ , eq
+ , eqP
+
+ , Encoding(..)
+
+ , write_read
+ ) where
+
+import Control.Applicative ((<$>))
+import Control.Arrow (first, (***))
+import Control.DeepSeq (NFData (..), deepseq)
+import Control.Exception (bracket)
+import Data.String (IsString, fromString)
+import Data.Text.Foreign (I8)
+import Data.Text.Lazy.Builder.RealFloat (FPFormat(..))
+import Data.Word (Word8, Word16)
+import Debug.Trace (trace)
+import System.Random (Random(..), RandomGen)
+import Test.QuickCheck hiding (Fixed(..), Small (..), (.&.))
+import Test.QuickCheck.Monadic (assert, monadicIO, run)
+import Test.QuickCheck.Unicode (string)
+import Tests.Utils
+import qualified Data.ByteString as B
+import qualified Data.Text as T
+import qualified Data.Text.Encoding.Error as T
+import qualified Data.Text.Internal.Fusion as TF
+import qualified Data.Text.Internal.Fusion.Common as TF
+import qualified Data.Text.Internal.Lazy as TL
+import qualified Data.Text.Internal.Lazy.Fusion as TLF
+import qualified Data.Text.Lazy as TL
+import qualified System.IO as IO
+
+#if !MIN_VERSION_base(4,4,0)
+import Data.Int (Int64)
+import Da