summaryrefslogtreecommitdiff
path: root/src/Data/Text/Short/Internal.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Data/Text/Short/Internal.hs')
-rw-r--r--src/Data/Text/Short/Internal.hs79
1 files changed, 74 insertions, 5 deletions
diff --git a/src/Data/Text/Short/Internal.hs b/src/Data/Text/Short/Internal.hs
index 0a61b6b..876985e 100644
--- a/src/Data/Text/Short/Internal.hs
+++ b/src/Data/Text/Short/Internal.hs
@@ -1,5 +1,6 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
+{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE RankNTypes #-}
@@ -48,6 +49,7 @@ module Data.Text.Short.Internal
, span
, spanEnd
+ , split
, intersperse
, intercalate
@@ -108,7 +110,11 @@ import Data.ByteString.Short (ShortByteString)
import qualified Data.ByteString.Short as BSS
import qualified Data.ByteString.Short.Internal as BSSI
import Data.Char (ord)
+import Data.Data (Data(..),constrIndex, Constr,
+ mkConstr, DataType, mkDataType,
+ Fixity(Prefix))
import Data.Hashable (Hashable)
+import Data.Typeable (Typeable)
import qualified Data.List as List
import Data.Maybe (fromMaybe, isNothing)
import Data.Semigroup
@@ -150,9 +156,30 @@ import qualified PrimOps
-- In comparison, the footprint of a boxed 'ShortText' is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1, 2, 3, or 4 bytes per code-point (due to the internal UTF-8 representation).
-- It can be shown that for realistic data <http://utf8everywhere.org/#asian UTF-16 has a space overhead of 50% over UTF-8>.
--
+-- __NOTE__: Prior to @text-short-0.1.3@, the 'Typeable' instance was not defined for GHC 7.8 (and older).
+--
-- @since 0.1
newtype ShortText = ShortText ShortByteString
- deriving (Monoid,Data.Semigroup.Semigroup,Hashable,NFData)
+ deriving (Hashable,Monoid,NFData,Data.Semigroup.Semigroup,Typeable)
+
+-- | This instance follows the same approach as the 'Data' instance of
+-- 'T.Text' (see the discussion referenced there for more details): it
+-- preserves the @[Char]@ data abstraction at the cost of inefficiency.
+--
+-- @since 0.1.3
+instance Data ShortText where
+ gfoldl f z txt = z fromString `f` (toString txt)
+ toConstr _ = packConstr
+ gunfold k z c = case constrIndex c of
+ 1 -> k (z fromString)
+ _ -> error "gunfold"
+ dataTypeOf _ = shortTextDataType
+
+packConstr :: Constr
+packConstr = mkConstr shortTextDataType "fromString" [] Prefix
+
+shortTextDataType :: DataType
+shortTextDataType = mkDataType "Data.Text.Short" [packConstr]
instance Eq ShortText where
{-# INLINE (==) #-}
@@ -326,13 +353,41 @@ findIndex p st = go 0 0
!sz = toB st
+
+-- | \(\mathcal{O}(n)\) Splits a string into components delimited by separators,
+-- where the predicate returns True for a separator element. The
+-- resulting components do not contain the separators. Two adjacent
+-- separators result in an empty component in the output, e.g.
+--
+-- >>> split (=='a') "aabbaca"
+-- ["","","bb","c",""]
+--
+-- >>> split (=='a') ""
+-- [""]
+--
+-- prop> intercalate (singleton c) (split (== c) t) == t
+--
+-- __NOTE__: 'split' never returns an empty list (it always yields at least one component), matching the semantics of its counterpart from "Data.Text".
+--
+-- @since 0.1.3
+split :: (Char -> Bool) -> ShortText -> [ShortText]
+split p st0 = go 0
+ where
+ go !ofs0 = case findOfs' p st0 ofs0 of
+ Just (ofs1,ofs2) -> slice st0 ofs0 (ofs1-ofs0) : go ofs2
+ Nothing
+ | ofs0 == 0 -> st0 : []
+ | otherwise -> slice st0 ofs0 (maxOfs-ofs0) : []
+
+ !maxOfs = toB st0
+
-- internal helper
{-# INLINE findOfs #-}
findOfs :: (Char -> Bool) -> ShortText -> B -> Maybe B
findOfs p st = go
where
go :: B -> Maybe B
- go !ofs | ofs >= sz = Nothing
+ go !ofs | ofs >= sz = Nothing
go !ofs | p c = Just ofs
| otherwise = go ofs'
where
@@ -340,6 +395,20 @@ findOfs p st = go
!sz = toB st
+{-# INLINE findOfs' #-}
+findOfs' :: (Char -> Bool) -> ShortText -> B -> Maybe (B,B)
+findOfs' p st = go
+ where
+ go :: B -> Maybe (B,B)
+ go !ofs | ofs >= sz = Nothing
+ go !ofs | p c = Just (ofs,ofs')
+ | otherwise = go ofs'
+ where
+ (c,ofs') = decodeCharAtOfs st ofs
+
+ !sz = toB st
+
+
{-# INLINE findOfsRev #-}
findOfsRev :: (Char -> Bool) -> ShortText -> B -> Maybe B
findOfsRev p st = go
@@ -744,7 +813,7 @@ foreign import ccall unsafe "hs_text_short_index_cp_rev" c_text_short_index_rev
-- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves.
--
--- @'splitAtOfs n t@ returns a pair of 'ShortText' with the following properties:
+-- @'splitAt' n t@ returns a pair of 'ShortText' with the following properties:
--
-- prop> length (fst (splitAt n t)) == min (length t) (max 0 n)
--
@@ -803,7 +872,7 @@ splitAtEnd i st
splitAtOfs :: B -> ShortText -> (ShortText,ShortText)
splitAtOfs ofs st
| ofs == 0 = (mempty,st)
- | ofs > stsz = (st,mempty)
+ | ofs >= stsz = (st,mempty)
| otherwise = (slice st 0 ofs, slice st ofs (stsz-ofs))
where
!stsz = toB st
@@ -1451,7 +1520,7 @@ instance GHC.Exts.IsList ShortText where
-- | __Note__: Surrogate pairs (@[U+D800 .. U+DFFF]@) in string literals are replaced by U+FFFD.
--
--- This matches the behaviour of 'IsString' instance for 'T.Text'.
+-- This matches the behaviour of 'S.IsString' instance for 'T.Text'.
instance S.IsString ShortText where
fromString = fromStringLit