summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog.md10
-rw-r--r--src-test/Tests.hs7
-rw-r--r--src/Data/Text/Short.hs4
-rw-r--r--src/Data/Text/Short/Internal.hs79
-rw-r--r--text-short.cabal12
5 files changed, 101 insertions, 11 deletions
diff --git a/ChangeLog.md b/ChangeLog.md
index fcc2942..9622e21 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,3 +1,13 @@
+## 0.1.3
+
+ * Add `Data ShortText` instance
+ * Define `Typeable ShortText` instance for GHC 7.8 as well
+ (NB: for GHC 7.10.3 and up `Typeable` instances are automatically
+ defined even when not mentioned explicitly in a `deriving` clause)
+ * Add `split` verb (the equivalent of `Data.Text.split`) to `Data.Text.Short` API
+
+ split :: (Char -> Bool) -> ShortText -> [ShortText]
+
## 0.1.2
* Add `IsList ShortText` and `PrintfArg ShortText` instances
diff --git a/src-test/Tests.hs b/src-test/Tests.hs
index 884dda8..51f3cac 100644
--- a/src-test/Tests.hs
+++ b/src-test/Tests.hs
@@ -72,6 +72,9 @@ qcProps = testGroup "Properties"
let t' = IUT.fromText t
mapBoth f (x,y) = (f x, f y)
in and [ mapBoth IUT.toText (IUT.splitAt i t') == T.splitAt i t | i <- [-5 .. 5+T.length t ] ]
+ , QC.testProperty "intercalate/split" $ \t c ->
+ let t' = IUT.fromText t
+ in IUT.intercalate (IUT.singleton c) (IUT.split (== c) t') == t'
, QC.testProperty "intersperse" $ \t c -> IUT.intersperse c (IUT.fromText t) == IUT.fromText (T.intersperse c t)
, QC.testProperty "intercalate" $ \t1 t2 -> IUT.intercalate (IUT.fromText t1) (map IUT.fromText t2) == IUT.fromText (T.intercalate t1 t2)
@@ -166,6 +169,10 @@ unitTests = testGroup "Unit-tests"
, testCase "singleton" $ [ c | c <- [minBound..maxBound], IUT.singleton c /= IUT.fromText (T.singleton c) ] @?= []
, testCase "splitAtEnd" $ IUT.splitAtEnd 1 "€€" @?= ("€","€")
+ , testCase "split#1" $ IUT.split (== 'a') "aabbaca" @?= ["", "", "bb", "c", ""]
+ , testCase "split#2" $ IUT.split (const False) "aabbaca" @?= ["aabbaca"]
+ , testCase "split#3" $ IUT.split (const True) "abc" @?= ["","","",""]
+ , testCase "split#4" $ IUT.split (const True) "" @?= [""]
, testCase "literal0" $ IUT.unpack testLit0 @?= []
, testCase "literal1" $ IUT.unpack testLit1 @?= ['€','\0','€','\0']
diff --git a/src/Data/Text/Short.hs b/src/Data/Text/Short.hs
index e34544e..3e9b1a0 100644
--- a/src/Data/Text/Short.hs
+++ b/src/Data/Text/Short.hs
@@ -76,6 +76,9 @@ module Data.Text.Short
, spanEnd
, breakEnd
+ -- ** Breaking into many substrings
+ , split
+
-- ** Suffix & Prefix operations
, stripPrefix
, stripSuffix
@@ -322,6 +325,7 @@ dropWhile p = snd . span p
dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
dropWhileEnd p = fst . spanEnd p
+
-- $setup
-- >>> :set -XOverloadedStrings
-- >>> import Text.Show.Functions ()
diff --git a/src/Data/Text/Short/Internal.hs b/src/Data/Text/Short/Internal.hs
index 0a61b6b..876985e 100644
--- a/src/Data/Text/Short/Internal.hs
+++ b/src/Data/Text/Short/Internal.hs
@@ -1,5 +1,6 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
+{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE RankNTypes #-}
@@ -48,6 +49,7 @@ module Data.Text.Short.Internal
, span
, spanEnd
+ , split
, intersperse
, intercalate
@@ -108,7 +110,11 @@ import Data.ByteString.Short (ShortByteString)
import qualified Data.ByteString.Short as BSS
import qualified Data.ByteString.Short.Internal as BSSI
import Data.Char (ord)
+import Data.Data (Data(..),constrIndex, Constr,
+ mkConstr, DataType, mkDataType,
+ Fixity(Prefix))
import Data.Hashable (Hashable)
+import Data.Typeable (Typeable)
import qualified Data.List as List
import Data.Maybe (fromMaybe, isNothing)
import Data.Semigroup
@@ -150,9 +156,30 @@ import qualified PrimOps
-- In comparison, the footprint of a boxed 'ShortText' is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1, 2, 3, or 4 bytes per code-point (due to the internal UTF-8 representation).
-- It can be shown that for realistic data <http://utf8everywhere.org/#asian UTF-16 has a space overhead of 50% over UTF-8>.
--
+-- __NOTE__: The `Typeable` instance isn't defined for GHC 7.8 (and older) prior to @text-short-0.1.3@
+--
-- @since 0.1
newtype ShortText = ShortText ShortByteString
- deriving (Monoid,Data.Semigroup.Semigroup,Hashable,NFData)
+ deriving (Hashable,Monoid,NFData,Data.Semigroup.Semigroup,Typeable)
+
+-- | Exposes a 'Data' instance abstraction similar to that of 'T.Text' (see
+-- discussion referenced there for more details), preserving the
+-- @[Char]@ data abstraction at the cost of inefficiency.
+--
+-- @since 0.1.3
+instance Data ShortText where
+ gfoldl f z txt = z fromString `f` (toString txt)
+ toConstr _ = packConstr
+ gunfold k z c = case constrIndex c of
+ 1 -> k (z fromString)
+ _ -> error "gunfold"
+ dataTypeOf _ = shortTextDataType
+
+packConstr :: Constr
+packConstr = mkConstr shortTextDataType "fromString" [] Prefix
+
+shortTextDataType :: DataType
+shortTextDataType = mkDataType "Data.Text.Short" [packConstr]
instance Eq ShortText where
{-# INLINE (==) #-}
@@ -326,13 +353,41 @@ findIndex p st = go 0 0
!sz = toB st
+
+-- | \(\mathcal{O}(n)\) Splits a string into components delimited by separators,
+-- where the predicate returns True for a separator element. The
+-- resulting components do not contain the separators. Two adjacent
+-- separators result in an empty component in the output. e.g.
+--
+-- >>> split (=='a') "aabbaca"
+-- ["","","bb","c",""]
+--
+-- >>> split (=='a') ""
+-- [""]
+--
+-- prop> intercalate (singleton c) (split (== c) t) == t
+--
+-- __NOTE__: 'split' never returns an empty list to match the semantics of its counterpart from "Data.Text".
+--
+-- @since 0.1.3
+split :: (Char -> Bool) -> ShortText -> [ShortText]
+split p st0 = go 0
+ where
+ go !ofs0 = case findOfs' p st0 ofs0 of
+ Just (ofs1,ofs2) -> slice st0 ofs0 (ofs1-ofs0) : go ofs2
+ Nothing
+ | ofs0 == 0 -> st0 : []
+ | otherwise -> slice st0 ofs0 (maxOfs-ofs0) : []
+
+ !maxOfs = toB st0
+
-- internal helper
{-# INLINE findOfs #-}
findOfs :: (Char -> Bool) -> ShortText -> B -> Maybe B
findOfs p st = go
where
go :: B -> Maybe B
- go !ofs | ofs >= sz = Nothing
+ go !ofs | ofs >= sz = Nothing
go !ofs | p c = Just ofs
| otherwise = go ofs'
where
@@ -340,6 +395,20 @@ findOfs p st = go
!sz = toB st
+{-# INLINE findOfs' #-}
+findOfs' :: (Char -> Bool) -> ShortText -> B -> Maybe (B,B)
+findOfs' p st = go
+ where
+ go :: B -> Maybe (B,B)
+ go !ofs | ofs >= sz = Nothing
+ go !ofs | p c = Just (ofs,ofs')
+ | otherwise = go ofs'
+ where
+ (c,ofs') = decodeCharAtOfs st ofs
+
+ !sz = toB st
+
+
{-# INLINE findOfsRev #-}
findOfsRev :: (Char -> Bool) -> ShortText -> B -> Maybe B
findOfsRev p st = go
@@ -744,7 +813,7 @@ foreign import ccall unsafe "hs_text_short_index_cp_rev" c_text_short_index_rev
-- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves.
--
--- @'splitAtOfs n t@ returns a pair of 'ShortText' with the following properties:
+-- @'splitAt' n t@ returns a pair of 'ShortText' with the following properties:
--
-- prop> length (fst (splitAt n t)) == min (length t) (max 0 n)
--
@@ -803,7 +872,7 @@ splitAtEnd i st
splitAtOfs :: B -> ShortText -> (ShortText,ShortText)
splitAtOfs ofs st
| ofs == 0 = (mempty,st)
- | ofs > stsz = (st,mempty)
+ | ofs >= stsz = (st,mempty)
| otherwise = (slice st 0 ofs, slice st ofs (stsz-ofs))
where
!stsz = toB st
@@ -1451,7 +1520,7 @@ instance GHC.Exts.IsList ShortText where
-- | __Note__: Surrogate pairs (@[U+D800 .. U+DFFF]@) in string literals are replaced by U+FFFD.
--
--- This matches the behaviour of 'IsString' instance for 'T.Text'.
+-- This matches the behaviour of 'S.IsString' instance for 'T.Text'.
instance S.IsString ShortText where
fromString = fromStringLit
diff --git a/text-short.cabal b/text-short.cabal
index 0b704c5..a72d7a2 100644
--- a/text-short.cabal
+++ b/text-short.cabal
@@ -1,7 +1,7 @@
cabal-version: 1.18
name: text-short
-version: 0.1.2
+version: 0.1.3
synopsis: Memory-efficient representation of Unicode text strings
license: BSD3
license-file: LICENSE
@@ -14,7 +14,7 @@ description: This package provides the 'ShortText' type which is suitabl
.
The main difference between 'Text' and 'ShortText' is that 'ShortText' uses UTF-8 instead of UTF-16 internally and also doesn't support zero-copy slicing (thereby saving 2 words). Consequently, the memory footprint of a (boxed) 'ShortText' value is 4 words (2 words when unboxed) plus the length of the UTF-8 encoded payload.
-tested-with: GHC==8.4.1, GHC==8.2.2, GHC==8.0.2, GHC==7.10.3, GHC==7.8.4
+tested-with: GHC==8.6.5, GHC==8.4.4, GHC==8.2.2, GHC==8.0.2, GHC==7.10.3, GHC==7.8.4
extra-source-files: ChangeLog.md
Source-Repository head
@@ -33,16 +33,16 @@ library
other-modules: Data.Text.Short.Internal
- build-depends: base >= 4.7 && < 4.12
+ build-depends: base >= 4.7 && < 4.13
, bytestring >= 0.10.4 && < 0.11
- , hashable >= 1.2.6 && < 1.3
+ , hashable >= 1.2.6 && < 1.4
, deepseq >= 1.3 && < 1.5
, text >= 1.0 && < 1.3
, binary >= 0.7.1 && < 0.9
, ghc-prim >= 0.3.1 && < 0.6
if !impl(ghc >= 8.0)
- build-depends: semigroups >= 0.18.2 && < 0.19
+ build-depends: semigroups >= 0.18.2 && < 0.20
-- GHC version specific PrimOps
if impl(ghc >= 8.4)
@@ -82,7 +82,7 @@ test-suite tests
, text
, text-short
-- deps which don't inherit constraints from library stanza:
- , tasty >= 1.0.0 && < 1.1
+ , tasty >= 1.0.0 && < 1.3
, tasty-quickcheck >= 0.10 && < 0.11
, tasty-hunit >= 0.10.0 && < 0.11
, quickcheck-instances >= 0.3.14 && < 0.4