summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHerbertValerioRiedel <>2018-03-04 21:11:00 (GMT)
committerhdiff <hdiff@hdiff.luite.com>2018-03-04 21:11:00 (GMT)
commitb339ab28e5b12540ebf6719c832742ad69b19823 (patch)
treebd3c15d24ffbd848e3ae3ae5c021044314e33c4b
parent1cc77942846e2f36d23bd209ac5622945cef3937 (diff)
version 0.1.20.1.2
-rw-r--r--ChangeLog.md68
-rw-r--r--cbits/cbits.c610
-rw-r--r--cbits/memcmp.c12
-rw-r--r--src-ghc708/PrimOps.hs23
-rw-r--r--src-ghc804/PrimOps.hs6
-rw-r--r--src-test/Tests.hs204
-rw-r--r--src/Data/Text/Short.hs296
-rw-r--r--src/Data/Text/Short/Internal.hs1345
-rw-r--r--src/Data/Text/Short/Partial.hs100
-rw-r--r--text-short.cabal42
10 files changed, 2592 insertions, 114 deletions
diff --git a/ChangeLog.md b/ChangeLog.md
index 1307299..fcc2942 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,10 +1,72 @@
-# Revision history for `text-short`
+## 0.1.2
-## 0.1
+ * Add `IsList ShortText` and `PrintfArg ShortText` instances
+ * Expose partial functions via new `Data.Text.Short.Partial` module
-* First version. Released on an unsuspecting world.
+ foldl1 :: (Char -> Char -> Char) -> ShortText -> Char
+ foldl1' :: (Char -> Char -> Char) -> ShortText -> Char
+ foldr1 :: (Char -> Char -> Char) -> ShortText -> Char
+ head :: ShortText -> Char
+ index :: ShortText -> Int -> Char
+ init :: ShortText -> ShortText
+ last :: ShortText -> Char
+ tail :: ShortText -> ShortText
+
+ * Add several `Data.Text` verbs to `Data.Text.Short` API
+
+ (!?) :: ShortText -> Int -> Maybe Char
+ all :: (Char -> Bool) -> ShortText -> Bool
+ any :: (Char -> Bool) -> ShortText -> Bool
+ append :: ShortText -> ShortText -> ShortText
+ break :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
+ breakEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
+ concat :: [ShortText] -> ShortText
+ cons :: Char -> ShortText -> ShortText
+ drop :: Int -> ShortText -> ShortText
+ dropAround :: (Char -> Bool) -> ShortText -> ShortText
+ dropEnd :: Int -> ShortText -> ShortText
+ dropWhile :: (Char -> Bool) -> ShortText -> ShortText
+ dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
+ empty :: ShortText
+ filter :: (Char -> Bool) -> ShortText -> ShortText
+ find :: (Char -> Bool) -> ShortText -> Maybe Char
+ findIndex :: (Char -> Bool) -> ShortText -> Maybe Int
+ foldl :: (a -> Char -> a) -> a -> ShortText -> a
+ foldl' :: (a -> Char -> a) -> a -> ShortText -> a
+ foldr :: (Char -> a -> a) -> a -> ShortText -> a
+ indexEndMaybe :: ShortText -> Int -> Maybe Char
+ indexMaybe :: ShortText -> Int -> Maybe Char
+ intercalate :: ShortText -> [ShortText] -> ShortText
+ intersperse :: Char -> ShortText -> ShortText
+ isPrefixOf :: ShortText -> ShortText -> Bool
+ isSuffixOf :: ShortText -> ShortText -> Bool
+ pack :: [Char] -> ShortText
+ replicate :: Int -> ShortText -> ShortText
+ reverse :: ShortText -> ShortText
+ singleton :: Char -> ShortText
+ snoc :: ShortText -> Char -> ShortText
+ span :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
+ spanEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
+ splitAt :: Int -> ShortText -> (ShortText, ShortText)
+ splitAtEnd :: Int -> ShortText -> (ShortText, ShortText)
+ stripPrefix :: ShortText -> ShortText -> Maybe ShortText
+ stripSuffix :: ShortText -> ShortText -> Maybe ShortText
+ take :: Int -> ShortText -> ShortText
+ takeEnd :: Int -> ShortText -> ShortText
+ takeWhile :: (Char -> Bool) -> ShortText -> ShortText
+ takeWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
+ uncons :: ShortText -> Maybe (Char, ShortText)
+ unpack :: ShortText -> [Char]
+ unsnoc :: ShortText -> Maybe (ShortText, Char)
+
+ * Optimise low-level primitives
+ * Add support for GHC 8.4
## 0.1.1
* Expose *unsafe* conversion API via `Data.Text.Short.Unsafe` module
* Minor documentation improvement
+
+## 0.1
+
+* First version. Released on an unsuspecting world.
diff --git a/cbits/cbits.c b/cbits/cbits.c
index 7870579..b1acc59 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -1,23 +1,23 @@
/*
* Copyright (c) 2017, Herbert Valerio Riedel
- *
+ *
* All rights reserved.
- *
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
- *
+ *
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
- *
+ *
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
- *
+ *
* * Neither the name of Herbert Valerio Riedel nor the names of other
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
- *
+ *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -31,28 +31,326 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#if !defined(NDEBUG)
+# warning assert(3) checks enabled
+#endif
+
#include <stdint.h>
+#include <stdbool.h>
#include <string.h>
+#include <stdlib.h>
#include <assert.h>
+#include <HsFFI.h>
+
+#if !defined(SIZEOF_VOID_P)
+# error <HsFFI.h> SIZEOF_VOID_P not defined
+#endif
+
+#if (SIZEOF_VOID_P) == 8
+const static bool is_64bit = true;
+#elif (SIZEOF_VOID_P) == 4
+const static bool is_64bit = false;
+#else
+# error unexpected SIZEOF_VOID_P value
+#endif
+
+#if (WORDS_BIGENDIAN)
+const static bool is_bigendian = true;
+#else
+const static bool is_bigendian = false;
+#endif
+
+#if defined(__GNUC__)
+# define likely(x) __builtin_expect(!!(x),1)
+# define unlikely(x) __builtin_expect(!!(x),0)
+#else
+# define likely(x) (x)
+# define unlikely(x) (x)
+#endif
+
+/* test whether octet in UTF-8 steam is not a continuation byte, i.e. a leading byte */
+#define utf8_lead_p(octet) (((octet) & 0xc0) != 0x80)
+
+/* 0 <= x <= 0x110000 */
+typedef HsWord codepoint_t;
/* Count number of code-points in well-formed utf8 string */
size_t
hs_text_short_length(const uint8_t buf[], const size_t n)
{
- size_t j,l = 0;
+ size_t j = 0;
+ size_t l = 0;
+
+ /* Both GCC & Clang are able to optimise the code below quite well at -O3 */
for (j = 0; j < n; j++)
- if ((buf[j] & 0xc0) != 0x80)
+ if (utf8_lead_p(buf[j]))
l++;
return l;
}
+/* Locate offset of j-th code-point in well-formed utf8 string
+ *
+ */
+size_t
+hs_text_short_index_ofs(const uint8_t buf[], const size_t n, const size_t i)
+{
+ if (!n)
+ return n;
+
+ size_t m = 0;
+ size_t j = 0;
+
+ for (;;) {
+ assert(m >= 0);
+ assert(j <= i);
+ assert(j <= m);
+
+ if (j == i)
+ return m; /* found */
+
+ if (i-j >= n-m)
+ return n; /* i-th char is >= buf+n */
+
+ assert(m < n);
+ const uint8_t b0 = buf[m];
+
+ if (!(b0 & 0x80))
+ m += 1; /* 0_______ */
+ else
+ switch(b0 >> 4) {
+ case 0xf: /* 11110___ */
+ m += 4;
+ break;
+ case 0xe: /* 1110____ */
+ m += 3;
+ break;
+ default: /* 110_____ */
+ m += 2;
+ break;
+ }
+
+ j += 1;
+ }
+
+ assert(0);
+}
+
+/* Locate offset of j-th code-point (in reverse direction) in
+ * well-formed utf8 string starting at end of buffer.
+ *
+ * The 0-th character from the end is the last character in the utf8
+ * string (if it exists).
+ *
+ * Returns original 'n' if out of bounds.
+ *
+ */
+size_t
+hs_text_short_index_ofs_rev(const uint8_t buf[], const size_t n, const size_t i)
+{
+ size_t m = n;
+ size_t j = i;
+
+ for (;;) {
+ assert(m <= n);
+ assert(j >= 0);
+
+ if (j >= m)
+ return n; /* i-th char is < buf */
+
+ /* if (m == i-j) /\* suffix is made up only of ASCII chars, so we can shortcut *\/ */
+ /* return 0; */
+
+ /* scan until octet does not match 10_ */
+ assert(m > 0);
+ if (!(buf[--m] & 0x80))
+ goto l_cont;
+
+ assert(m > 0);
+ if (utf8_lead_p(buf[--m])) {
+ assert ((buf[m] & 0xe0) == 0xc0); /* 110_ */
+ goto l_cont;
+ }
+
+ assert(m > 0);
+ if (utf8_lead_p(buf[--m])) {
+ assert ((buf[m] & 0xf0) == 0xe0); /* 1110_ */
+ goto l_cont;
+ }
+
+ /* this must be a non-10_ octet in a well-formed stream */
+ assert(m > 0);
+ m -= 1;
+
+ assert ((buf[m] & 0xf8) == 0xf0); /* 11110_ */
+
+ l_cont:
+ assert(utf8_lead_p(buf[m]));
+
+ if (!j)
+ return m; /* found */
+
+ j -= 1;
+ }
+
+ assert(0);
+}
+
+/* Decode UTF8 code units into code-point
+ * Assumes buf[] points to start of a valid UTF8-encoded code-point
+ */
+static inline uint32_t
+hs_text_short_decode_cp(const uint8_t buf[])
+{
+ /* 7 bits | 0xxxxxxx
+ * 11 bits | 110yyyyx 10xxxxxx
+ * 16 bits | 1110yyyy 10yxxxxx 10xxxxxx
+ * 21 bits | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
+ */
+
+ const uint8_t b0 = buf[0];
+
+ if (!(b0 & 0x80))
+ return b0;
+
+ uint32_t cp = 0;
+
+ switch(b0 >> 4) {
+ case 0xf: /* 11110___ */
+ assert((b0 & 0xf8) == 0xf0);
+ assert(!utf8_lead_p(buf[1]));
+ assert(!utf8_lead_p(buf[2]));
+ assert(!utf8_lead_p(buf[3]));
+ cp = ((uint32_t)(b0 & 0x07)) << (6+6+6);
+ cp |= ((uint32_t)(buf[1] & 0x3f)) << (6+6);
+ cp |= ((uint32_t)(buf[2] & 0x3f)) << 6;
+ cp |= buf[3] & 0x3f;
+ assert (cp > 0xffff); assert (cp < 0x110000);
+ return cp;
+
+ case 0xe: /* 1110____ */
+ assert(!utf8_lead_p(buf[1]));
+ assert(!utf8_lead_p(buf[2]));
+ cp = ((uint32_t)(b0 & 0x0f)) << (6+6);
+ cp |= ((uint32_t)(buf[1] & 0x3f)) << 6;
+ cp |= buf[2] & 0x3f;
+ assert (cp > 0x7ff); assert (cp < 0x10000);
+ assert (cp < 0xd800 || cp > 0xdfff);
+ return cp;
+
+ default: /* 110_____ */
+ assert((b0 & 0xe0) == 0xc0);
+ assert(!utf8_lead_p(buf[1]));
+ cp = ((uint32_t)(b0 & 0x1f)) << 6;
+ cp |= buf[1] & 0x3f;
+ assert (cp > 0x7f); assert (cp < 0x800);
+ return cp;
+ }
+}
+
+/* decode codepoint starting at buf[ofs] */
+codepoint_t
+hs_text_short_ofs_cp(const uint8_t buf[], const size_t ofs)
+{
+ return hs_text_short_decode_cp(buf+ofs);
+}
+
+/* reverse-decode codepoint starting at offset right after a code-point */
+codepoint_t
+hs_text_short_ofs_cp_rev(const uint8_t *buf, const size_t ofs)
+{
+ /* 7 bits | 0xxxxxxx
+ * 11 bits | 110yyyyx 10xxxxxx
+ * 16 bits | 1110yyyy 10yxxxxx 10xxxxxx
+ * 21 bits | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
+ */
+
+ buf = buf + ofs - 1;
+
+ /* this octet is either 10_ or 0_ */
+ uint32_t cp = *buf;
+
+ if (!(cp & 0x80))
+ return cp;
+
+ assert (!utf8_lead_p(cp));
+ cp &= 0x3f;
+
+ /* this octet is either 10_ or 110_ */
+ {
+ const uint8_t b = *(--buf);
+ assert (!utf8_lead_p(b) || ((b & 0xe0) == 0xc0));
+
+ cp |= (b & 0x3f) << 6;
+
+ if (b & 0x40) {
+ assert (cp > 0x7f); assert (cp < 0x800);
+ return cp;
+ }
+ }
+
+ /* this octet is either 10_ or 1110_ */
+ {
+ const uint8_t b = *(--buf);
+ assert (!utf8_lead_p(b) || ((b & 0xf0) == 0xe0));
+
+ if (b & 0x40) {
+ cp |= (b & 0xf) << 12;
+
+ assert (cp > 0x7ff); assert (cp < 0x10000);
+ assert (cp < 0xd800 || cp > 0xdfff);
+ return cp;
+ }
+
+ cp |= (b & 0x3f) << 12;
+ }
+
+ /* this octet must be 11110_ */
+ const uint8_t b = *(buf-1);
+ assert ((b & 0xf8) == 0xf0);
+
+ cp |= (b & 0x7) << 18;
+
+ assert (cp > 0xffff); assert (cp < 0x110000);
+ return cp;
+}
+
+/* Retrieve i-th code-point in (valid) UTF8 stream
+ *
+ * Returns -1 if out of bounds
+ */
+codepoint_t
+hs_text_short_index_cp(const uint8_t buf[], const size_t n, const size_t i)
+{
+ const size_t ofs = hs_text_short_index_ofs(buf, n, i);
+
+ if (ofs >= n)
+ return -1;
+
+ return hs_text_short_decode_cp(&buf[ofs]);
+}
+
+/* Retrieve i-th code-point in (valid) UTF8 stream
+ *
+ * Returns -1 if out of bounds
+ */
+codepoint_t
+hs_text_short_index_cp_rev(const uint8_t buf[], const size_t n, const size_t i)
+{
+ const size_t ofs = hs_text_short_index_ofs_rev(buf, n, i);
+
+ if (ofs >= n)
+ return -1;
+
+ return hs_text_short_decode_cp(&buf[ofs]);
+}
+
/* Validate UTF8 encoding
7 bits | 0xxxxxxx
-11 bits | 110yyyyx 10xxxxxx
-16 bits | 1110yyyy 10yxxxxx 10xxxxxx
-21 bits | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
+11 bits | 110yyyyx 10xxxxxx
+16 bits | 1110yyyy 10yxxxxx 10xxxxxx
+21 bits | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
Valid code-points:
@@ -60,9 +358,13 @@ Valid code-points:
Return values:
- 0 -> ok
- 1 -> invalid byte/code-point
- 2 -> truncated
+ 0 -> ok
+
+ 1 -> invalid byte/code-point
+
+ -1 -> truncated (1 byte missing)
+ -2 -> truncated (2 byte missing)
+ -3 -> truncated (3 byte missing)
*/
@@ -75,50 +377,66 @@ hs_text_short_is_valid_utf8(const uint8_t buf[], const size_t n)
const uint8_t b0 = buf[j++];
if (!(b0 & 0x80))
- continue;
+ continue; /* b0 elem [ 0x00 .. 0x7f ] */
- if ((b0 & 0xe0) == 0xc0) {
- if (!(b0 & 0x1e)) return 1; /* denorm */
- if (j >= n) return 2;
-
- /* b1 */
- if ((buf[j++] & 0xc0) != 0x80) return 1;
- continue;
+ if ((b0 & 0xe0) == 0xc0) { /* [ 0xc0 .. 0xdf ] */
+ if (!(b0 & 0x1e)) return 1; /* 0xc0 or 0xc1; denorm */
+ if (j >= n) return -1;
+
+ goto l_trail1; /* b1 */
}
- if ((b0 & 0xf0) == 0xe0) {
- if ((j+1) >= n) return 2;
+ if ((b0 & 0xf0) == 0xe0) { /* [ 0xe0 .. 0xef ] */
+ if ((j+1) >= n) return (n-(j+2));
const uint8_t b1 = buf[j++];
- if ((b1 & 0xc0) != 0x80) return 1;
+ if (utf8_lead_p(b1)) return 1; /* b1 elem [ 0x80 .. 0xbf ] */
+
+ /* if b0==0xe0: b1 elem [ 0xa0 .. 0xbf ] */
if (!((b0 & 0x0f) | (b1 & 0x20))) return 1; /* denorm */
+
/* UTF16 Surrogate pairs [U+D800 .. U+DFFF] */
+ /* if b0==0xed: b1 elem [ 0x80 .. 0x9f ] */
if ((b0 == 0xed) && (b1 & 0x20)) return 1;
-
- /* b2 */
- if ((buf[j++] & 0xc0) != 0x80) return 1;
-
- continue;
+
+ goto l_trail1; /* b2 */
}
- if ((b0 & 0xf8) == 0xf0) {
- if ((j+2) >= n) return 2;
-
+ if ((b0 & 0xfc) == 0xf0) { /* [ 0xf0 .. 0xf3 ] */
+ if ((j+2) >= n) return (n-(j+3));
+
const uint8_t b1 = buf[j++];
- if ((b1 & 0xc0) != 0x80) return 1;
- if (!((b0 & 0x07) | (b1 & 0x30))) return 1; /* denorm */
- /* make sure we're below U+10FFFF */
- if (b0 > 0xf4) return 1;
- if ((b0 == 0xf4) && (b1 & 0x30)) return 1;
-
+
+ if (utf8_lead_p(b1)) /* b1 elem [ 0x80 .. 0xbf ] */
+ return 1;
+
+ if (!((b0 & 0x03) | (b1 & 0x30))) /* if b0==0xf0: b1 elem [ 0x90 .. 0xbf ] */
+ return 1;
+
+ goto l_trail2; /* b1, b2 */
+ }
+
+ if (b0 == 0xf4) {
+ if ((j+2) >= n) return (n-(j+3));
+
+ /* b1 */
+ if ((buf[j++] & 0xf0) != 0x80) return 1;
+ /* b1 elem [ 0x80 .. 0x8f ] */
+
+ l_trail2:
/* b2 */
- if ((buf[j++] & 0xc0) != 0x80) return 1;
+ if (utf8_lead_p(buf[j++])) return 1;
+ /* b2 elem [ 0x80 .. 0xbf ] */
+
+ l_trail1:
/* b3 */
- if ((buf[j++] & 0xc0) != 0x80) return 1;
+ if (utf8_lead_p(buf[j++])) return 1;
+ /* b3 elem [ 0x80 .. 0xbf ] */
continue;
}
-
+
+ /* invalid b0 byte */
return 1;
}
@@ -128,14 +446,212 @@ hs_text_short_is_valid_utf8(const uint8_t buf[], const size_t n)
}
-/* Test whether well-formed UTF8 string contains only ASCII code-points
- * Returns length of longest ASCII-code-point prefix.
+/* Returns length of longest ASCII-code-point prefix.
*/
size_t
-hs_text_short_is_ascii(const uint8_t buf[], const size_t n)
+hs_text_short_ascii_length(const uint8_t buf[], const size_t n)
{
- size_t j;
- for (j = 0; j < n; j++)
+ size_t j = 0;
+
+ if (is_64bit) {
+ /* "vectorized" optimisation checking 8 octets at once
+ *
+ * NB: A 64-bit aligned buffer is assumed. This is assumption is
+ * justified when the buffer is the payload of a `ByteArray#`.
+ */
+ const uint64_t *buf64 = (const uint64_t*)buf;
+
+ for (; (j+7) < n; j+=8, ++buf64)
+ if (*buf64 & UINT64_C(0x8080808080808080))
+ break;
+ } else {
+ /* "vectorized" optimisation checking 4 octets at once */
+ const uint32_t *buf32 = (const uint32_t*)buf;
+
+ for (; (j+3) < n; j+=4, ++buf32)
+ if (*buf32 & UINT64_C(0x80808080))
+ break;
+ }
+
+ for (; j < n; ++j)
if (buf[j] & 0x80)
return j;
+
+ return j;
+}
+
+/* Test whether well-formed UTF8 string contains only ASCII code-points
+ * returns 0 if not ASCII
+ *
+ * This code assumes a naturally aligned buf[]
+ */
+int
+hs_text_short_is_ascii(const uint8_t buf[], const size_t n)
+{
+ size_t j = 0;
+
+ if (n < 2)
+ return 1;
+
+ if (is_64bit) {
+ /* "vectorized" optimisation checking 8 octets at once
+ *
+ * NB: A 64-bit aligned buffer is assumed. This is assumption is
+ * justified when the buffer is the payload of a `ByteArray#`.
+ *
+ */
+ const uint64_t *buf64 = (const uint64_t*)buf;
+
+ for (; (j+7) < n; j+=8, ++buf64)
+ if (*buf64 & UINT64_C(0x8080808080808080))
+ return 0;
+
+ if (j < n) {
+ const int maskshift = (8 - (n - j)) << 3;
+ const uint64_t mask = is_bigendian ? (UINT64_C(0x8080808080808080) << maskshift) /* big endian */
+ : (UINT64_C(0x8080808080808080) >> maskshift); /* little endian */
+
+ if (*buf64 & mask)
+ return 0;
+ }
+ } else {
+ /* "vectorized" optimisation checking 4 octets at once */
+ const uint32_t *buf32 = (const uint32_t*)buf;
+
+ for (; (j+3) < n; j+=4, ++buf32)
+ if (*buf32 & UINT64_C(0x80808080))
+ return 0;
+
+ for (; j < n; ++j)
+ if (buf[j] & 0x80)
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Compute length of (transcoded) mutf8 literal
+ *
+ * If the mutf8 literal does not contain either surrogates nor escaped
+ * NULs, a positive length is returned which matches what strlen(3)
+ * would have returned.
+ *
+ * Otherwise, a negated size is returned which corresponds to the size
+ * of a the mutf8->utf8 transcoded string.
+ *
+ */
+HsInt
+hs_text_short_mutf8_strlen(const uint8_t buf[])
+{
+ size_t j = 0;
+ size_t nulls = 0;
+ bool surr_seen = false;
+
+ for (;;) {
+ const uint8_t b0 = buf[j];
+
+ if (unlikely(!b0))
+ break;
+
+ if (likely(!(b0 & 0x80)))
+ j += 1; /* 0_______ */
+ else
+ switch(b0 >> 4) {
+ case 0xf: /* 11110___ */
+ j += 4;
+ break;
+ case 0xe: /* 1110____ */
+ /* UTF16 Surrogate pairs [U+D800 .. U+DFFF] */
+ if (unlikely(!surr_seen && (b0 == 0xed) && (buf[j+1] & 0x20)))
+ surr_seen = true;
+ j += 3;
+ break;
+ default: /* 110_____ */
+ /* escaped NUL */
+ if (unlikely((b0 == 0xc0) && (buf[j+1] == 0x80)))
+ nulls += 1;
+ j += 2;
+ break;
+ }
+ } /* for */
+
+
+ if ((nulls > 0) || surr_seen)
+ return -(HsInt)(j - nulls);
+
+ return j;
+}
+
+/* Transcode Modified UTF-8 to proper UTF-8
+ *
+ * This involves
+ *
+ * 1. Unescape denormal 2-byte NULs (0xC0 0x80)
+ * 2. Rewrite surrogate pairs to U+FFFD
+ */
+void
+hs_text_short_mutf8_trans(const uint8_t src0[], uint8_t dst0[])
+{
+ const uint8_t *src = src0;
+ uint8_t *dst = dst0;
+
+ for (;;) {
+ const uint8_t b0 = *src++;
+ assert(utf8_lead_p(b0));
+
+ if (likely(!(b0 & 0x80))) { /* 0_______ */
+ if (unlikely(!b0))
+ break;
+
+ *dst++ = b0;
+ continue;
+ }
+
+ switch(b0 >> 4) {
+ case 0xf: /* 11110___ */
+ assert(!utf8_lead_p(src[0]));
+ assert(!utf8_lead_p(src[1]));
+ assert(!utf8_lead_p(src[2]));
+ *dst++ = b0;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ break;
+
+ case 0xe: { /* 1110____ */
+ const uint8_t b1 = *src++;
+ const uint8_t b2 = *src++;
+ assert(!utf8_lead_p(b1));
+ assert(!utf8_lead_p(b2));
+ if (unlikely((b0 == 0xed) && (b1 & 0x20))) {
+ /* UTF16 Surrogate pairs [U+D800 .. U+DFFF]
+ * -> translate into U+FFFD
+ */
+ *dst++ = 0xef;
+ *dst++ = 0xbf;
+ *dst++ = 0xbd;
+ } else {
+ *dst++ = b0;
+ *dst++ = b1;
+ *dst++ = b2;
+ }
+ break;
+ }
+ default: { /* 110_____ */
+ const uint8_t b1 = *src++;
+ assert(!utf8_lead_p(b1));
+ if (unlikely((b0 == 0xc0) && (b1 == 0x80))) {
+ /* escaped/denormal U+0000 -> normalize */
+ *dst++ = 0x00;
+ } else {
+ *dst++ = b0;
+ *dst++ = b1;
+ }
+ break;
+ }
+ } /* switch */
+ } /* for */
+
+ assert(labs(hs_text_short_mutf8_strlen(src0)) == (dst - dst0));
}
diff --git a/cbits/memcmp.c b/cbits/memcmp.c
new file mode 100644
index 0000000..4e5042f
--- /dev/null
+++ b/cbits/memcmp.c
@@ -0,0 +1,12 @@
+#include <string.h>
+
+int
+hs_text_short_memcmp(const void *s1, const size_t s1ofs, const void *s2, const size_t s2ofs, const size_t n)
+{
+ if (!n) return 0;
+
+ const void *s1_ = s1+s1ofs;
+ const void *s2_ = s2+s2ofs;
+
+ return (s1_ == s2_) ? 0 : memcmp(s1_, s2_, n);
+}
diff --git a/src-ghc708/PrimOps.hs b/src-ghc708/PrimOps.hs
new file mode 100644
index 0000000..601bc07
--- /dev/null
+++ b/src-ghc708/PrimOps.hs
@@ -0,0 +1,23 @@
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnliftedFFITypes #-}
+{-# LANGUAGE Unsafe #-}
+
+module PrimOps ( compareByteArrays# ) where
+
+import Foreign.C.Types (CInt (..), CSize (..))
+import GHC.Exts (Int (I#))
+import GHC.Exts (ByteArray#, Int#)
+import System.IO.Unsafe (unsafeDupablePerformIO)
+
+-- | Emulate GHC 8.4's 'GHC.Prim.compareByteArrays#'
+compareByteArrays# :: ByteArray# -> Int# -> ByteArray# -> Int# -> Int# -> Int#
+compareByteArrays# ba1# ofs1# ba2# ofs2# n#
+ = unI (fromIntegral (unsafeDupablePerformIO (c_memcmp ba1# ofs1 ba2# ofs2 n)))
+ where
+ unI (I# i#) = i#
+ ofs1 = fromIntegral (I# ofs1#)
+ ofs2 = fromIntegral (I# ofs2#)
+ n = fromIntegral (I# n#)
+
+foreign import ccall unsafe "hs_text_short_memcmp"
+ c_memcmp :: ByteArray# -> CSize -> ByteArray# -> CSize -> CSize -> IO CInt
diff --git a/src-ghc804/PrimOps.hs b/src-ghc804/PrimOps.hs
new file mode 100644
index 0000000..b4faa72
--- /dev/null
+++ b/src-ghc804/PrimOps.hs
@@ -0,0 +1,6 @@
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE Unsafe #-}
+
+module PrimOps ( compareByteArrays# ) where
+
+import GHC.Exts (compareByteArrays#)
diff --git a/src-test/Tests.hs b/src-test/Tests.hs
index ad4e91e..884dda8 100644
--- a/src-test/Tests.hs
+++ b/src-test/Tests.hs
@@ -1,46 +1,224 @@
+{-# LANGUAGE OverloadedLists #-}
{-# LANGUAGE OverloadedStrings #-}
module Main(main) where
-import qualified Data.Text.Short as IUT
-import Test.Tasty
-import Test.Tasty.QuickCheck as QC
-import Test.Tasty.HUnit
-import Test.QuickCheck.Instances ()
-import qualified Data.Text as T
-import qualified Data.Text.Encoding as T
-import qualified Data.String as D.S
-import Data.Binary
-import Data.Char
-import Data.Monoid
+import Data.Binary
+import Data.Char
+import Data.Maybe
+import Data.Monoid
+import qualified Data.String as D.S
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.Short as IUT
+import qualified Data.Text.Short.Partial as IUT
+import Test.QuickCheck.Instances ()
+import Test.Tasty
+import Test.Tasty.HUnit
+import Test.Tasty.QuickCheck as QC
+import Text.Show.Functions ()
fromByteStringRef = either (const Nothing) (Just . IUT.fromText) . T.decodeUtf8'
main :: IO ()
-main = defaultMain tests
+main = defaultMain (adjustOption (QuickCheckTests 50000 `max`) $ tests)
tests :: TestTree
tests = testGroup "Tests" [unitTests,qcProps]
+-- ShortText w/ in-bounds index
+data STI = STI IUT.ShortText Int
+ deriving (Eq,Show)
+
+newtype ST = ST IUT.ShortText
+ deriving (Eq,Show)
+
+instance Arbitrary STI where
+ arbitrary = do
+ t <- arbitrary
+ i <- choose (0, T.length t - 1)
+ return $! STI (IUT.fromText t) i
+
+instance Arbitrary ST where
+ arbitrary = fmap (ST . IUT.fromText) arbitrary
+ shrink (ST st) = map (ST . IUT.fromText) (shrink (IUT.toText st))
+
qcProps :: TestTree
qcProps = testGroup "Properties"
[ QC.testProperty "length/fromText" $ \t -> IUT.length (IUT.fromText t) == T.length t
, QC.testProperty "length/fromString" $ \s -> IUT.length (IUT.fromString s) == length s
+ , QC.testProperty "length/append" $ \(ST t1) (ST t2) -> IUT.length t1 + IUT.length t2 == IUT.length (IUT.append t1 t2)
+ , QC.testProperty "compare" $ \t1 t2 -> IUT.fromText t1 `compare` IUT.fromText t2 == t1 `compare` t2
+ , QC.testProperty "(==)" $ \t1 t2 -> (IUT.fromText t1 == IUT.fromText t2) == (t1 == t2)
+ , QC.testProperty "(!?)" $ \t ->
+ let t' = IUT.fromText t
+ in and ([ mapMaybe (t' IUT.!?) ([0 .. T.length t -1 ] :: [Int]) == T.unpack t
+ , mapMaybe (t' IUT.!?) [-5 .. -1] == []
+ , mapMaybe (t' IUT.!?) [T.length t .. T.length t + 5] == []
+ ] :: [Bool])
+ , QC.testProperty "indexEndMaybe" $ \t ->
+ let t' = IUT.fromText t
+ in and ([ mapMaybe (IUT.indexEndMaybe t') [0 .. T.length t -1 ] == T.unpack (T.reverse t)
+ , mapMaybe (IUT.indexEndMaybe t') [-5 .. -1] == []
+ , mapMaybe (IUT.indexEndMaybe t') [T.length t .. T.length t + 5] == []
+ ] :: [Bool])
, QC.testProperty "toText.fromText" $ \t -> (IUT.toText . IUT.fromText) t == t
, QC.testProperty "fromByteString" $ \b -> IUT.fromByteString b == fromByteStringRef b
, QC.testProperty "fromByteString.toByteString" $ \t -> let ts = IUT.fromText t in (IUT.fromByteString . IUT.toByteString) ts == Just ts
, QC.testProperty "toString.fromString" $ \s -> (IUT.toString . IUT.fromString) s == s
, QC.testProperty "isAscii" $ \s -> IUT.isAscii (IUT.fromString s) == all isAscii s
, QC.testProperty "isAscii2" $ \t -> IUT.isAscii (IUT.fromText t) == T.all isAscii t
+ , QC.testProperty "splitAt" $ \t ->
+ let t' = IUT.fromText t
+ mapBoth f (x,y) = (f x, f y)
+ in and [ mapBoth IUT.toText (IUT.splitAt i t') == T.splitAt i t | i <- [-5 .. 5+T.length t ] ]
+
+ , QC.testProperty "intersperse" $ \t c -> IUT.intersperse c (IUT.fromText t) == IUT.fromText (T.intersperse c t)
+ , QC.testProperty "intercalate" $ \t1 t2 -> IUT.intercalate (IUT.fromText t1) (map IUT.fromText t2) == IUT.fromText (T.intercalate t1 t2)
+ , QC.testProperty "reverse.singleton" $ \c -> IUT.reverse (IUT.singleton c) == IUT.singleton c
+ , QC.testProperty "reverse" $ \t -> IUT.reverse (IUT.fromText t) == IUT.fromText (T.reverse t)
+ , QC.testProperty "filter" $ \p t -> IUT.filter p (IUT.fromText t) == IUT.fromText (T.filter p t)
+ , QC.testProperty "replicate" $ \n t -> IUT.replicate n (IUT.fromText t) == IUT.fromText (T.replicate n t)
+ , QC.testProperty "dropAround" $ \p t -> IUT.dropAround p (IUT.fromText t) == IUT.fromText (T.dropAround p t)
+
+ , QC.testProperty "foldl" $ \f z t -> IUT.foldl f (z :: Char) (IUT.fromText t) == T.foldl f (z :: Char) t
+ , QC.testProperty "foldl #2" $ \t -> IUT.foldl (\n _ -> (n+1)) 0 (IUT.fromText t) == T.length t
+ , QC.testProperty "foldl #3" $ \t -> IUT.foldl (\s c -> c : s) [] (IUT.fromText t) == T.unpack (T.reverse t)
+
+ , QC.testProperty "foldl'" $ \f z t -> IUT.foldl' f (z :: Char) (IUT.fromText t) == T.foldl' f (z :: Char) t
+ , QC.testProperty "foldl' #2" $ \t -> IUT.foldl' (\n _ -> (n+1)) 0 (IUT.fromText t) == T.length t
+ , QC.testProperty "foldl' #3" $ \t -> IUT.foldl' (\s c -> c : s) [] (IUT.fromText t) == T.unpack (T.reverse t)
+
+ , QC.testProperty "foldr" $ \f z t -> IUT.foldr f (z :: Char) (IUT.fromText t) == T.foldr f (z :: Char) t
+ , QC.testProperty "foldr #2" $ \t -> IUT.foldr (\_ n -> (n+1)) 0 (IUT.fromText t) == T.length t
+ , QC.testProperty "foldr #3" $ \t -> IUT.foldr (:) [] (IUT.fromText t) == T.unpack t
+
+ , QC.testProperty "foldr1" $ \f t -> (not (T.null t)) ==> IUT.foldr1 f (IUT.fromText t) == T.foldr1 f t
+ , QC.testProperty "foldl1" $ \f t -> (not (T.null t)) ==> IUT.foldl1 f (IUT.fromText t) == T.foldl1 f t
+ , QC.testProperty "foldl1'" $ \f t -> (not (T.null t)) ==> IUT.foldl1' f (IUT.fromText t) == T.foldl1' f t
+
+ , QC.testProperty "splitAtEnd" $ \t ->
+ let t' = IUT.fromText t
+ n' = IUT.length t'
+ in and [ (IUT.splitAt (n'-i) t') == IUT.splitAtEnd i t' | i <- [-5 .. 5+n' ] ]
+
+ , QC.testProperty "find" $ \t -> IUT.find Data.Char.isAscii (IUT.fromText t) == T.find Data.Char.isAscii t
+ , QC.testProperty "findIndex" $ \t -> IUT.findIndex Data.Char.isAscii (IUT.fromText t) == T.findIndex Data.Char.isAscii t
+
+ , QC.testProperty "isSuffixOf" $ \t1 t2 -> IUT.fromText t1 `IUT.isSuffixOf` IUT.fromText t2 == t1 `T.isSuffixOf` t2
+ , QC.testProperty "isPrefixOf" $ \t1 t2 -> IUT.fromText t1 `IUT.isPrefixOf` IUT.fromText t2 == t1 `T.isPrefixOf` t2
+
+ , QC.testProperty "stripPrefix" $ \t1 t2 -> IUT.stripPrefix (IUT.fromText t1) (IUT.fromText t2) ==
+ fmap IUT.fromText (T.stripPrefix t1 t2)
+
+ , QC.testProperty "stripSuffix" $ \t1 t2 -> IUT.stripSuffix (IUT.fromText t1) (IUT.fromText t2) ==
+ fmap IUT.fromText (T.stripSuffix t1 t2)
+
+ , QC.testProperty "stripPrefix 2" $ \(STI t i) ->
+ let (pfx,sfx) = IUT.splitAt i t
+ in IUT.stripPrefix pfx t == Just sfx
+
+ , QC.testProperty "stripSuffix 2" $ \(STI t i) ->
+ let (pfx,sfx) = IUT.splitAt i t
+ in IUT.stripSuffix sfx t == Just pfx
+
+ , QC.testProperty "cons" $ \c t -> IUT.singleton c <> IUT.fromText t == IUT.cons c (IUT.fromText t)
+ , QC.testProperty "snoc" $ \c t -> IUT.fromText t <> IUT.singleton c == IUT.snoc (IUT.fromText t) c
+
+ , QC.testProperty "uncons" $ \c t -> IUT.uncons (IUT.singleton c <> IUT.fromText t) == Just (c, IUT.fromText t)
+
+ , QC.testProperty "unsnoc" $ \c t -> IUT.unsnoc (IUT.fromText t <> IUT.singleton c) == Just (IUT.fromText t, c)
+
+ , QC.testProperty "break" $ \t -> let (l,r) = IUT.break Data.Char.isAscii (IUT.fromText t)
+ in T.break Data.Char.isAscii t == (IUT.toText l,IUT.toText r)
+
+ , QC.testProperty "span" $ \t -> let (l,r) = IUT.span Data.Char.isAscii (IUT.fromText t)
+ in T.span Data.Char.isAscii t == (IUT.toText l,IUT.toText r)
+
+ , QC.testProperty "breakEnd" $ \t -> let (l,r) = IUT.breakEnd Data.Char.isAscii (IUT.fromText t)
+ in t_breakEnd Data.Char.isAscii t == (IUT.toText l,IUT.toText r)
+
+ , QC.testProperty "spanEnd" $ \t -> let (l,r) = IUT.spanEnd Data.Char.isAscii (IUT.fromText t)
+ in t_spanEnd Data.Char.isAscii t == (IUT.toText l,IUT.toText r)
+
+ , QC.testProperty "splitAt/isPrefixOf" $ \t ->
+ let t' = IUT.fromText t
+ in and [ IUT.isPrefixOf (fst (IUT.splitAt i t')) t' | i <- [-5 .. 5+T.length t ] ]
+ , QC.testProperty "splitAt/isSuffixOf" $ \t ->
+ let t' = IUT.fromText t
+ in and [ IUT.isSuffixOf (snd (IUT.splitAt i t')) t' | i <- [-5 .. 5+T.length t ] ]
]
+t_breakEnd p t = t_spanEnd (not . p) t
+t_spanEnd p t = (T.dropWhileEnd p t, T.takeWhileEnd p t)
+
unitTests = testGroup "Unit-tests"
[ testCase "fromText mempty" $ IUT.fromText mempty @?= mempty
, testCase "fromShortByteString [0xc0,0x80]" $ IUT.fromShortByteString "\xc0\x80" @?= Nothing
, testCase "fromByteString [0xc0,0x80]" $ IUT.fromByteString "\xc0\x80" @?= Nothing
+ , testCase "fromByteString [0xf0,0x90,0x80,0x80]" $ IUT.fromByteString "\xf0\x90\x80\x80" @?= Just "\x10000"
+ , testCase "fromByteString [0xf4,0x90,0x80,0x80]" $ IUT.fromByteString "\244\144\128\128" @?= Nothing
, testCase "IsString U+D800" $ "\xFFFD" @?= (IUT.fromString "\xD800")
-- , testCase "IsString U+D800" $ (IUT.fromString "\xD800") @?= IUT.fromText ("\xD800" :: T.Text)
, testCase "Binary.encode" $ encode ("Hello \8364 & \171581!\NUL" :: IUT.ShortText) @?= "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\DC2Hello \226\130\172 & \240\169\184\189!\NUL"
, testCase "Binary.decode" $ decode ("\NUL\NUL\NUL\NUL\NUL\NUL\NUL\DC2Hello \226\130\172 & \240\169\184\189!\NUL") @?= ("Hello \8364 & \171581!\NUL" :: IUT.ShortText)
- ]
+ , testCase "singleton" $ [ c | c <- [minBound..maxBound], IUT.singleton c /= IUT.fromText (T.singleton c) ] @?= []
+
+ , testCase "splitAtEnd" $ IUT.splitAtEnd 1 "€€" @?= ("€","€")
+
+ , testCase "literal0" $ IUT.unpack testLit0 @?= []
+ , testCase "literal1" $ IUT.unpack testLit1 @?= ['€','\0','€','\0']
+ , testCase "literal2" $ IUT.unpack testLit2 @?= ['\xFFFD','\xD7FF','\xFFFD','\xE000']
+ , testCase "literal3" $ IUT.unpack testLit3 @?= ['\1'..'\x7f']
+ , testCase "literal4" $ IUT.unpack testLit4 @?= map toEnum [0,1,126,127,128,129,130,256,2046,2047,2048,2049,2050,65530,65531,65532,65533,65534,65533,65535,65536,65537,65538,1114110,1114111]
+ , testCase "literal5" $ IUT.unpack testLit5 @?= map toEnum [28961]
+ , testCase "literal6" $ IUT.unpack testLit6 @?= map toEnum [0]
+ , testCase "literal7" $ IUT.unpack testLit7 @?= map toEnum [66328]
+ , testCase "literal8" $ IUT.unpack testLit8 @?= map toEnum [127]
+
+ -- list literals
+ , testCase "literal9" $ [] @?= ("" :: IUT.ShortText)
+ , testCase "literal10" $ ['¤','€','$'] @?= ("¤€$" :: IUT.ShortText)
+ , testCase "literal12" $ IUT.unpack ['\xD800','\xD7FF','\xDFFF','\xE000'] @?= ['\xFFFD','\xD7FF','\xFFFD','\xE000']
+ ]
+
+-- isScalar :: Char -> Bool
+-- isScalar c = c < '\xD800' || c >= '\xE000'
+
+
+{-# NOINLINE testLit0 #-}
+testLit0 :: IUT.ShortText
+testLit0 = ""
+
+{-# NOINLINE testLit1 #-}
+testLit1 :: IUT.ShortText
+testLit1 = "€\NUL€\NUL"
+
+{-# NOINLINE testLit2 #-}
+testLit2 :: IUT.ShortText
+testLit2 = "\xD800\xD7FF\xDFFF\xE000"
+
+{-# NOINLINE testLit3 #-}
+testLit3 :: IUT.ShortText
+testLit3 = "\SOH\STX\ETX\EOT\ENQ\ACK\a\b\t\n\v\f\r\SO\SI\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\EM\SUB\ESC\FS\GS\RS\US !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\DEL"
+
+{-# NOINLINE testLit4 #-}
+testLit4 :: IUT.ShortText
+testLit4 = "\NUL\SOH~\DEL\128\129\130\256\2046\2047\2048\2049\2050\65530\65531\65532\65533\65534\65533\65535\65536\65537\65538\1114110\1114111"
+
+{-# NOINLINE testLit5 #-}
+testLit5 :: IUT.ShortText
+testLit5 = "無"
+
+{-# NOINLINE testLit6 #-}
+testLit6 :: IUT.ShortText
+testLit6 = "\NUL"
+
+{-# NOINLINE testLit7 #-}
+testLit7 :: IUT.ShortText
+testLit7 = "𐌘"
+
+{-# NOINLINE testLit8 #-}
+testLit8 :: IUT.ShortText
+testLit8 = "\x7f"
diff --git a/src/Data/Text/Short.hs b/src/Data/Text/Short.hs
index f49c865..e34544e 100644
--- a/src/Data/Text/Short.hs
+++ b/src/Data/Text/Short.hs
@@ -8,25 +8,99 @@
-- Stability : stable
--
-- Memory-efficient representation of Unicode text strings.
+--
+-- This module is intended to be imported @qualified@, to avoid name
+-- clashes with "Prelude" functions, e.g.
+--
+-- > import qualified Data.Text.Short as TS
+-- > import qualified Data.Text.Short (ShortText)
+--
+-- This modules deliberately omits (common) partial functions, which
+-- can be found in "Data.Text.Short.Partial" instead.
+--
+-- @since 0.1
module Data.Text.Short
( -- * The 'ShortText' type
ShortText
-- * Basic operations
+ -- ** Construction
+ , empty
+ , singleton
+ , pack
+ , append
+ , concat
+ , cons
+ , snoc
+ , replicate
+
+ -- ** Deconstruction
+ , unpack
+ , uncons
+ , unsnoc
+
+ -- ** Querying & predicates
, null
, length
, isAscii
+ , all
+ , any
+ , find
+ , isPrefixOf
+ , isSuffixOf
+
+ -- ** Lookup & indexing
+ , (!?)
+ , indexMaybe
+ , indexEndMaybe
+ , findIndex
+
+ -- * Splitting 'ShortText's
+ -- ** Basic functions
+ , take
+ , takeEnd
+ , drop
+ , dropEnd
+ , takeWhile
+ , takeWhileEnd
+ , dropWhile
+ , dropWhileEnd
+
+ , dropAround
+
+ -- ** Pair-valued functions
+ , splitAt
+ , splitAtEnd
+ , span
+ , break
+ , spanEnd
+ , breakEnd
+
+ -- ** Suffix & Prefix operations
+ , stripPrefix
+ , stripSuffix
+
+ -- * Transformations
+ , intersperse
+ , intercalate
+ , reverse
+ , filter
+
+ -- * Folds
+ , foldl
+ , foldl'
+ , foldr
-- * Conversions
-- ** 'String'
, fromString
, toString
- -- ** 'T.Text'
+ -- ** 'Text'
, fromText
, toText
- -- ** 'BS.ByteString'
+ -- ** 'ByteString'
, fromShortByteString
, toShortByteString
@@ -37,5 +111,219 @@ module Data.Text.Short
) where
-import Data.Text.Short.Internal
-import Prelude ()
+import Data.Semigroup
+import Data.Text.Short.Internal
+import Prelude ()
+
+-- | \(\mathcal{O}(n)\) Variant of 'span' with negated predicate.
+--
+-- >>> break (> 'c') "abcdabcd"
+-- ("abc","dabcd")
+--
+-- prop> break p t == span (not . p) t
+--
+-- prop> fst (break p t) <> snd (break p t) == t
+--
+-- @since 0.1.2
+break :: (Char -> Bool) -> ShortText -> (ShortText,ShortText)
+break p st = span (not . p) st
+
+-- | \(\mathcal{O}(n)\) Variant of 'spanEnd' with negated predicate.
+--
+-- >>> breakEnd (< 'c') "abcdabcd"
+-- ("abcdab","cd")
+--
+-- prop> breakEnd p t == spanEnd (not . p) t
+--
+-- prop> fst (breakEnd p t) <> snd (breakEnd p t) == t
+--
+-- @since 0.1.2
+breakEnd :: (Char -> Bool) -> ShortText -> (ShortText,ShortText)
+breakEnd p st = spanEnd (not . p) st
+
+-- | \(\mathcal{O}(n)\) Index /i/-th code-point in 'ShortText'.
+--
+-- Infix operator alias of 'indexMaybe'
+--
+-- >>> "abcdefg" !? 2
+-- Just 'c'
+--
+-- @since 0.1.2
+(!?) :: ShortText -> Int -> Maybe Char
+(!?) = indexMaybe
+
+-- | \(\mathcal{O}(n)\) Test whether /any/ code points in 'ShortText' satisfy a predicate.
+--
+-- >>> any (> 'c') "abcdabcd"
+-- True
+--
+-- >>> any (const True) ""
+-- False
+--
+-- >>> any (== 'c') "abdabd"
+-- False
+--
+-- prop> any p t == not (all (not . p) t)
+--
+-- @since 0.1.2
+any :: (Char -> Bool) -> ShortText -> Bool
+any p st = case find p st of
+ Nothing -> False
+ Just _ -> True
+
+-- | \(\mathcal{O}(n)\) Concatenate two 'ShortText's
+--
+-- This is a type-specialised alias of '<>'.
+--
+-- >>> append "foo" "bar"
+-- "foobar"
+--
+-- prop> length (append t1 t2) == length t1 + length t2
+--
+-- @since 0.1.2
+append :: ShortText -> ShortText -> ShortText
+append = (<>)
+
+-- | \(\mathcal{O}(n)\) Concatenate list of 'ShortText's
+--
+-- This is a type-specialised alias of 'mconcat'.
+--
+-- >>> concat []
+-- ""
+--
+-- >>> concat ["foo","bar","doo"]
+-- "foobardoo"
+--
+-- @since 0.1.2
+concat :: [ShortText] -> ShortText
+concat = mconcat
+
+-- | \(\mathcal{O}(0)\) The /empty/ 'ShortText'.
+--
+-- This is a type-specialised alias of 'mempty'.
+--
+-- >>> empty
+-- ""
+--
+-- >>> null empty
+-- True
+--
+-- @since 0.1.2
+empty :: ShortText
+empty = mempty
+
+-- | \(\mathcal{O}(n)\) Construct a 'ShortText' from a list of 'Char's.
+--
+-- This is an alias for 'fromString'.
+--
+-- @since 0.1.2
+pack :: [Char] -> ShortText
+pack = fromString
+
+-- | \(\mathcal{O}(n)\) Convert 'ShortText' into a list of 'Char's.
+--
+-- This is an alias for 'toString'.
+--
+-- prop> (pack . unpack) t == t
+--
+-- @since 0.1.2
+unpack :: ShortText -> [Char]
+unpack = toString
+
+-- | \(\mathcal{O}(n)\) Take prefix of given length or return whole 'ShortText' if too short.
+--
+-- >>> take 3 "abcdef"
+-- "abc"
+--
+-- >>> take 3 "ab"
+-- "ab"
+--
+-- @since 0.1.2
+take :: Int -> ShortText -> ShortText
+take n = fst . splitAt n
+
+-- | \(\mathcal{O}(n)\) Take suffix of given length or return whole 'ShortText' if too short.
+--
+-- >>> takeEnd 3 "abcdefg"
+-- "efg"
+--
+-- >>> takeEnd 3 "ab"
+-- "ab"
+--
+-- @since 0.1.2
+takeEnd :: Int -> ShortText -> ShortText
+takeEnd n = snd . splitAtEnd n
+
+-- | \(\mathcal{O}(n)\) Take remove prefix of given length from 'ShortText' or return 'empty' 'ShortText' if too short.
+--
+-- >>> drop 4 "abcdef"
+-- "ef"
+--
+-- >>> drop 4 "ab"
+-- ""
+--
+-- @since 0.1.2
+drop :: Int -> ShortText -> ShortText
+drop n = snd . splitAt n
+
+-- | \(\mathcal{O}(n)\) Take remove suffix of given length from 'ShortText' or return 'empty' 'ShortText' if too short.
+--
+-- >>> drop 4 "abcdefghi"
+-- "efghi"
+--
+-- >>> drop 4 "ab"
+-- ""
+--
+-- @since 0.1.2
+dropEnd :: Int -> ShortText -> ShortText
+dropEnd n = fst . splitAtEnd n
+
+-- | \(\mathcal{O}(n)\) Take longest prefix satisfying given predicate.
+--
+-- prop> takeWhile p t == fst (span p t)
+--
+-- >>> takeWhile (< 'c') "abcdabcd"
+-- "ab"
+--
+-- @since 0.1.2
+takeWhile :: (Char -> Bool) -> ShortText -> ShortText
+takeWhile p = fst . span p
+
+-- | \(\mathcal{O}(n)\) Take longest suffix satisfying given predicate.
+--
+-- prop> takeWhileEnd p t == snd (spanEnd p t)
+--
+-- >>> takeWhileEnd (>= 'c') "abcdabcd"
+-- "cd"
+--
+-- @since 0.1.2
+takeWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
+takeWhileEnd p = snd . spanEnd p
+
+-- | \(\mathcal{O}(n)\) Remove longest prefix satisfying given predicate.
+--
+-- prop> dropWhile p t == snd (span p t)
+--
+-- >>> dropWhile (< 'c') "abcdabcd"
+-- "cdabcd"
+--
+-- @since 0.1.2
+dropWhile :: (Char -> Bool) -> ShortText -> ShortText
+dropWhile p = snd . span p
+
+-- | \(\mathcal{O}(n)\) Remove longest suffix satisfying given predicate.
+--
+-- prop> dropWhileEnd p t == fst (spanEnd p t)
+--
+-- >>> dropWhileEnd (>= 'c') "abcdabcd"
+-- "abcdab"
+--
+-- @since 0.1.2
+dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
+dropWhileEnd p = fst . spanEnd p
+
+-- $setup
+-- >>> :set -XOverloadedStrings
+-- >>> import Text.Show.Functions ()
+-- >>> import qualified Test.QuickCheck.Arbitrary as QC
+-- >>> instance QC.Arbitrary ShortText where { arbitrary = fmap fromString QC.arbitrary }
diff --git a/src/Data/Text/Short/Internal.hs b/src/Data/Text/Short/Internal.hs
index a6a912f..0a61b6b 100644
--- a/src/Data/Text/Short/Internal.hs
+++ b/src/Data/Text/Short/Internal.hs
@@ -1,4 +1,13 @@
-{-# LANGUAGE CPP, GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes, Unsafe #-}
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE RankNTypes #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE UnliftedFFITypes #-}
+{-# LANGUAGE Unsafe #-}
+{-# LANGUAGE ViewPatterns #-}
-- |
-- Module : Data.Text.Short.Internal
@@ -9,16 +18,56 @@
-- Stability : stable
--
-- Memory-efficient representation of Unicode text strings.
+--
+-- @since 0.1
module Data.Text.Short.Internal
( -- * The 'ShortText' type
ShortText(..)
-- * Basic operations
- , Data.Text.Short.Internal.null
- , Data.Text.Short.Internal.length
- , Data.Text.Short.Internal.isAscii
+ , null
+ , length
+ , isAscii
+ , splitAt
+ , splitAtEnd
+ , indexEndMaybe
+ , indexMaybe
+ , isPrefixOf
+ , stripPrefix
+ , isSuffixOf
+ , stripSuffix
+
+ , cons
+ , snoc
+ , uncons
+ , unsnoc
+
+ , findIndex
+ , find
+ , all
+
+ , span
+ , spanEnd
+
+ , intersperse
+ , intercalate
+ , reverse
+ , replicate
+
+ , filter
+ , dropAround
+
+ , foldl
+ , foldl'
+ , foldr
+ , foldl1
+ , foldl1'
+ , foldr1
-- * Conversions
+ -- ** 'Char'
+ , singleton
+
-- ** 'String'
, Data.Text.Short.Internal.fromString
, toString
@@ -38,39 +87,100 @@ module Data.Text.Short.Internal
, toBuilder
+ -- * misc
+ -- ** For Haddock
+
+ , BS.ByteString
+ , T.Text
+ , module Prelude
+
+ -- ** Internals
+ , isValidUtf8
) where
-import Control.DeepSeq (NFData)
--- import Control.Exception as E
-import qualified Data.ByteString as BS
-import Data.ByteString.Short (ShortByteString)
-import qualified Data.ByteString.Short as BSS
+import Control.DeepSeq (NFData)
+import Control.Monad.ST (stToIO)
+import Data.Binary
+import Data.Bits
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Builder as BB
+import Data.ByteString.Short (ShortByteString)
+import qualified Data.ByteString.Short as BSS
import qualified Data.ByteString.Short.Internal as BSSI
-import Data.Char
-import Data.Hashable (Hashable)
+import Data.Char (ord)
+import Data.Hashable (Hashable)
+import qualified Data.List as List
+import Data.Maybe (fromMaybe, isNothing)
import Data.Semigroup
-import qualified Data.String as S
-import qualified Data.Text as T
-import qualified Data.Text.Encoding as T
+import qualified Data.String as S
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
import Foreign.C
-import GHC.Exts (ByteArray#)
-import qualified GHC.Foreign as GHC
+import GHC.Base (assert, unsafeChr)
+import qualified GHC.CString as GHC
+import GHC.Exts (Addr#, ByteArray#, Int (I#),
+ Int#, MutableByteArray#,
+ Ptr (..), RealWorld, Word (W#))
+import qualified GHC.Exts
+import qualified GHC.Foreign as GHC
import GHC.IO.Encoding
+import GHC.ST
+import Prelude hiding (all, any, break, concat,
+ drop, dropWhile, filter, foldl,
+ foldl1, foldr, foldr1, head,
+ init, last, length, null,
+ replicate, reverse, span,
+ splitAt, tail, take, takeWhile)
import System.IO.Unsafe
-import Data.Binary
-import qualified Data.ByteString.Builder as BB
+import Text.Printf (PrintfArg, formatArg,
+ formatString)
+
+import qualified PrimOps
-- | A compact representation of Unicode strings.
--
+-- A 'ShortText' value is a sequence of Unicode scalar values, as defined in
+-- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35 §3.9, definition D76 of the Unicode 5.2 standard >;
+-- This means that a 'ShortText' is a list of (scalar) Unicode code-points (i.e. code-points in the range @[U+00 .. U+D7FF] ∪ [U+E000 .. U+10FFFF]@).
+--
-- This type relates to 'T.Text' as 'ShortByteString' relates to 'BS.ByteString' by providing a more compact type. Please consult the documentation of "Data.ByteString.Short" for more information.
--
-- Currently, a boxed unshared 'T.Text' has a memory footprint of 6 words (i.e. 48 bytes on 64-bit systems) plus 2 or 4 bytes per code-point (due to the internal UTF-16 representation). Each 'T.Text' value which can share its payload with another 'T.Text' requires only 4 words additionally. Unlike 'BS.ByteString', 'T.Text' use unpinned memory.
--
--- In comparison, the footprint of a boxed 'ShortText' is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1/2/3/4 bytes per code-point (due to the internal UTF-8 representation).
+-- In comparison, the footprint of a boxed 'ShortText' is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1, 2, 3, or 4 bytes per code-point (due to the internal UTF-8 representation).
-- It can be shown that for realistic data <http://utf8everywhere.org/#asian UTF-16 has a space overhead of 50% over UTF-8>.
--
+-- @since 0.1
newtype ShortText = ShortText ShortByteString
- deriving (Eq,Ord,Monoid,Semigroup,Hashable,NFData)
+ deriving (Monoid,Data.Semigroup.Semigroup,Hashable,NFData)
+
+instance Eq ShortText where
+ {-# INLINE (==) #-}
+ (==) x y
+ | lx /= ly = False
+ | lx == 0 = True
+ | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) 0# n# of
+ 0# -> True
+ _ -> False
+ where
+ !lx@(I# n#) = toLength x
+ !ly = toLength y
+
+instance Ord ShortText where
+ compare t1 t2
+ | n == 0 = compare n1 n2
+ | otherwise = case PrimOps.compareByteArrays# ba1# 0# ba2# 0# n# of
+ r# | I# r# < 0 -> LT
+ | I# r# > 0 -> GT
+ | n1 < n2 -> LT
+ | n1 > n2 -> GT
+ | otherwise -> EQ
+ where
+ ba1# = toByteArray# t1
+ ba2# = toByteArray# t2
+ !n1 = toLength t1
+ !n2 = toLength t2
+ !n@(I# n#) = n1 `min` n2
instance Show ShortText where
showsPrec p (ShortText b) = showsPrec p (decodeStringShort' utf8 b)
@@ -79,9 +189,9 @@ instance Show ShortText where
instance Read ShortText where
readsPrec p = map (\(x,s) -> (ShortText $ encodeStringShort utf8 x,s)) . readsPrec p
--- | Behaviour for @[U+D800 .. U+DFFF]@ matches the 'IsString' instance for 'T.Text'
-instance S.IsString ShortText where
- fromString = fromString
+-- | @since 0.1.2
+instance PrintfArg ShortText where
+ formatArg txt = formatString $ toString txt
-- | The 'Binary' encoding matches the one for 'T.Text'
#if MIN_VERSION_binary(0,8,1)
@@ -104,53 +214,360 @@ instance Binary ShortText where
#endif
-- | \(\mathcal{O}(1)\) Test whether a 'ShortText' is empty.
+--
+-- >>> null ""
+-- True
+--
+-- prop> null (singleton c) == False
+--
+-- prop> null t == (length t == 0)
+--
+-- @since 0.1
null :: ShortText -> Bool
null = BSS.null . toShortByteString
-- | \(\mathcal{O}(n)\) Count the number of Unicode code-points in a 'ShortText'.
+--
+-- >>> length "abcd€"
+-- 5
+--
+-- >>> length ""
+-- 0
+--
+-- prop> length t >= 0
+--
+-- @since 0.1
length :: ShortText -> Int
-length st = fromIntegral $ unsafePerformIO (c_text_short_length (toByteArray# st) (toCSize st))
+length st = fromIntegral $ unsafeDupablePerformIO (c_text_short_length (toByteArray# st) (toCSize st))
foreign import ccall unsafe "hs_text_short_length" c_text_short_length :: ByteArray# -> CSize -> IO CSize
-- | \(\mathcal{O}(n)\) Test whether 'ShortText' contains only ASCII code-points (i.e. only U+0000 through U+007F).
+--
+-- This is a more efficient version of @'all' 'Data.Char.isAscii'@.
+--
+-- >>> isAscii ""
+-- True
+--
+-- >>> isAscii "abc\NUL"
+-- True
+--
+-- >>> isAscii "abcd€"
+-- False
+--
+-- prop> isAscii t == all (< '\x80') t
+--
+-- @since 0.1
isAscii :: ShortText -> Bool
-isAscii st = (== sz) $ unsafePerformIO (c_text_short_is_ascii (toByteArray# st) sz)
+isAscii st = (/= 0) $ unsafeDupablePerformIO (c_text_short_is_ascii (toByteArray# st) sz)
where
sz = toCSize st
-foreign import ccall unsafe "hs_text_short_is_ascii" c_text_short_is_ascii :: ByteArray# -> CSize -> IO CSize
+foreign import ccall unsafe "hs_text_short_is_ascii" c_text_short_is_ascii :: ByteArray# -> CSize -> IO CInt
+
+-- | \(\mathcal{O}(n)\) Test whether /all/ code points in 'ShortText' satisfy a predicate.
+--
+-- >>> all (const False) ""
+-- True
+--
+-- >>> all (> 'c') "abcdabcd"
+-- False
+--
+-- >>> all (/= 'c') "abdabd"
+-- True
+--
+-- @since 0.1.2
+all :: (Char -> Bool) -> ShortText -> Bool
+all p st = isNothing (findOfs (not . p) st (B 0))
+
+-- | \(\mathcal{O}(n)\) Return the left-most codepoint in 'ShortText' that satisfies the given predicate.
+--
+-- >>> find (> 'b') "abcdabcd"
+-- Just 'c'
+--
+-- >>> find (> 'b') "ababab"
+-- Nothing
+--
+-- @since 0.1.2
+find :: (Char -> Bool) -> ShortText -> Maybe Char
+find p st = go 0
+ where
+ go !ofs
+ | ofs >= sz = Nothing
+ | otherwise = let (c,ofs') = decodeCharAtOfs st ofs
+ in c `seq` ofs' `seq`
+ if p c
+ then Just c
+ else go ofs'
+
+ !sz = toB st
+
+-- | \(\mathcal{O}(n)\) Return the index of the left-most codepoint in 'ShortText' that satisfies the given predicate.
+--
+-- >>> findIndex (> 'b') "abcdabcdef"
+-- Just 2
+--
+-- >>> findIndex (> 'b') "ababab"
+-- Nothing
+--
+-- prop> (indexMaybe t =<< findIndex p t) == find p t
+--
+-- @since 0.1.2
+findIndex :: (Char -> Bool) -> ShortText -> Maybe Int
+findIndex p st = go 0 0
+ where
+ go !ofs !i
+ | ofs >= sz = Nothing
+ | otherwise = let (c,ofs') = decodeCharAtOfs st ofs
+ in c `seq` ofs' `seq`
+ if p c
+ then Just i
+ else go ofs' (i+1)
+
+ !sz = toB st
+
+-- internal helper
+{-# INLINE findOfs #-}
+findOfs :: (Char -> Bool) -> ShortText -> B -> Maybe B
+findOfs p st = go
+ where
+ go :: B -> Maybe B
+ go !ofs | ofs >= sz = Nothing
+ go !ofs | p c = Just ofs
+ | otherwise = go ofs'
+ where
+ (c,ofs') = decodeCharAtOfs st ofs
+
+ !sz = toB st
+
+{-# INLINE findOfsRev #-}
+findOfsRev :: (Char -> Bool) -> ShortText -> B -> Maybe B
+findOfsRev p st = go
+ where
+ go (B 0) = Nothing
+ go !ofs
+ | p (cp2ch cp) = Just ofs
+ | otherwise = go (ofs-cpLen cp)
+ where
+ !cp = readCodePointRev st ofs
+
+-- | \(\mathcal{O}(n)\) Split 'ShortText' into longest prefix satisfying the given predicate and the remaining suffix.
+--
+-- >>> span (< 'c') "abcdabcd"
+-- ("ab","cdabcd")
+--
+-- prop> fst (span p t) <> snd (span p t) == t
+--
+-- @since 0.1.2
+span :: (Char -> Bool) -> ShortText -> (ShortText,ShortText)
+span p st
+ | Just ofs <- findOfs (not . p) st (B 0) = splitAtOfs ofs st
+ | otherwise = (st,mempty)
+
+-- | \(\mathcal{O}(n)\) Split 'ShortText' into longest suffix satisfying the given predicate and the preceding prefix.
+--
+-- >>> spanEnd (> 'c') "abcdabcd"
+-- ("abcdabc","d")
+--
+-- prop> fst (spanEnd p t) <> snd (spanEnd p t) == t
+--
+-- @since 0.1.2
+spanEnd :: (Char -> Bool) -> ShortText -> (ShortText,ShortText)
+spanEnd p st
+ | Just ofs <- findOfsRev (not . p) st (toB st) = splitAtOfs ofs st
+ | otherwise = (mempty,st)
----------------------------------------------------------------------------
toCSize :: ShortText -> CSize
toCSize = fromIntegral . BSS.length . toShortByteString
+toB :: ShortText -> B
+toB = fromIntegral . BSS.length . toShortByteString
+
+toLength :: ShortText -> Int
+toLength st = I# (toLength# st)
+
+toLength# :: ShortText -> Int#
+toLength# st = GHC.Exts.sizeofByteArray# (toByteArray# st)
+
toByteArray# :: ShortText -> ByteArray#
toByteArray# (ShortText (BSSI.SBS ba#)) = ba#
-- | \(\mathcal{O}(0)\) Converts to UTF-8 encoded 'ShortByteString'
--
-- This operation has effectively no overhead, as it's currently merely a @newtype@-cast.
+--
+-- @since 0.1
toShortByteString :: ShortText -> ShortByteString
toShortByteString (ShortText b) = b
-- | \(\mathcal{O}(n)\) Converts to UTF-8 encoded 'BS.ByteString'
+--
+-- @since 0.1
toByteString :: ShortText -> BS.ByteString
toByteString = BSS.fromShort . toShortByteString
-- | Construct a 'BB.Builder' that encodes 'ShortText' as UTF-8.
+--
+-- @since 0.1
toBuilder :: ShortText -> BB.Builder
toBuilder = BB.shortByteString . toShortByteString
-- | \(\mathcal{O}(n)\) Convert to 'String'
+--
+-- prop> (fromString . toString) t == t
+--
+-- __Note__: See documentation of 'fromString' for why @('toString' . 'fromString')@ is not an identity function.
+--
+-- @since 0.1
toString :: ShortText -> String
-toString = decodeStringShort' utf8 . toShortByteString
+-- NOTE: impl below beats
+-- toString = decodeStringShort' utf8 . toShortByteString
+-- except for smallish strings
+toString st = go 0
+ where
+ go !ofs
+ | ofs >= sz = []
+ | otherwise = let (c,ofs') = decodeCharAtOfs st ofs
+ in c `seq` ofs' `seq` (c : go ofs')
+
+ !sz = toB st
+
+----------------------------------------------------------------------------
+-- Folds
+
+-- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with
+-- the binary operator and an initial in forward direction (i.e. from
+-- left to right).
+--
+-- >>> foldl (\_ _ -> True) False ""
+-- False
+--
+-- >>> foldl (\s c -> c : s) ['.'] "abcd"
+-- "dcba."
+--
+-- @since 0.1.2
+foldl :: (a -> Char -> a) -> a -> ShortText -> a
+foldl f z st = go 0 z
+ where
+ go !ofs acc
+ | ofs >= sz = acc
+ | otherwise = let (c,ofs') = decodeCharAtOfs st ofs
+ in c `seq` ofs' `seq` go ofs' (f acc c)
+
+ !sz = toB st
+
+-- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with the binary operator.
+--
+-- >>> foldl1 max "abcdcba"
+-- 'd'
+--
+-- >>> foldl1 const "abcd"
+-- 'a'
+--
+-- >>> foldl1 (flip const) "abcd"
+-- 'd'
+--
+-- __Note__: Will throw an 'error' exception if index is out of bounds.
+--
+-- @since 0.1.2
+foldl1 :: (Char -> Char -> Char) -> ShortText -> Char
+foldl1 f st
+ | sz == 0 = error "foldl1: empty ShortText"
+ | otherwise = go c0sz c0
+ where
+ go !ofs acc
+ | ofs >= sz = acc
+ | otherwise = let (c,ofs') = decodeCharAtOfs st ofs
+ in c `seq` ofs' `seq` go ofs' (f acc c)
+ !sz = toB st
+ (c0,c0sz) = decodeCharAtOfs st (B 0)
+
+-- | \(\mathcal{O}(n)\) Strict version of 'foldl'.
+--
+-- @since 0.1.2
+foldl' :: (a -> Char -> a) -> a -> ShortText -> a
+foldl' f !z st = go 0 z
+ where
+ go !ofs !acc
+ | ofs >= sz = acc
+ | otherwise = let (c,ofs') = decodeCharAtOfs st ofs
+ in c `seq` ofs' `seq` go ofs' (f acc c)
+
+ !sz = toB st
+
+-- | \(\mathcal{O}(n)\) Strict version of 'foldl1'.
+--
+-- @since 0.1.2
+foldl1' :: (Char -> Char -> Char) -> ShortText -> Char
+foldl1' f st
+ | sz == 0 = error "foldl1: empty ShortText"
+ | otherwise = go c0sz c0
+ where
+ go !ofs !acc
+ | ofs >= sz = acc
+ | otherwise = let (c,ofs') = decodeCharAtOfs st ofs
+ in c `seq` ofs' `seq` go ofs' (f acc c)
+ !sz = toB st
+ (c0,c0sz) = decodeCharAtOfs st (B 0)
+
+-- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with
+-- the binary operator and an initial in reverse direction (i.e. from
+-- right to left).
+--
+-- >>> foldr (\_ _ -> True) False ""
+-- False
+--
+-- >>> foldr (:) ['.'] "abcd"
+-- "abcd."
+--
+-- @since 0.1.2
+foldr :: (Char -> a -> a) -> a -> ShortText -> a
+foldr f z st = go 0
+ where
+ go !ofs
+ | ofs >= sz = z
+ | otherwise = let (c,ofs') = decodeCharAtOfs st ofs
+ in c `seq` ofs' `seq` f c (go ofs')
+
+ !sz = toB st
+
+-- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with the binary operator.
+--
+-- >>> foldr1 max "abcdcba"
+-- 'd'
+--
+-- >>> foldr1 const "abcd"
+-- 'a'
+--
+-- >>> foldr1 (flip const) "abcd"
+-- 'd'
+--
+-- __Note__: Will throw an 'error' exception if index is out of bounds.
+--
+-- @since 0.1.2
+foldr1 :: (Char -> Char -> Char) -> ShortText -> Char
+foldr1 f st
+ | sz == 0 = error "foldr1: empty ShortText"
+ | otherwise = go 0
+ where
+ go !ofs = let (c,ofs') = decodeCharAtOfs st ofs
+ in c `seq` ofs' `seq`
+ (if ofs' >= sz then c else f c (go ofs'))
+
+ !sz = toB st
-- | \(\mathcal{O}(n)\) Convert to 'T.Text'
--
+-- prop> (fromText . toText) t == t
+--
+-- prop> (toText . fromText) t == t
+--
-- This is currently not \(\mathcal{O}(1)\) because currently 'T.Text' uses UTF-16 as its internal representation.
-- In the event that 'T.Text' will change its internal representation to UTF-8 this operation will become \(\mathcal{O}(1)\).
+--
+-- @since 0.1
toText :: ShortText -> T.Text
toText = T.decodeUtf8 . toByteString
@@ -158,19 +575,32 @@ toText = T.decodeUtf8 . toByteString
-- | \(\mathcal{O}(n)\) Construct/pack from 'String'
--
--- Note: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
+-- >>> fromString []
+-- ""
+--
+-- >>> fromString ['a','b','c']
+-- "abc"
+--
+-- >>> fromString ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
+-- "\55295\65533\65533\57344"
+--
+-- __Note__: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
+--
+-- @since 0.1
fromString :: String -> ShortText
-fromString = ShortText . encodeStringShort utf8 . map r
+fromString [] = mempty
+fromString [c] = singleton c
+fromString s = ShortText . encodeStringShort utf8 . map r $ s
where
- r c | 0xd800 <= x && x < 0xe000 = '\xFFFD'
- | otherwise = c
- where
- x = ord c
+ r c | isSurr (ord c) = '\xFFFD'
+ | otherwise = c
-- | \(\mathcal{O}(n)\) Construct 'ShortText' from 'T.Text'
--
-- This is currently not \(\mathcal{O}(1)\) because currently 'T.Text' uses UTF-16 as its internal representation.
-- In the event that 'T.Text' will change its internal representation to UTF-8 this operation will become \(\mathcal{O}(1)\).
+--
+-- @since 0.1
fromText :: T.Text -> ShortText
fromText = fromByteStringUnsafe . T.encodeUtf8
@@ -180,6 +610,25 @@ fromText = fromByteStringUnsafe . T.encodeUtf8
-- cannot be \(\mathcal{O}(1)\) because we need to validate the UTF-8 encoding.
--
-- Returns 'Nothing' in case of invalid UTF-8 encoding.
+--
+-- >>> fromShortByteString "\x00\x38\xF0\x90\x8C\x9A" -- U+00 U+38 U+1031A
+-- Just "\NUL8\66330"
+--
+-- >>> fromShortByteString "\xC0\x80" -- invalid denormalised U+00
+-- Nothing
+--
+-- >>> fromShortByteString "\xED\xA0\x80" -- U+D800 (non-scalar code-point)
+-- Nothing
+--
+-- >>> fromShortByteString "\xF4\x8f\xbf\xbf" -- U+10FFFF
+-- Just "\1114111"
+--
+-- >>> fromShortByteString "\xF4\x90\x80\x80" -- U+110000 (invalid)
+-- Nothing
+--
+-- prop> fromShortByteString (toShortByteString t) == Just t
+--
+-- @since 0.1
fromShortByteString :: ShortByteString -> Maybe ShortText
fromShortByteString sbs
| isValidUtf8 st = Just st
@@ -194,12 +643,18 @@ fromShortByteString sbs
-- __WARNING__: Unlike the safe 'fromShortByteString' conversion, this
-- conversion is /unsafe/ as it doesn't validate the well-formedness of the
-- UTF-8 encoding.
+--
+-- @since 0.1.1
fromShortByteStringUnsafe :: ShortByteString -> ShortText
fromShortByteStringUnsafe = ShortText
-- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'BS.ByteString'
--
+-- 'fromByteString' accepts (or rejects) the same input data as 'fromShortByteString'.
+--
-- Returns 'Nothing' in case of invalid UTF-8 encoding.
+--
+-- @since 0.1
fromByteString :: BS.ByteString -> Maybe ShortText
fromByteString = fromShortByteString . BSS.toShort
@@ -211,6 +666,8 @@ fromByteString = fromShortByteString . BSS.toShort
-- __WARNING__: Unlike the safe 'fromByteString' conversion, this
-- conversion is /unsafe/ as it doesn't validate the well-formedness of the
-- UTF-8 encoding.
+--
+-- @since 0.1.1
fromByteStringUnsafe :: BS.ByteString -> ShortText
fromByteStringUnsafe = ShortText . BSS.toShort
@@ -234,13 +691,827 @@ decodeStringShort' te = decodeString' te . BSS.fromShort
encodeStringShort :: TextEncoding -> String -> BSS.ShortByteString
encodeStringShort te = BSS.toShort . encodeString te
+-- isValidUtf8' :: ShortText -> Int
+-- isValidUtf8' st = fromIntegral $ unsafeDupablePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st))
isValidUtf8 :: ShortText -> Bool
-isValidUtf8 st = (==0) $ unsafePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st))
+isValidUtf8 st = (==0) $ unsafeDupablePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st))
+
+type CCodePoint = Word
foreign import ccall unsafe "hs_text_short_is_valid_utf8" c_text_short_is_valid_utf8 :: ByteArray# -> CSize -> IO CInt
-{- TODO:
-{-# RULES "ShortText strlit" forall s . fromString (unpackCString# s) = fromAddr# #-}
-...
+foreign import ccall unsafe "hs_text_short_index_cp" c_text_short_index :: ByteArray# -> CSize -> CSize -> IO CCodePoint
+
+-- | \(\mathcal{O}(n)\) Lookup /i/-th code-point in 'ShortText'.
+--
+-- Returns 'Nothing' if out of bounds.
+--
+-- prop> indexMaybe (singleton c) 0 == Just c
+--
+-- prop> indexMaybe t 0 == fmap fst (uncons t)
+--
+-- prop> indexMaybe mempty i == Nothing
+--
+-- @since 0.1.2
+indexMaybe :: ShortText -> Int -> Maybe Char
+indexMaybe st i
+ | i < 0 = Nothing
+ | otherwise = cp2chSafe cp
+ where
+ cp = CP $ unsafeDupablePerformIO (c_text_short_index (toByteArray# st) (toCSize st) (fromIntegral i))
+
+-- | \(\mathcal{O}(n)\) Lookup /i/-th code-point from the end of 'ShortText'.
+--
+-- Returns 'Nothing' if out of bounds.
+--
+-- prop> indexEndMaybe (singleton c) 0 == Just c
+--
+-- prop> indexEndMaybe t 0 == fmap snd (unsnoc t)
+--
+-- prop> indexEndMaybe mempty i == Nothing
+--
+-- @since 0.1.2
+indexEndMaybe :: ShortText -> Int -> Maybe Char
+indexEndMaybe st i
+ | i < 0 = Nothing
+ | otherwise = cp2chSafe cp
+ where
+ cp = CP $ unsafeDupablePerformIO (c_text_short_index_rev (toByteArray# st) (toCSize st) (fromIntegral i))
+
+foreign import ccall unsafe "hs_text_short_index_cp_rev" c_text_short_index_rev :: ByteArray# -> CSize -> CSize -> IO CCodePoint
+
+
+-- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves.
+--
+-- @'splitAtOfs n t@ returns a pair of 'ShortText' with the following properties:
+--
+-- prop> length (fst (splitAt n t)) == min (length t) (max 0 n)
+--
+-- prop> fst (splitAt n t) <> snd (splitAt n t) == t
+--
+-- >>> splitAt 2 "abcdef"
+-- ("ab","cdef")
+--
+-- >>> splitAt 10 "abcdef"
+-- ("abcdef","")
+--
+-- >>> splitAt (-1) "abcdef"
+-- ("","abcdef")
+--
+-- @since 0.1.2
+splitAt :: Int -> ShortText -> (ShortText,ShortText)
+splitAt i st
+ | i <= 0 = (mempty,st)
+ | otherwise = splitAtOfs ofs st
+ where
+ ofs = csizeToB $
+ unsafeDupablePerformIO (c_text_short_index_ofs (toByteArray# st) stsz (fromIntegral i))
+ stsz = toCSize st
+
+-- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves.
+--
+-- @'splitAtEnd' n t@ returns a pair of 'ShortText' with the following properties:
+--
+-- prop> length (snd (splitAtEnd n t)) == min (length t) (max 0 n)
+--
+-- prop> fst (splitAtEnd n t) <> snd (splitAtEnd n t) == t
+--
+-- prop> splitAtEnd n t == splitAt (length t - n) t
+--
+-- >>> splitAtEnd 2 "abcdef"
+-- ("abcd","ef")
+--
+-- >>> splitAtEnd 10 "abcdef"
+-- ("","abcdef")
+--
+-- >>> splitAtEnd (-1) "abcdef"
+-- ("abcdef","")
+--
+-- @since 0.1.2
+splitAtEnd :: Int -> ShortText -> (ShortText,ShortText)
+splitAtEnd i st
+ | i <= 0 = (st,mempty)
+ | ofs >= stsz = (mempty,st)
+ | otherwise = splitAtOfs ofs st
+ where
+ ofs = csizeToB $
+ unsafeDupablePerformIO (c_text_short_index_ofs_rev (toByteArray# st) (toCSize st) (fromIntegral (i-1)))
+ stsz = toB st
+
+{-# INLINE splitAtOfs #-}
+splitAtOfs :: B -> ShortText -> (ShortText,ShortText)
+splitAtOfs ofs st
+ | ofs == 0 = (mempty,st)
+ | ofs > stsz = (st,mempty)
+ | otherwise = (slice st 0 ofs, slice st ofs (stsz-ofs))
+ where
+ !stsz = toB st
+
+foreign import ccall unsafe "hs_text_short_index_ofs" c_text_short_index_ofs :: ByteArray# -> CSize -> CSize -> IO CSize
+
+foreign import ccall unsafe "hs_text_short_index_ofs_rev" c_text_short_index_ofs_rev :: ByteArray# -> CSize -> CSize -> IO CSize
+
+
+-- | \(\mathcal{O}(n)\) Inverse operation to 'cons'
+--
+-- Returns 'Nothing' for empty input 'ShortText'.
+--
+-- prop> uncons (cons c t) == Just (c,t)
+--
+-- >>> uncons ""
+-- Nothing
+--
+-- >>> uncons "fmap"
+-- Just ('f',"map")
+--
+-- @since 0.1.2
+uncons :: ShortText -> Maybe (Char,ShortText)
+uncons st
+ | null st = Nothing
+ | len2 == 0 = Just (c0, mempty)
+ | otherwise = Just (c0, slice st ofs len2)
+ where
+ c0 = cp2ch cp0
+ cp0 = readCodePoint st 0
+ ofs = cpLen cp0
+ len2 = toB st - ofs
+
+-- | \(\mathcal{O}(n)\) Inverse operation to 'snoc'
+--
+-- Returns 'Nothing' for empty input 'ShortText'.
+--
+-- prop> unsnoc (snoc t c) == Just (t,c)
+--
+-- >>> unsnoc ""
+-- Nothing
+--
+-- >>> unsnoc "fmap"
+-- Just ("fma",'p')
+--
+-- @since 0.1.2
+unsnoc :: ShortText -> Maybe (ShortText,Char)
+unsnoc st
+ | null st = Nothing
+ | len1 == 0 = Just (mempty, c0)
+ | otherwise = Just (slice st 0 len1, c0)
+ where
+ c0 = cp2ch cp0
+ cp0 = readCodePointRev st stsz
+ stsz = toB st
+ len1 = stsz - cpLen cp0
+
+-- | \(\mathcal{O}(n)\) Tests whether the first 'ShortText' is a prefix of the second 'ShortText'
+--
+-- >>> isPrefixOf "ab" "abcdef"
+-- True
+--
+-- >>> isPrefixOf "ac" "abcdef"
+-- False
+--
+-- prop> isPrefixOf "" t == True
+--
+-- prop> isPrefixOf t t == True
+--
+-- @since 0.1.2
+isPrefixOf :: ShortText -> ShortText -> Bool
+isPrefixOf x y
+ | lx > ly = False
+ | lx == 0 = True
+ | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) 0# n# of
+ 0# -> True
+ _ -> False
+ where
+ !lx@(I# n#) = toLength x
+ !ly = toLength y
+
+-- | \(\mathcal{O}(n)\) Strip prefix from second 'ShortText' argument.
+--
+-- Returns 'Nothing' if first argument is not a prefix of the second argument.
+--
+-- >>> stripPrefix "text-" "text-short"
+-- Just "short"
+--
+-- >>> stripPrefix "test-" "text-short"
+-- Nothing
+--
+-- @since 0.1.2
+stripPrefix :: ShortText -> ShortText -> Maybe ShortText
+stripPrefix pfx t
+ | isPrefixOf pfx t = Just $! snd (splitAtOfs (toB pfx) t)
+ | otherwise = Nothing
+
+-- | \(\mathcal{O}(n)\) Tests whether the first 'ShortText' is a suffix of the second 'ShortText'
+--
+-- >>> isSuffixOf "ef" "abcdef"
+-- True
+--
+-- >>> isPrefixOf "df" "abcdef"
+-- False
+--
+-- prop> isSuffixOf "" t == True
+--
+-- prop> isSuffixOf t t == True
+--
+-- @since 0.1.2
+isSuffixOf :: ShortText -> ShortText -> Bool
+isSuffixOf x y
+ | lx > ly = False
+ | lx == 0 = True
+ | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) ofs2# n# of
+ 0# -> True
+ _ -> False
+ where
+ !(I# ofs2#) = ly - lx
+ !lx@(I# n#) = toLength x
+ !ly = toLength y
+
+-- | \(\mathcal{O}(n)\) Strip suffix from second 'ShortText' argument.
+--
+-- Returns 'Nothing' if first argument is not a suffix of the second argument.
+--
+-- >>> stripSuffix "-short" "text-short"
+-- Just "text"
+--
+-- >>> stripSuffix "-utf8" "text-short"
+-- Nothing
+--
+-- @since 0.1.2
+stripSuffix :: ShortText -> ShortText -> Maybe ShortText
+stripSuffix sfx t
+ | isSuffixOf sfx t = Just $! fst (splitAtOfs pfxLen t)
+ | otherwise = Nothing
+ where
+ pfxLen = toB t - toB sfx
+
+----------------------------------------------------------------------------
+
+-- | \(\mathcal{O}(n)\) Insert character between characters of 'ShortText'.
+--
+-- >>> intersperse '*' "_"
+-- "_"
+--
+-- >>> intersperse '*' "MASH"
+-- "M*A*S*H"
+--
+-- @since 0.1.2
+intersperse :: Char -> ShortText -> ShortText
+intersperse c st
+ | null st = mempty
+ | sn == 1 = st
+ | otherwise = create newsz $ \mba -> do
+ let !cp0 = readCodePoint st 0
+ !cp0sz = cpLen cp0
+ writeCodePointN cp0sz mba 0 cp0
+ go mba (sn - 1) cp0sz cp0sz
+ where
+ newsz = ssz + ((sn-1) `mulB` csz)
+ ssz = toB st
+ sn = length st
+ csz = cpLen cp
+ cp = ch2cp c
+
+ go :: MBA s -> Int -> B -> B -> ST s ()
+ go _ 0 !_ !_ = return ()
+ go mba n ofs ofs2 = do
+ let !cp1 = readCodePoint st ofs2
+ !cp1sz = cpLen cp1
+ writeCodePointN csz mba ofs cp
+ writeCodePointN cp1sz mba (ofs+csz) cp1
+ go mba (n-1) (ofs+csz+cp1sz) (ofs2+cp1sz)
+
+-- | \(\mathcal{O}(n)\) Insert 'ShortText' inbetween list of 'ShortText's.
+--
+-- >>> intercalate ", " []
+-- ""
+--
+-- >>> intercalate ", " ["foo"]
+-- "foo"
+--
+-- >>> intercalate ", " ["foo","bar","doo"]
+-- "foo, bar, doo"
+--
+-- prop> intercalate "" ts == concat ts
+--
+-- @since 0.1.2
+intercalate :: ShortText -> [ShortText] -> ShortText
+intercalate _ [] = mempty
+intercalate _ [t] = t
+intercalate sep ts
+ | null sep = mconcat ts
+ | otherwise = mconcat (List.intersperse sep ts)
+
+-- | \(\mathcal{O}(n*m)\) Replicate a 'ShortText'.
+--
+-- A repetition count smaller than 1 results in an empty string result.
+--
+-- >>> replicate 3 "jobs!"
+-- "jobs!jobs!jobs!"
+--
+-- >>> replicate 10000 ""
+-- ""
+--
+-- >>> replicate 0 "nothing"
+-- ""
+--
+-- prop> length (replicate n t) == max 0 n * length t
+--
+-- @since 0.1.2
+replicate :: Int -> ShortText -> ShortText
+replicate n0 t
+ | n0 < 1 = mempty
+ | null t = mempty
+ | otherwise = create (n0 `mulB` sz) (go 0)
+ where
+ go :: Int -> MBA s -> ST s ()
+ go j mba
+ | j == n0 = return ()
+ | otherwise = do
+ copyByteArray t 0 mba (j `mulB` sz) sz
+ go (j+1) mba
+
+ sz = toB t
+
+-- | \(\mathcal{O}(n)\) Reverse characters in 'ShortText'.
+--
+-- >>> reverse "star live desserts"
+-- "stressed evil rats"
+--
+-- prop> reverse (singleton c) == singleton c
+--
+-- prop> reverse (reverse t) == t
+--
+-- @since 0.1.2
+reverse :: ShortText -> ShortText
+reverse st
+ | null st = mempty
+ | sn == 1 = st
+ | otherwise = create sz $ go sn 0
+ where
+ sz = toB st
+ sn = length st
+
+ go :: Int -> B -> MBA s -> ST s ()
+ go 0 !_ _ = return ()
+ go i ofs mba = do
+ let !cp = readCodePoint st ofs
+ !cpsz = cpLen cp
+ !ofs' = ofs+cpsz
+ writeCodePointN cpsz mba (sz - ofs') cp
+ go (i-1) ofs' mba
+
+
+-- | \(\mathcal{O}(n)\) Remove characters from 'ShortText' which don't satisfy given predicate.
+--
+-- >>> filter (`notElem` ['a','e','i','o','u']) "You don't need vowels to convey information!"
+-- "Y dn't nd vwls t cnvy nfrmtn!"
+--
+-- prop> filter (const False) t == ""
+--
+-- prop> filter (const True) t == t
+--
+-- prop> length (filter p t) <= length t
+--
+-- prop> filter p t == pack [ c | c <- unpack t, p c ]
+--
+-- @since 0.1.2
+filter :: (Char -> Bool) -> ShortText -> ShortText
+filter p t
+ = case (mofs1,mofs2) of
+ (Nothing, _) -> t -- no non-accepted characters found
+ (Just 0, Nothing) -> mempty -- no accepted characters found
+ (Just ofs1, Nothing) -> slice t 0 ofs1 -- only prefix accepted
+ (Just ofs1, Just ofs2) -> createShrink (t0sz-(ofs2-ofs1)) $ \mba -> do
+ -- copy accepted prefix
+ copyByteArray t 0 mba 0 ofs1
+ -- [ofs1 .. ofs2) are a non-accepted region
+ -- filter rest after ofs2
+ t1sz <- go mba ofs2 ofs1
+ return t1sz
+ where
+ mofs1 = findOfs (not . p) t (B 0) -- first non-accepted Char
+ mofs2 = findOfs p t (fromMaybe (B 0) mofs1) -- first accepted Char
+
+ t0sz = toB t
+
+ go :: MBA s -> B -> B -> ST s B
+ go mba !t0ofs !t1ofs
+ | t0ofs >= t0sz = return t1ofs
+ | otherwise = let !cp = readCodePoint t t0ofs
+ !cpsz = cpLen cp
+ in if p (cp2ch cp)
+ then writeCodePointN cpsz mba t1ofs cp >>
+ go mba (t0ofs+cpsz) (t1ofs+cpsz)
+ else go mba (t0ofs+cpsz) t1ofs -- skip code-point
+
+-- | \(\mathcal{O}(n)\) Strip characters from the beginning end and of 'ShortText' which satisfy given predicate.
+--
+-- >>> dropAround (== ' ') " white space "
+-- "white space"
+--
+-- >>> dropAround (> 'a') "bcdefghi"
+-- ""
+--
+-- @since 0.1.2
+dropAround :: (Char -> Bool) -> ShortText -> ShortText
+dropAround p t0 = case (mofs1,mofs2) of
+ (Nothing,_) -> mempty
+ (Just ofs1,Just ofs2)
+ | ofs1 == 0, ofs2 == t0sz -> t0
+ | ofs1 < ofs2 -> create (ofs2-ofs1) $ \mba -> do
+ copyByteArray t0 ofs1 mba (B 0) (ofs2-ofs1)
+ (_,_) -> error "dropAround: the impossible happened"
+ where
+ mofs1 = findOfs (not . p) t0 (B 0)
+ mofs2 = findOfsRev (not . p) t0 t0sz
+ t0sz = toB t0
+
+----------------------------------------------------------------------------
+
+-- | Construct a new 'ShortText' from an existing one by slicing
+--
+-- NB: The 'CSize' arguments refer to byte-offsets
+slice :: ShortText -> B -> B -> ShortText
+slice st ofs len
+ | ofs < 0 = error "invalid offset"
+ | len < 0 = error "invalid length"
+ | len' == 0 = mempty
+ | otherwise = create len' $ \mba -> copyByteArray st ofs' mba 0 len'
+ where
+ len0 = toB st
+ len' = max 0 (min len (len0-ofs))
+ ofs' = max 0 ofs
+
+----------------------------------------------------------------------------
+-- low-level MutableByteArray# helpers
+
+-- | Byte offset (or size) in bytes
+--
+-- This currently wraps an 'Int' because this is what GHC's primops
+-- currently use for byte offsets/sizes.
+newtype B = B { unB :: Int }
+ deriving (Ord,Eq,Num)
+
+{- TODO: introduce operators for 'B' to avoid 'Num' -}
+
+mulB :: Int -> B -> B
+mulB n (B b) = B (n*b)
+
+csizeFromB :: B -> CSize
+csizeFromB = fromIntegral . unB
+
+csizeToB :: CSize -> B
+csizeToB = B . fromIntegral
+
+data MBA s = MBA# { unMBA# :: MutableByteArray# s }
+
+{-# INLINE create #-}
+create :: B -> (forall s. MBA s -> ST s ()) -> ShortText
+create n go = runST $ do
+ mba <- newByteArray n
+ go mba
+ unsafeFreeze mba
+
+{-# INLINE createShrink #-}
+createShrink :: B -> (forall s. MBA s -> ST s B) -> ShortText
+createShrink n go = runST $ do
+ mba <- newByteArray n
+ n' <- go mba
+ if n' < n
+ then unsafeFreezeShrink mba n'
+ else unsafeFreeze mba
+
+{-# INLINE unsafeFreeze #-}
+unsafeFreeze :: MBA s -> ST s ShortText
+unsafeFreeze (MBA# mba#)
+ = ST $ \s -> case GHC.Exts.unsafeFreezeByteArray# mba# s of
+ (# s', ba# #) -> (# s', ShortText (BSSI.SBS ba#) #)
+
+{-# INLINE copyByteArray #-}
+copyByteArray :: ShortText -> B -> MBA s -> B -> B -> ST s ()
+copyByteArray (ShortText (BSSI.SBS src#)) (B (I# src_off#)) (MBA# dst#) (B (I# dst_off#)) (B (I# len#))
+ = ST $ \s -> case GHC.Exts.copyByteArray# src# src_off# dst# dst_off# len# s of
+ s' -> (# s', () #)
+
+{-# INLINE newByteArray #-}
+newByteArray :: B -> ST s (MBA s)
+newByteArray (B (I# n#))
+ = ST $ \s -> case GHC.Exts.newByteArray# n# s of
+ (# s', mba# #) -> (# s', MBA# mba# #)
+
+{-# INLINE writeWord8Array #-}
+writeWord8Array :: MBA s -> B -> Word -> ST s ()
+writeWord8Array (MBA# mba#) (B (I# i#)) (W# w#)
+ = ST $ \s -> case GHC.Exts.writeWord8Array# mba# i# w# s of
+ s' -> (# s', () #)
+{- not needed yet
+{-# INLINE indexWord8Array #-}
+indexWord8Array :: ShortText -> B -> Word
+indexWord8Array (ShortText (BSSI.SBS ba#)) (B (I# i#)) = W# (GHC.Exts.indexWord8Array# ba# i#)
+-}
+
+{-# INLINE copyAddrToByteArray #-}
+copyAddrToByteArray :: Ptr a -> MBA RealWorld -> B -> B -> ST RealWorld ()
+copyAddrToByteArray (Ptr src#) (MBA# dst#) (B (I# dst_off#)) (B (I# len#))
+ = ST $ \s -> case GHC.Exts.copyAddrToByteArray# src# dst# dst_off# len# s of
+ s' -> (# s', () #)
+
+----------------------------------------------------------------------------
+-- unsafeFreezeShrink
+
+#if __GLASGOW_HASKELL__ >= 710
+-- for GHC versions which have the 'shrinkMutableByteArray#' primop
+{-# INLINE unsafeFreezeShrink #-}
+unsafeFreezeShrink :: MBA s -> B -> ST s ShortText
+unsafeFreezeShrink mba n = do
+ shrink mba n
+ unsafeFreeze mba
+
+{-# INLINE shrink #-}
+shrink :: MBA s -> B -> ST s ()
+shrink (MBA# mba#) (B (I# i#))
+ = ST $ \s -> case GHC.Exts.shrinkMutableByteArray# mba# i# s of
+ s' -> (# s', () #)
+#else
+-- legacy code for GHC versions which lack `shrinkMutableByteArray#` primop
+{-# INLINE unsafeFreezeShrink #-}
+unsafeFreezeShrink :: MBA s -> B -> ST s ShortText
+unsafeFreezeShrink mba0 n = do
+ mba' <- newByteArray n
+ copyByteArray2 mba0 0 mba' 0 n
+ unsafeFreeze mba'
+
+{-# INLINE copyByteArray2 #-}
+copyByteArray2 :: MBA s -> B -> MBA s -> B -> B -> ST s ()
+copyByteArray2 (MBA# src#) (B (I# src_off#)) (MBA# dst#) (B (I# dst_off#)) (B( I# len#))
+ = ST $ \s -> case GHC.Exts.copyMutableByteArray# src# src_off# dst# dst_off# len# s of
+ s' -> (# s', () #)
+#endif
+
+----------------------------------------------------------------------------
+-- Helpers for encoding code points into UTF-8 code units
+--
+-- 7 bits| < 0x80 | 0xxxxxxx
+-- 11 bits| < 0x800 | 110yyyyx 10xxxxxx
+-- 16 bits| < 0x10000 | 1110yyyy 10yxxxxx 10xxxxxx
+-- 21 bits| | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
+
+-- | Unicode Code-point
+--
+-- Keeping it as a 'Word' is more convenient for bit-ops and FFI
+newtype CP = CP Word
+
+{-# INLINE ch2cp #-}
+ch2cp :: Char -> CP
+ch2cp (ord -> ci)
+ | isSurr ci = CP 0xFFFD
+ | otherwise = CP (fromIntegral ci)
+
+{-# INLINE isSurr #-}
+isSurr :: (Num i, Bits i) => i -> Bool
+isSurr ci = ci .&. 0xfff800 == 0xd800
+
+{-# INLINE cp2ch #-}
+cp2ch :: CP -> Char
+cp2ch (CP w) = (w < 0x110000) `assert` unsafeChr (fromIntegral w)
+
+-- used/needed by index-lookup functions to encode out of bounds
+cp2chSafe :: CP -> Maybe Char
+cp2chSafe cp
+ | cpNull cp = Nothing
+ | otherwise = Just $! cp2ch cp
+ where
+ cpNull :: CP -> Bool
+ cpNull (CP w) = w >= 0x110000
+
+{-# INLINE cpLen #-}
+cpLen :: CP -> B
+cpLen (CP cp)
+ | cp < 0x80 = B 1
+ | cp < 0x800 = B 2
+ | cp < 0x10000 = B 3
+ | otherwise = B 4
+
+-- convenience wrapper; unsafe like readCodePoint
+{-# INLINE decodeCharAtOfs #-}
+decodeCharAtOfs :: ShortText -> B -> (Char,B)
+decodeCharAtOfs st ofs = (c,ofs')
+ where
+ c = cp2ch cp
+ ofs' = ofs + cpLen cp
+ cp = readCodePoint st ofs
+{- pure version of decodeCharAtOfs, but unfortunately significantly slower
+
+decodeCharAtOfs st ofs
+ | b0 < 0x80 = (cp2ch $ CP b0,ofs + B 1)
+ | otherwise = case b0 `unsafeShiftR` 4 of
+ 0xf -> (cp2ch $ CP go4, ofs + B 4)
+ 0xe -> (cp2ch $ CP go3, ofs + B 3)
+ _ -> (cp2ch $ CP go2, ofs + B 2)
+ where
+ b0 = buf 0
+ buf j = indexWord8Array st (ofs+j)
+
+ go2 = ((b0 .&. 0x1f) `unsafeShiftL` 6)
+ .|. (buf 1 .&. 0x3f)
+
+ go3 = ((b0 .&. 0x0f) `unsafeShiftL` (6+6))
+ .|. ((buf 1 .&. 0x3f) `unsafeShiftL` 6)
+ .|. (buf 2 .&. 0x3f)
+
+ go4 = ((b0 .&. 0x07) `unsafeShiftL` (6+6+6))
+ .|. ((buf 1 .&. 0x3f) `unsafeShiftL` (6+6))
+ .|. ((buf 2 .&. 0x3f) `unsafeShiftL` 6)
+ .|. (buf 3 .&. 0x3f)
+-}
+
+
+-- | \(\mathcal{O}(1)\) Construct 'ShortText' from single codepoint.
+--
+-- prop> singleton c == pack [c]
+--
+-- prop> length (singleton c) == 1
+--
+-- >>> singleton 'A'
+-- "A"
+--
+-- >>> map singleton ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
+-- ["\55295","\65533","\65533","\57344"]
+--
+-- __Note__: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
+--
+-- @since 0.1.2
+singleton :: Char -> ShortText
+singleton = singleton' . ch2cp
+
+singleton' :: CP -> ShortText
+singleton' cp@(CP cpw)
+ | cpw < 0x80 = create 1 $ \mba -> writeCodePoint1 mba 0 cp
+ | cpw < 0x800 = create 2 $ \mba -> writeCodePoint2 mba 0 cp
+ | cpw < 0x10000 = create 3 $ \mba -> writeCodePoint3 mba 0 cp
+ | otherwise = create 4 $ \mba -> writeCodePoint4 mba 0 cp
+
+-- | \(\mathcal{O}(n)\) Prepend a character to a 'ShortText'.
+--
+-- prop> cons c t == singleton c <> t
+--
+-- @since 0.1.2
+cons :: Char -> ShortText -> ShortText
+cons (ch2cp -> cp@(CP cpw)) sfx
+ | n == 0 = singleton' cp
+ | cpw < 0x80 = create (n+1) $ \mba -> writeCodePoint1 mba 0 cp >> copySfx 1 mba
+ | cpw < 0x800 = create (n+2) $ \mba -> writeCodePoint2 mba 0 cp >> copySfx 2 mba
+ | cpw < 0x10000 = create (n+3) $ \mba -> writeCodePoint3 mba 0 cp >> copySfx 3 mba
+ | otherwise = create (n+4) $ \mba -> writeCodePoint4 mba 0 cp >> copySfx 4 mba
+ where
+ !n = toB sfx
+
+ copySfx :: B -> MBA s -> ST s ()
+ copySfx ofs mba = copyByteArray sfx 0 mba ofs n
+
+-- | \(\mathcal{O}(n)\) Append a character to the ond of a 'ShortText'.
+--
+-- prop> snoc t c == t <> singleton c
+--
+-- @since 0.1.2
+snoc :: ShortText -> Char -> ShortText
+snoc pfx (ch2cp -> cp@(CP cpw))
+ | n == 0 = singleton' cp
+ | cpw < 0x80 = create (n+1) $ \mba -> copyPfx mba >> writeCodePoint1 mba n cp
+ | cpw < 0x800 = create (n+2) $ \mba -> copyPfx mba >> writeCodePoint2 mba n cp
+ | cpw < 0x10000 = create (n+3) $ \mba -> copyPfx mba >> writeCodePoint3 mba n cp
+ | otherwise = create (n+4) $ \mba -> copyPfx mba >> writeCodePoint4 mba n cp
+ where
+ !n = toB pfx
+
+ copyPfx :: MBA s -> ST s ()
+ copyPfx mba = copyByteArray pfx 0 mba 0 n
+
+{-
+writeCodePoint :: MBA s -> Int -> Word -> ST s ()
+writeCodePoint mba ofs cp
+ | cp < 0x80 = writeCodePoint1 mba ofs cp
+ | cp < 0x800 = writeCodePoint2 mba ofs cp
+ | cp < 0x10000 = writeCodePoint3 mba ofs cp
+ | otherwise = writeCodePoint4 mba ofs cp
-}
+
+writeCodePointN :: B -> MBA s -> B -> CP -> ST s ()
+writeCodePointN 1 = writeCodePoint1
+writeCodePointN 2 = writeCodePoint2
+writeCodePointN 3 = writeCodePoint3
+writeCodePointN 4 = writeCodePoint4
+writeCodePointN _ = undefined
+
+writeCodePoint1 :: MBA s -> B -> CP -> ST s ()
+writeCodePoint1 mba ofs (CP cp) =
+ writeWord8Array mba ofs cp
+
+writeCodePoint2 :: MBA s -> B -> CP -> ST s ()
+writeCodePoint2 mba ofs (CP cp) = do
+ writeWord8Array mba ofs (0xc0 .|. (cp `unsafeShiftR` 6))
+ writeWord8Array mba (ofs+1) (0x80 .|. (cp .&. 0x3f))
+
+writeCodePoint3 :: MBA s -> B -> CP -> ST s ()
+writeCodePoint3 mba ofs (CP cp) = do
+ writeWord8Array mba ofs (0xe0 .|. (cp `unsafeShiftR` 12))
+ writeWord8Array mba (ofs+1) (0x80 .|. ((cp `unsafeShiftR` 6) .&. 0x3f))
+ writeWord8Array mba (ofs+2) (0x80 .|. (cp .&. 0x3f))
+
+writeCodePoint4 :: MBA s -> B -> CP -> ST s ()
+writeCodePoint4 mba ofs (CP cp) = do
+ writeWord8Array mba ofs (0xf0 .|. (cp `unsafeShiftR` 18))
+ writeWord8Array mba (ofs+1) (0x80 .|. ((cp `unsafeShiftR` 12) .&. 0x3f))
+ writeWord8Array mba (ofs+2) (0x80 .|. ((cp `unsafeShiftR` 6) .&. 0x3f))
+ writeWord8Array mba (ofs+3) (0x80 .|. (cp .&. 0x3f))
+
+-- beware: UNSAFE!
+readCodePoint :: ShortText -> B -> CP
+readCodePoint st (csizeFromB -> ofs)
+ = CP $ unsafeDupablePerformIO (c_text_short_ofs_cp (toByteArray# st) ofs)
+
+foreign import ccall unsafe "hs_text_short_ofs_cp" c_text_short_ofs_cp :: ByteArray# -> CSize -> IO CCodePoint
+
+readCodePointRev :: ShortText -> B -> CP
+readCodePointRev st (csizeFromB -> ofs)
+ = CP $ unsafeDupablePerformIO (c_text_short_ofs_cp_rev (toByteArray# st) ofs)
+
+foreign import ccall unsafe "hs_text_short_ofs_cp_rev" c_text_short_ofs_cp_rev :: ByteArray# -> CSize -> IO CCodePoint
+
+----------------------------------------------------------------------------
+-- string & list literals
+
+-- | __Note__: Surrogate pairs (@[U+D800 .. U+DFFF]@) character literals are replaced by U+FFFD.
+--
+-- @since 0.1.2
+instance GHC.Exts.IsList ShortText where
+ type (Item ShortText) = Char
+ fromList = fromString
+ toList = toString
+
+-- | __Note__: Surrogate pairs (@[U+D800 .. U+DFFF]@) in string literals are replaced by U+FFFD.
+--
+-- This matches the behaviour of 'IsString' instance for 'T.Text'.
+instance S.IsString ShortText where
+ fromString = fromStringLit
+
+-- i.e., don't inline before Phase 0
+{-# INLINE [0] fromStringLit #-}
+fromStringLit :: String -> ShortText
+fromStringLit = fromString
+
+{-# RULES "ShortText empty literal" fromStringLit "" = mempty #-}
+
+-- TODO: this doesn't seem to fire
+{-# RULES "ShortText singleton literal" forall c . fromStringLit [c] = singleton c #-}
+
+{-# RULES "ShortText literal ASCII" forall s . fromStringLit (GHC.unpackCString# s) = fromLitAsciiAddr# s #-}
+
+{-# RULES "ShortText literal UTF-8" forall s . fromStringLit (GHC.unpackCStringUtf8# s) = fromLitMUtf8Addr# s #-}
+
+{-# NOINLINE fromLitAsciiAddr# #-}
+fromLitAsciiAddr# :: Addr# -> ShortText
+fromLitAsciiAddr# (Ptr -> ptr) = unsafeDupablePerformIO $ do
+ sz <- csizeToB `fmap` c_strlen ptr
+
+ case sz `compare` 0 of
+ EQ -> return mempty -- should not happen if rules fire correctly
+ GT -> stToIO $ do
+ mba <- newByteArray sz
+ copyAddrToByteArray ptr mba 0 sz
+ unsafeFreeze mba
+ LT -> return (error "fromLitAsciiAddr#")
+ -- NOTE: should never happen unless strlen(3) overflows (NB: CSize
+ -- is unsigned; the overflow would occur when converting to
+ -- 'B')
+
+foreign import ccall unsafe "strlen" c_strlen :: CString -> IO CSize
+
+-- GHC uses an encoding resembling Modified UTF-8 for non-ASCII string-literals
+{-# NOINLINE fromLitMUtf8Addr# #-}
+fromLitMUtf8Addr# :: Addr# -> ShortText
+fromLitMUtf8Addr# (Ptr -> ptr) = unsafeDupablePerformIO $ do
+ sz <- B `fmap` c_text_short_mutf8_strlen ptr
+
+ case sz `compare` 0 of
+ EQ -> return mempty -- should not happen if rules fire correctly
+ GT -> stToIO $ do
+ mba <- newByteArray sz
+ copyAddrToByteArray ptr mba 0 sz
+ unsafeFreeze mba
+ LT -> do
+ mba <- stToIO (newByteArray (abs sz))
+ c_text_short_mutf8_trans ptr (unMBA# mba)
+ stToIO (unsafeFreeze mba)
+
+foreign import ccall unsafe "hs_text_short_mutf8_strlen" c_text_short_mutf8_strlen :: CString -> IO Int
+
+foreign import ccall unsafe "hs_text_short_mutf8_trans" c_text_short_mutf8_trans :: CString -> MutableByteArray# RealWorld -> IO ()
+
+-- $setup
+-- >>> :set -XOverloadedStrings
+-- >>> import Data.Text.Short (pack, unpack, concat)
+-- >>> import Text.Show.Functions ()
+-- >>> import qualified Test.QuickCheck.Arbitrary as QC
+-- >>> import Test.QuickCheck.Instances ()
+-- >>> instance QC.Arbitrary ShortText where { arbitrary = fmap fromString QC.arbitrary }
diff --git a/src/Data/Text/Short/Partial.hs b/src/Data/Text/Short/Partial.hs
new file mode 100644
index 0000000..8703516
--- /dev/null
+++ b/src/Data/Text/Short/Partial.hs
@@ -0,0 +1,100 @@
+{-# LANGUAGE Trustworthy #-}
+
+-- |
+-- Module : Data.Text.Short.Partial
+-- Copyright : © Herbert Valerio Riedel 2018
+-- License : BSD3
+--
+-- Maintainer : hvr@gnu.org
+-- Stability : stable
+--
+-- Partial functions vocabulary
+--
+-- This module provides common partial functions for operating on 'ShortText'.
+--
+-- The use of these functions is discouraged as they tend to be error-prone.
+--
+-- @since 0.1.2
+module Data.Text.Short.Partial
+ ( head
+ , tail
+ , init
+ , last
+ , index
+
+ , foldl1
+ , foldl1'
+ , foldr1
+ ) where
+
+import Data.Text.Short
+import Data.Text.Short.Internal
+import Prelude ()
+
+-- | \(\mathcal{O}(1)\) Returns first character of a non-empty 'ShortText'
+--
+-- >>> head "abcd"
+-- 'a'
+--
+-- __Note__: Will throw an 'error' exception for empty 'ShortText's.
+-- Consider using the total functions 'uncons' or 'indexMaybe'
+-- instead.
+--
+-- @since 0.1.2
+head :: ShortText -> Char
+head = maybe (error "head: empty ShortText") fst . uncons
+
+-- | \(\mathcal{O}(n)\) Drop first character from non-empty 'ShortText'.
+--
+-- >>> tail "abcd"
+-- "bcd"
+--
+-- __Note__: Will throw an 'error' exception for empty 'ShortText's.
+-- Consider using the total functions 'uncons' or 'drop' instead.
+--
+-- @since 0.1.2
+tail :: ShortText -> ShortText
+tail = maybe (error "tail: empty ShortText") snd . uncons
+
+-- | \(\mathcal{O}(n)\) Drop last character from non-empty 'ShortText'.
+--
+-- >>> tail "abcd"
+-- "bcd"
+--
+-- __Note__: Will throw an 'error' exception for empty 'ShortText's.
+-- Consider using the total functions 'unsnoc' or 'dropEnd' instead.
+--
+-- @since 0.1.2
+init :: ShortText -> ShortText
+init = maybe (error "init: empty ShortText") fst . unsnoc
+
+-- | \(\mathcal{O}(1)\) Return last character from non-empty 'ShortText'.
+--
+-- >>> last "abcd"
+-- 'd'
+--
+-- __Note__: Will throw an 'error' exception for empty 'ShortText's.
+-- Consider using the total functions 'unsnoc' or 'indexEndMaybe'
+-- instead.
+--
+-- @since 0.1.2
+last :: ShortText -> Char
+last = maybe (error "last: empty ShortText") snd . unsnoc
+
+-- | \(\mathcal{O}(n)\) Retrieve \(i\)-th character (code-point)
+--
+-- >>> index "abcd" 1
+-- 'b'
+--
+-- __Note__: Will throw an 'error' exception if index is out of
+-- bounds. Consider using the total functions 'indexMaybe' or
+-- 'indexEndMaybe' instead.
+--
+-- @since 0.1.2
+index :: ShortText -> Int -> Char
+index st i = case indexMaybe st i of
+ Nothing -> error "index: not within ShortText"
+ Just c -> c
+
+-- $setup
+-- >>> :set -XOverloadedStrings
diff --git a/text-short.cabal b/text-short.cabal
index 94a4cce..0b704c5 100644
--- a/text-short.cabal
+++ b/text-short.cabal
@@ -1,7 +1,7 @@
-cabal-version: 1.12
+cabal-version: 1.18
name: text-short
-version: 0.1.1
+version: 0.1.2
synopsis: Memory-efficient representation of Unicode text strings
license: BSD3
license-file: LICENSE
@@ -12,32 +12,46 @@ category: Data
build-type: Simple
description: This package provides the 'ShortText' type which is suitable for keeping many short strings in memory. This is similiar to how 'ShortByteString' relates to 'ByteString'.
.
- The main difference between 'Text' and 'ShortText' is that 'ShortText' uses UTF-8 instead of UTF-16 internally and also doesn't support slicing (thereby saving 2 words). Consequently, the memory footprint of a (boxed) 'ShortText' value is 4 words (2 words when unboxed) plus the length of the UTF-8 encoded payload.
+ The main difference between 'Text' and 'ShortText' is that 'ShortText' uses UTF-8 instead of UTF-16 internally and also doesn't support zero-copy slicing (thereby saving 2 words). Consequently, the memory footprint of a (boxed) 'ShortText' value is 4 words (2 words when unboxed) plus the length of the UTF-8 encoded payload.
+tested-with: GHC==8.4.1, GHC==8.2.2, GHC==8.0.2, GHC==7.10.3, GHC==7.8.4
extra-source-files: ChangeLog.md
Source-Repository head
Type: git
Location: https://github.com/hvr/text-short.git
-library
- default-language: Haskell2010
+flag asserts
+ description: Enable runtime-checks via @assert@
+ default: False
+ manual: True
+library
exposed-modules: Data.Text.Short
+ Data.Text.Short.Partial
Data.Text.Short.Unsafe
other-modules: Data.Text.Short.Internal
- build-depends: base >= 4.7 && < 4.11
+ build-depends: base >= 4.7 && < 4.12
, bytestring >= 0.10.4 && < 0.11
, hashable >= 1.2.6 && < 1.3
, deepseq >= 1.3 && < 1.5
, text >= 1.0 && < 1.3
, binary >= 0.7.1 && < 0.9
+ , ghc-prim >= 0.3.1 && < 0.6
if !impl(ghc >= 8.0)
build-depends: semigroups >= 0.18.2 && < 0.19
+ -- GHC version specific PrimOps
+ if impl(ghc >= 8.4)
+ hs-source-dirs: src-ghc804
+ else
+ c-sources: cbits/memcmp.c
+ hs-source-dirs: src-ghc708
+ other-modules: PrimOps
+
hs-source-dirs: src
default-language: Haskell2010
@@ -49,10 +63,16 @@ library
, Unsafe
c-sources: cbits/cbits.c
+
+ if flag(asserts)
+ ghc-options: -fno-ignore-asserts
+ else
+ cc-options: -DNDEBUG=1
+
ghc-options: -Wall
+ cc-options: -O3 -Wall
test-suite tests
- default-language: Haskell2010
type: exitcode-stdio-1.0
hs-source-dirs: src-test
main-is: Tests.hs
@@ -62,7 +82,9 @@ test-suite tests
, text
, text-short
-- deps which don't inherit constraints from library stanza:
- , tasty >= 0.11.2 && < 0.12
- , tasty-quickcheck >= 0.8.4 && < 0.9
- , tasty-hunit >= 0.9.2 && < 0.10
+ , tasty >= 1.0.0 && < 1.1
+ , tasty-quickcheck >= 0.10 && < 0.11
+ , tasty-hunit >= 0.10.0 && < 0.11
, quickcheck-instances >= 0.3.14 && < 0.4
+
+ default-language: Haskell2010