summary refs log tree commit diff
diff options
context:
space:
mode:
author ChrisPenner <> 2019-07-11 20:00:00 (GMT)
committer hdiff <hdiff@hdiff.luite.com> 2019-07-11 20:00:00 (GMT)
commit fc197622ba11f3fadb4099e074b31e0e6e496172 (patch)
tree 0ecec090e43c9c6f414223bc4637c0202e94d293
parent 638f7bf55a1e76aa5fb86aa872595d7f65b41571 (diff)
version 0.3.1.0 HEAD 0.3.1.0 master
-rw-r--r-- ChangeLog.md 4
-rw-r--r-- README.md 36
-rw-r--r-- lens-regex-pcre.cabal 7
-rw-r--r-- src/Control/Lens/Regex.hs 77
-rw-r--r-- test/Spec.hs 14
5 files changed, 84 insertions, 54 deletions
diff --git a/ChangeLog.md b/ChangeLog.md
index 526414e..4f765ff 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,5 +1,9 @@
# Changelog for lens-regex-pcre
+# 0.3.1.0
+`Match` -> `Match text`: the match type is now parameterized over its text type
+Added `regexBS` to run regexes on ByteStrings directly
+
# 0.3.0.0
Unify `iregex` into `regex` as a single indexed traversal
diff --git a/README.md b/README.md
index 003b40d..d3275ea 100644
--- a/README.md
+++ b/README.md
@@ -45,13 +45,13 @@ True
"roses on raindrops and kittens on whiskers"
-- Get the third match
-λ> txt ^? iregex [rx|\w+|] . index 2 . match
+λ> txt ^? regex [rx|\w+|] . index 2 . match
Just "roses"
-- Match integers, 'Read' them into ints, then sort them in-place
-- dumping them back into the source text afterwards.
λ> "Monday: 29, Tuesday: 99, Wednesday: 3"
- & partsOf (iregex [rx|\d+|] . match . unpacked . _Show @Int) %~ sort
+ & partsOf (regex [rx|\d+|] . match . unpacked . _Show @Int) %~ sort
"Monday: 3, Tuesday: 29, Wednesday: 99"
```
@@ -92,6 +92,14 @@ describe "regex" $ do
"abc" ^.. regex [rx|\w+?|] . match
`shouldBe`["a", "b", "c"]
+ it "should allow folding with index" $ do
+ ("one two three" ^.. (regex [rx|\w+|] <. match) . withIndex)
+ `shouldBe` [(0, "one"), (1, "two"), (2, "three")]
+
+ it "should allow getting with index" $ do
+ ("one two three" ^.. regex [rx|\w+|] . index 1 . match)
+ `shouldBe` ["two"]
+
describe "setting" $ do
it "should allow setting" $ do
("one two three" & regex [rx|two|] . match .~ "new")
@@ -109,23 +117,13 @@ describe "regex" $ do
("one two three" & regex [rx|two|] . match %~ T.toUpper)
`shouldBe` "one TWO three"
-describe "iregex" $ do
- describe "match" $ do
- it "should allow folding with index" $ do
- ("one two three" ^.. (iregex [rx|\w+|] <. match) . withIndex)
- `shouldBe` [(0, "one"), (1, "two"), (2, "three")]
-
- it "should allow getting with index" $ do
- ("one two three" ^.. iregex [rx|\w+|] . index 1 . match)
- `shouldBe` ["two"]
-
- it "should allow setting with index" $ do
- ("one two three" & iregex [rx|\w+|] <. match .@~ T.pack . show)
- `shouldBe` "0 1 2"
+ it "should allow setting with index" $ do
+ ("one two three" & regex [rx|\w+|] <. match .@~ T.pack . show)
+ `shouldBe` "0 1 2"
- it "should allow mutating with index" $ do
- ("one two three" & iregex [rx|\w+|] <. match %@~ \i s -> (T.pack $ show i) <> ": " <> s)
- `shouldBe` "0: one 1: two 2: three"
+ it "should allow mutating with index" $ do
+ ("one two three" & regex [rx|\w+|] <. match %@~ \i s -> (T.pack $ show i) <> ": " <> s)
+ `shouldBe` "0: one 1: two 2: three"
describe "groups" $ do
describe "getting" $ do
@@ -172,7 +170,7 @@ describe "groups" $ do
`shouldBe` "0: one 1: two 0: three 1: four"
it "should compose indices with matches" $ do
- ("one two three four" ^.. (iregex [rx|(\w+) (\w+)|] <.> groups . traversed) . withIndex)
+ ("one two three four" ^.. (regex [rx|(\w+) (\w+)|] <.> groups . traversed) . withIndex)
`shouldBe` [((0, 0), "one"), ((0, 1), "two"), ((1, 0), "three"), ((1, 1), "four")]
describe "matchAndGroups" $ do
diff --git a/lens-regex-pcre.cabal b/lens-regex-pcre.cabal
index 5af73f3..e006e3c 100644
--- a/lens-regex-pcre.cabal
+++ b/lens-regex-pcre.cabal
@@ -4,11 +4,13 @@ cabal-version: 1.12
--
-- see: https://github.com/sol/hpack
--
--- hash: a626a534d16f0bea96536315430d586eea2fc04b175395c0047b63268c10fbca
+-- hash: c1baf100f3d2d1c413b88695a96d578b0a8d18aace0929759a5428dcfa30b1f1
name: lens-regex-pcre
-version: 0.3.0.0
+version: 0.3.1.0
+synopsis: A lensy interface to regular expressions
description: Please see the README on GitHub at <https://github.com/ChrisPenner/lens-regex-pcre#readme>
+category: Regex
homepage: https://github.com/ChrisPenner/lens-regex-pcre#readme
bug-reports: https://github.com/ChrisPenner/lens-regex-pcre/issues
author: Chris Penner
@@ -40,6 +42,7 @@ library
, pcre-light
, template-haskell
, text
+ , bytestring
default-language: Haskell2010
test-suite lens-regex-pcre-test
diff --git a/src/Control/Lens/Regex.hs b/src/Control/Lens/Regex.hs
index 73194bb..67a7c29 100644
--- a/src/Control/Lens/Regex.hs
+++ b/src/Control/Lens/Regex.hs
@@ -3,9 +3,6 @@ Module : Control.Lens.Regex
Description : PCRE regex combinators for interop with lens
Copyright : (c) Chris Penner, 2019
License : BSD3
-
-Note that all traversals in this library are not techically lawful; they break the 'multi-set'
-idempotence law; in reality this isn't usually a problem; but consider yourself warned. Test your code.
-}
{-# LANGUAGE FlexibleContexts #-}
@@ -19,6 +16,7 @@ module Control.Lens.Regex
(
-- * Combinators
regex
+ , regexBS
, match
, groups
, matchAndGroups
@@ -34,7 +32,10 @@ module Control.Lens.Regex
, Regex
) where
-import Data.Text as T hiding (index)
+import qualified Data.Text as T hiding (index)
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.Encoding.Error as T
+import qualified Data.ByteString as BS
import Text.Regex.PCRE.Heavy
import Text.Regex.PCRE.Light (compile)
import Control.Lens hiding (re, matching)
@@ -47,12 +48,15 @@ import Language.Haskell.TH.Quote
-- >>> :set -XOverloadedStrings
-- >>> :set -XTypeApplications
-- >>> import Data.Text.Lens (unpacked)
+-- >>> import Data.Text (Text)
-- >>> import Data.List (sort)
--- | Match represents a whole regex match; you can drill into it using 'match' or 'groups' or
--- 'matchAndGroups'
+-- | Match represents a whole regex match; you can drill into it using 'match' or 'groups' or 'matchAndGroups'
+--
+-- @text@ is either 'T.Text' or 'BS.ByteString' depending on whether you use 'regex' or 'regexBS'
+--
-- Consider this to be internal; don't depend on its representation.
-type Match = [Either Text Text]
+type Match text = [Either text text]
type MatchRange = (Int, Int)
type GroupRanges = [(Int, Int)]
@@ -85,7 +89,7 @@ type GroupRanges = [(Int, Int)]
--
-- >>> "raindrops on roses and whiskers on kittens" ^.. regex [rx|(\w+) on (\w+)|] . groups . traversed
-- ["raindrops","roses","whiskers","kittens"]
-groups :: Traversal' Match [T.Text]
+groups :: Traversal' (Match text) [text]
groups = partsOf (traversed . _Right)
-- | Traverse each match
@@ -104,7 +108,7 @@ groups = partsOf (traversed . _Right)
--
-- >>> "one _two_ three _four_" & regex [rx|_\w+_|] . match %~ T.toUpper
-- "one _TWO_ three _FOUR_"
-match :: Traversal' Match T.Text
+match :: Monoid text => Traversal' (Match text) text
match f grps = (:[]) . Right <$> f (grps ^. traversed . chosen)
-- | The base combinator for doing regex searches.
@@ -155,29 +159,41 @@ match f grps = (:[]) . Right <$> f (grps ^. traversed . chosen)
-- 'Regex' into 'regex';
-- Alternatively can make your own version of the QuasiQuoter with any options you want embedded
-- by using 'mkRegexQQ'.
-regex :: Regex -> IndexedTraversal' Int T.Text Match
-regex pattern = indexing (regexT pattern)
+regex :: Regex -> IndexedTraversal' Int T.Text (Match T.Text)
+regex pattern = utf8 . regexBS pattern . matchBsText
+ where
+ utf8 :: Iso' T.Text BS.ByteString
+ utf8 = iso T.encodeUtf8 (T.decodeUtf8With T.lenientDecode)
+ matchBsText :: Iso' [Either BS.ByteString BS.ByteString] (Match T.Text)
+ matchBsText = iso (traversed . chosen %~ T.decodeUtf8With T.lenientDecode) (traversed . chosen %~ T.encodeUtf8)
+
+-- | A version of 'regex' which operates directly on 'BS.ByteString's.
+-- This is more efficient than using 'regex' as it avoids converting back and forth
+-- between 'BS.ByteString' and 'T.Text'.
+regexBS :: Regex -> IndexedTraversal' Int BS.ByteString (Match BS.ByteString)
+regexBS pattern = indexing (regexT pattern)
--- | Base regex traversal. Used only to define 'regex'
-regexT :: Regex -> Traversal' T.Text Match
+-- | Base regex traversal. Used only to define the 'regex' and 'regexBS' traversals
+regexT :: Regex -> Traversal' BS.ByteString [Either BS.ByteString BS.ByteString]
regexT pattern f txt = collapseMatch <$> apply (fmap splitAgain <$> splitter txt matches)
where
matches :: [(MatchRange, GroupRanges)]
matches = scanRanges pattern txt
- collapseMatch :: [Either Text [Either Text Text]] -> Text
+ collapseMatch :: [Either BS.ByteString [Either BS.ByteString BS.ByteString]] -> BS.ByteString
collapseMatch xs = xs ^. folded . beside id (traversed . chosen)
-- apply :: [Either Text [Either Text Text]] -> _ [Either Text [Either Text Text]]
apply xs = xs & traversed . _Right %%~ f
-matchText :: Match -> T.Text
+-- | Get the full match text from a match
+matchText :: Monoid text => Match text -> text
matchText m = m ^. traversed . chosen
-- | Collect both the match text AND all the matching groups
--
-- >>> "raindrops on roses and whiskers on kittens" ^.. regex [rx|(\w+) on (\w+)|] . matchAndGroups
-- [("raindrops on roses",["raindrops","roses"]),("whiskers on kittens",["whiskers","kittens"])]
-matchAndGroups :: Getter Match (T.Text, [T.Text])
+matchAndGroups :: Monoid text => Getter (Match text) (text, [text])
matchAndGroups = to $ \m -> (matchText m, m ^. groups)
-- | 'QuasiQuoter' for compiling regexes.
@@ -194,8 +210,7 @@ rx = re
--
-- >>> "raindrops on roses and whiskers on kittens" ^.. regex [rx|(\w+) on (\w+)|] . (withGroups <. match) . withIndex
-- [(["raindrops","roses"],"raindrops on roses"),(["whiskers","kittens"],"whiskers on kittens")]
---
-withMatch :: IndexedTraversal' T.Text Match Match
+withMatch :: Monoid text => IndexedTraversal' text (Match text) (Match text)
withMatch p mtch = indexed p (matchText mtch) mtch
-- | This allows you to "stash" the match text into an index for use later in the traversal.
@@ -206,37 +221,37 @@ withMatch p mtch = indexed p (matchText mtch) mtch
--
-- >>> "raindrops on roses and whiskers on kittens" ^.. regex [rx|(\w+) on (\w+)|] . (withMatch <. groups) . withIndex
-- [("raindrops on roses",["raindrops","roses"]),("whiskers on kittens",["whiskers","kittens"])]
-withGroups :: IndexedTraversal' [T.Text] Match Match
+withGroups :: IndexedTraversal' [text] (Match text) (Match text)
withGroups p mtch = indexed p (mtch ^. groups) mtch
-- split up text into matches paired with groups; Left is unmatched text
-splitter :: Text -> [(MatchRange, GroupRanges)] -> [Either T.Text (T.Text, GroupRanges)]
+splitter :: BS.ByteString -> [(MatchRange, GroupRanges)] -> [Either BS.ByteString (BS.ByteString, GroupRanges)]
splitter t [] = wrapIfNotEmpty t
splitter t (((start, end), grps) : rest) =
splitOnce t ((start, end), grps)
- <> splitter (T.drop end t) (subtractFromAll end rest)
+ <> splitter (BS.drop end t) (subtractFromAll end rest)
-splitOnce :: Text -> (MatchRange, GroupRanges) -> [Either T.Text (T.Text, GroupRanges)]
+splitOnce :: BS.ByteString -> (MatchRange, GroupRanges) -> [Either BS.ByteString (BS.ByteString, GroupRanges)]
splitOnce t ((start, end), grps) = do
- let (before, mid) = T.splitAt start t
- let focused = T.take (end - start) mid
+ let (before, mid) = BS.splitAt start t
+ let focused = BS.take (end - start) mid
wrapIfNotEmpty before <> [Right (focused, subtractFromAll start grps)]
-splitAgain :: (T.Text, GroupRanges) -> Match
-splitAgain (t, []) | T.null t = []
+splitAgain :: (BS.ByteString, GroupRanges) -> [Either BS.ByteString BS.ByteString]
+splitAgain (t, []) | BS.null t = []
| otherwise = [Left t]
splitAgain (t, (start, end) : rest) = do
- let (before, mid) = T.splitAt start t
- let focused = T.take (end - start) mid
+ let (before, mid) = BS.splitAt start t
+ let focused = BS.take (end - start) mid
wrapIfNotEmpty before
<> [Right focused]
- <> splitAgain ((T.drop end t), (subtractFromAll end rest))
+ <> splitAgain ((BS.drop end t), (subtractFromAll end rest))
--- helpers
subtractFromAll :: (Data b) => Int -> b -> b
subtractFromAll n = biplate -~ n
-wrapIfNotEmpty :: Text -> [Either Text a]
+wrapIfNotEmpty :: BS.ByteString -> [Either BS.ByteString a]
wrapIfNotEmpty txt
- | T.null txt = []
+ | BS.null txt = []
| otherwise = [Left txt]
diff --git a/test/Spec.hs b/test/Spec.hs
index 2916aeb..081e6bb 100644
--- a/test/Spec.hs
+++ b/test/Spec.hs
@@ -38,6 +38,16 @@ main = hspec $ do
"abc" ^.. regex [rx|\w+?|] . match
`shouldBe`["a", "b", "c"]
+ it "should handle unicode in source text properly" $ do
+ "🍕 test 🍔" ^. regex [rx|test|] . match
+ `shouldBe` "test"
+ ("🍕 test 🍔" & regex [rx|🍔|] . match .~ "👻🙈")
+ `shouldBe` "🍕 test 👻🙈"
+
+ it "should handle unicode in patterns properly" $ do
+ "*🍕 test 🍔*" ^. regex [rx|🍕 \w+ 🍔|] . match
+ `shouldBe` "🍕 test 🍔"
+
describe "setting" $ do
it "should allow setting" $ do
("one two three" & regex [rx|two|] . match .~ "new")
@@ -87,8 +97,8 @@ main = hspec $ do
`shouldBe` ["two", "four"]
xit "should handle weird group alternation" $ do
- "1:2 a=b" ^.. regex [rx|(\d):(\d)|(\w)=(\w)|] . match
- `shouldBe` ["not entirely sure what I expect this to be yet"]
+ "1:2 a=b" ^.. regex [rx|(\d):(\d)|(\w)=(\w)|] . groups
+ `shouldBe` [[ "not entirely sure what I expect this to be yet" ]]
describe "setting" $ do
it "should allow setting groups as a list" $ do