summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorkoral <>2017-09-13 06:54:00 (GMT)
committerhdiff <hdiff@hdiff.luite.com>2017-09-13 06:54:00 (GMT)
commitabc236361eb8824f2ada23be514a0934d5de2ea8 (patch)
tree15bf160a499c13267ca8909310459967f421fab6
parent7a204cb7638de72b3d6e292e659e047b7d189b5e (diff)
version 1.7.0HEAD1.7.0master
-rw-r--r--ChangeLog.md6
-rw-r--r--Text/XML/Stream/Parse.hs105
-rw-r--r--test/main.hs45
-rw-r--r--xml-conduit.cabal2
4 files changed, 109 insertions, 49 deletions
diff --git a/ChangeLog.md b/ChangeLog.md
index 27bac2a..0ddb1e3 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,3 +1,9 @@
+## 1.7.0
+
+* `psDecodeEntities` is no longer passed numeric character references (e.g., `&#x20;`, `&#65;`) and the predefined XML entities (`&amp;`, `&lt;`, etc). They are now handled by the parser. Both of these construct classes only have one spec-compliant interpretation and this behaviour must always be present, so it makes no sense to force user code to re-implement the parsing logic.
+* In prior versions of xml-conduit, hexadecimal character references with a leading `0x` or `0X` like `&0x20;` were accepted. This was not in compliance with the XML specification and it has been corrected.
+* xml-conduit now rejects some (but not all) invalid-according-to-spec entities during parsing: specifically, entities with a leading `#` that are not character references are no longer allowed and will be parse errors.
+
## 1.6.0
* Dropped the dependency on `data-default` for `data-default-class`, reducing the transitive dependency load. For most users, this will not be a breaking change, but it does mean that importing `Text.XML.Conduit` will no longer bring various instances for `Default` into scope. This will break code that relies on those instances and does not otherwise see them. To fix this, import `Data.Default` from `data-default` or one of the more specific instance-providing packages directly (e.g., `data-default-dlist` for the `DList` instance).
diff --git a/Text/XML/Stream/Parse.hs b/Text/XML/Stream/Parse.hs
index 658ce13..99fbbbc 100644
--- a/Text/XML/Stream/Parse.hs
+++ b/Text/XML/Stream/Parse.hs
@@ -156,14 +156,12 @@ module Text.XML.Stream.Parse
, PositionRange
, EventPos
) where
-import Blaze.ByteString.Builder (fromWord32be, toByteString)
-import Control.Applicative ((<$>))
import Control.Applicative (Alternative (empty, (<|>)),
Applicative (..), (<$>))
import qualified Control.Applicative as A
import Control.Arrow ((***))
import Control.Exception (Exception (..), SomeException)
-import Control.Monad (ap, guard, liftM, void)
+import Control.Monad (ap, liftM, void)
import Control.Monad.Fix (fix)
import Control.Monad.Trans.Class (lift)
import Control.Monad.Trans.Maybe (MaybeT (..))
@@ -189,12 +187,9 @@ import Data.Maybe (fromMaybe, isNothing)
import Data.String (IsString (..))
import Data.Text (Text, pack)
import qualified Data.Text as T
-import Data.Text.Encoding (decodeUtf32BEWith,
- decodeUtf8With)
-import Data.Text.Encoding.Error (ignore, lenientDecode)
-import Data.Text.Read (Reader, decimal, hexadecimal)
+import Data.Text.Encoding (decodeUtf8With)
+import Data.Text.Encoding.Error (lenientDecode)
import Data.Typeable (Typeable)
-import Data.Word (Word32)
import Data.XML.Types (Content (..), Event (..),
ExternalID (..),
Instruction (..), Name (..))
@@ -567,18 +562,42 @@ parseIdent =
valid '"' = False
valid '\'' = False
valid '/' = False
+ valid ';' = False
+ valid '#' = False
valid c = not $ isXMLSpace c
parseContent :: DecodeEntities
-> Bool -- break on double quote
-> Bool -- break on single quote
-> Parser Content
-parseContent de breakDouble breakSingle = parseEntity <|> parseTextContent where
- parseEntity = do
+parseContent de breakDouble breakSingle = parseReference <|> parseTextContent where
+ parseReference = do
char' '&'
- t <- takeWhile1 (/= ';')
+ t <- parseEntityRef <|> parseHexCharRef <|> parseDecCharRef
char' ';'
- return $ de t
+ return t
+ parseEntityRef = do
+ TName ma b <- parseName
+ let name = maybe "" (`T.append` ":") ma `T.append` b
+ return $ case name of
+ "lt" -> ContentText "<"
+ "gt" -> ContentText ">"
+ "amp" -> ContentText "&"
+ "quot" -> ContentText "\""
+ "apos" -> ContentText "'"
+ _ -> de name
+ parseHexCharRef = do
+ void $ string "#x"
+ n <- AT.hexadecimal
+ case toValidXmlChar n of
+ Nothing -> fail "Invalid character from hexadecimal character reference."
+ Just c -> return $ ContentText $ T.singleton c
+ parseDecCharRef = do
+ void $ string "#"
+ n <- AT.decimal
+ case toValidXmlChar n of
+ Nothing -> fail "Invalid character from decimal character reference."
+ Just c -> return $ ContentText $ T.singleton c
parseTextContent = ContentText <$> takeWhile1 valid
valid '"' = not breakDouble
valid '\'' = not breakSingle
@@ -586,6 +605,24 @@ parseContent de breakDouble breakSingle = parseEntity <|> parseTextContent where
valid '<' = False -- lt
valid _ = True
+-- | Is this codepoint a valid XML character? See
+-- <https://www.w3.org/TR/xml/#charsets>. This is proudly XML 1.0 only.
+toValidXmlChar :: Int -> Maybe Char
+toValidXmlChar n
+ | any checkRange ranges = Just (toEnum n)
+ | otherwise = Nothing
+ where
+ --Inclusive lower bound, inclusive upper bound.
+ ranges :: [(Int, Int)]
+ ranges =
+ [ (0x9, 0xA)
+ , (0xD, 0xD)
+ , (0x20, 0xD7FF)
+ , (0xE000, 0xFFFD)
+ , (0x10000, 0x10FFFF)
+ ]
+ checkRange (lb, ub) = lb <= n && n <= ub
+
skipSpace :: Parser ()
skipSpace = skipWhile isXMLSpace
@@ -1101,38 +1138,21 @@ takeAllTreesContent = takeAnyTreeContent
type DecodeEntities = Text -> Content
--- | Default implementation of 'DecodeEntities': handles numeric entities and
--- the five standard character entities (lt, gt, amp, quot, apos).
+-- | Default implementation of 'DecodeEntities', which leaves the
+-- entity as-is. Numeric character references and the five standard
+-- entities (lt, gt, amp, quot, pos) are handled internally by the
+-- parser.
decodeXmlEntities :: DecodeEntities
-decodeXmlEntities "lt" = ContentText "<"
-decodeXmlEntities "gt" = ContentText ">"
-decodeXmlEntities "amp" = ContentText "&"
-decodeXmlEntities "quot" = ContentText "\""
-decodeXmlEntities "apos" = ContentText "'"
-decodeXmlEntities t = let backup = ContentEntity t in
- case T.uncons t of
- Just ('#', t') ->
- case T.uncons t' of
- Just ('x', t'')
- | T.length t'' > 6 -> backup
- | otherwise -> decodeChar hexadecimal backup t''
- _
- | T.length t' > 7 -> backup
- | otherwise -> decodeChar decimal backup t'
- _ -> backup
-
--- | HTML4-compliant entity decoder. Handles numerics, the five standard
--- character entities, and the additional 248 entities defined by HTML 4 and
--- XHTML 1.
+decodeXmlEntities = ContentEntity
+
+-- | HTML4-compliant entity decoder. Handles the additional 248
+-- entities defined by HTML 4 and XHTML 1.
--
-- Note that HTML 5 introduces a drastically larger number of entities, and
-- this code does not recognize most of them.
decodeHtmlEntities :: DecodeEntities
decodeHtmlEntities t =
- case decodeXmlEntities t of
- x@ContentText{} -> x
- backup@ContentEntity{} ->
- maybe backup ContentText $ Map.lookup t htmlEntities
+ maybe (ContentEntity t) ContentText $ Map.lookup t htmlEntities
htmlEntities :: Map.Map T.Text T.Text
htmlEntities = Map.fromList
@@ -1386,12 +1406,3 @@ htmlEntities = Map.fromList
, ("hearts", "\9829")
, ("diams", "\9830")
]
-
-decodeChar :: Reader Word32 -> Content -> Text -> Content
-decodeChar readNum backup = either (const backup) toContent . readNum
- where
- toContent (num, extra) | T.null extra =
- case decodeUtf32BEWith ignore . toByteString $ fromWord32be num of
- c | T.length c == 1 -> ContentText c
- | otherwise -> backup
- toContent _ = backup
diff --git a/test/main.hs b/test/main.hs
index 5c69472..f3521df 100644
--- a/test/main.hs
+++ b/test/main.hs
@@ -52,6 +52,7 @@ main = hspec $ do
it "displays comments" testRenderComments
it "conduit parser" testConduitParser
it "can omit the XML declaration" omitXMLDeclaration
+ context "correctly parses hexadecimal entities" hexEntityParsing
describe "XML Cursors" $ do
it "has correct parent" cursorParent
it "has correct ancestor" cursorAncestor
@@ -179,7 +180,7 @@ testChoose = do
testChooseElemOrTextIsWhiteSpace
it "can choose between text and elements, when the text is whitespace"
testChooseTextOrElemIsWhiteSpace
- it "can choose betwen text and elements, when the whitespace is both literal and encoded"
+ it "can choose between text and elements, when the whitespace is both literal and encoded"
testChooseElemOrTextIsChunkedText
it "can choose between text and elements, when the text is chunked the other way"
testChooseElemOrTextIsChunkedText2
@@ -537,6 +538,48 @@ omitXMLDeclaration = Res.renderLBS settings input @?= spec
[]
spec = "<foo>bar</foo>"
+hexEntityParsing :: Spec
+hexEntityParsing = do
+ it "rejects leading 0x" $
+ go "<foo>&#x0xff;</foo>" @?= Nothing
+ it "rejects leading 0X" $
+ go "<foo>&#x0Xff;</foo>" @?= Nothing
+ it "accepts lowercase hex digits" $
+ go "<foo>&#xff;</foo>" @?= Just spec
+ it "accepts uppercase hex digits" $
+ go "<foo>&#xFF;</foo>" @?= Just spec
+ --Note: this must be rejected, because, according to the XML spec, a
+ --legal EntityRef's entity matches Name, which can't start with a
+ --hash.
+ it "rejects trailing junk" $
+ go "<foo>&#xffhello;</foo>" @?= Nothing
+ --Some of these next tests are XML 1.0 specific (i.e., they would
+ --differ for XML 1.1), but approximately no-one uses XML 1.1.
+ it "rejects illegal character #x0" $
+ go "<foo>&#x0;</foo>" @?= Nothing
+ it "rejects illegal character #xFFFE" $
+ go "<foo>&#xFFFE;</foo>" @?= Nothing
+ it "rejects illegal character #xFFFF" $
+ go "<foo>&#xFFFF;</foo>" @?= Nothing
+ it "rejects illegal character #xD900" $
+ go "<foo>&#xD900;</foo>" @?= Nothing
+ it "rejects illegal character #xC" $
+ go "<foo>&#xC;</foo>" @?= Nothing
+ it "rejects illegal character #x1F" $
+ go "<foo>&#x1F;</foo>" @?= Nothing
+ it "accepts astral plane character" $
+ go "<foo>&#x1006ff;</foo>" @?= Just astralSpec
+ where
+ spec = Document (Prologue [] Nothing [])
+ (Element "foo" [] [NodeContent (ContentText "\xff")])
+ []
+
+ astralSpec = Document (Prologue [] Nothing [])
+ (Element "foo" [] [NodeContent (ContentText "\x1006ff")])
+ []
+
+ go = either (const Nothing) Just . D.parseLBS def
+
name :: [Cu.Cursor] -> [Text]
name [] = []
name (c:cs) = ($ name cs) $ case Cu.node c of
diff --git a/xml-conduit.cabal b/xml-conduit.cabal
index 747833f..c9c2181 100644
--- a/xml-conduit.cabal
+++ b/xml-conduit.cabal
@@ -1,5 +1,5 @@
name: xml-conduit
-version: 1.6.0
+version: 1.7.0
license: MIT
license-file: LICENSE
author: Michael Snoyman <michael@snoyman.com>, Aristid Breitkreuz <aristidb@googlemail.com>