summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDougBeardsley <>2017-05-19 10:39:00 (GMT)
committerhdiff <hdiff@hdiff.luite.com>2017-05-19 10:39:00 (GMT)
commit6e3492e3c9e9adda3af2b532df313ad6114828c7 (patch)
treec3a404bbce51e305388bd4dbd19eca8e0bb4b554
parentce8128f0cfae1abd2b2a0a05f5ef91065db82bfa (diff)
version 0.2.40.2.4
-rw-r--r--src/Text/XmlHtml/Common.hs58
-rw-r--r--src/Text/XmlHtml/HTML/Meta.hs19
-rw-r--r--src/Text/XmlHtml/HTML/Render.hs22
-rw-r--r--test/src/Text/XmlHtml/Tests.hs41
-rw-r--r--xmlhtml.cabal14
5 files changed, 125 insertions, 29 deletions
diff --git a/src/Text/XmlHtml/Common.hs b/src/Text/XmlHtml/Common.hs
index 36c9fae..acdad42 100644
--- a/src/Text/XmlHtml/Common.hs
+++ b/src/Text/XmlHtml/Common.hs
@@ -5,6 +5,8 @@
module Text.XmlHtml.Common where
import Blaze.ByteString.Builder
+import Data.Char (isAscii, isLatin1)
+import qualified Data.HashMap.Strict as M
import Data.Maybe
import Data.Text (Text)
@@ -13,6 +15,8 @@ import qualified Data.Text.Encoding as T
import qualified Data.Text.Encoding.Error as TE
import Data.ByteString (ByteString)
+import qualified Data.ByteString.Char8 as BS
+import Text.XmlHtml.HTML.Meta (reversePredefinedRefs)
------------------------------------------------------------------------------
@@ -186,32 +190,66 @@ data InternalSubset = InternalText !Text
------------------------------------------------------------------------------
-- | The character encoding of a document. Currently only the required
-- character encodings are implemented.
-data Encoding = UTF8 | UTF16BE | UTF16LE deriving (Eq, Show)
+data Encoding = UTF8 | UTF16BE | UTF16LE | ISO_8859_1 deriving (Eq, Show)
------------------------------------------------------------------------------
-- | Retrieves the preferred name of a character encoding for embedding in
-- a document.
encodingName :: Encoding -> Text
-encodingName UTF8 = "UTF-8"
-encodingName UTF16BE = "UTF-16"
-encodingName UTF16LE = "UTF-16"
+encodingName UTF8 = "UTF-8"
+encodingName UTF16BE = "UTF-16"
+encodingName UTF16LE = "UTF-16"
+encodingName ISO_8859_1 = "ISO-8859-1"
------------------------------------------------------------------------------
-- | Gets the encoding function from 'Text' to 'ByteString' for an encoding.
encoder :: Encoding -> Text -> ByteString
-encoder UTF8 = T.encodeUtf8
-encoder UTF16BE = T.encodeUtf16BE
-encoder UTF16LE = T.encodeUtf16LE
+encoder UTF8 = T.encodeUtf8
+encoder UTF16BE = T.encodeUtf16BE
+encoder UTF16LE = T.encodeUtf16LE
+encoder ISO_8859_1 = encodeIso_8859_1
+
+
+------------------------------------------------------------------------------
+-- | Encodes UTF-8 Text into bytestring with only latin1 characters
+-- UTF-8 characters found in the input and present in the
+-- 'Text.XmlHtml.Meta.references' map are mapped to their escape sequences,
+-- and any other UTF-8 characters are replaced with ascii "?"
+encodeIso_8859_1 :: Text -> ByteString
+encodeIso_8859_1 t = T.encodeUtf8 . T.concat . map toAsciiChunk $
+ T.groupBy asciiSplits t
+ where
+
+ -- Identify long strings of all-acceptable or all-unacceptable characters
+ -- Acceptable strings are passed through
+ -- Unacceptable strings are mapped to ASCII character by character
+ toAsciiChunk sub =
+ if T.any isAscii sub
+ then sub
+ else T.concat . map toAsciiChar $ T.unpack sub
+ asciiSplits x y = isAscii x == isAscii y
+
+ -- A character's mapping to ascii goes through html entity escaping
+ -- if that character is in the references table
+ -- Otherwise its unicode index is printed to decimal and "&#" is appended
+ toAsciiChar c = maybe
+ (uniEscape c) (\esc -> T.concat ["&", esc, ";"])
+ (M.lookup (T.singleton c) reversePredefinedRefs)
+
+ uniEscape = T.append "&#" . flip T.snoc ';' . T.pack .
+ (show :: Int -> String) . fromEnum
------------------------------------------------------------------------------
-- | Gets the decoding function from 'ByteString' to 'Text' for an encoding.
decoder :: Encoding -> ByteString -> Text
-decoder UTF8 = T.decodeUtf8With (TE.replace '\xFFFF')
-decoder UTF16BE = T.decodeUtf16BEWith (TE.replace '\xFFFF')
-decoder UTF16LE = T.decodeUtf16LEWith (TE.replace '\xFFFF')
+decoder UTF8 = T.decodeUtf8With (TE.replace '\xFFFF')
+decoder UTF16BE = T.decodeUtf16BEWith (TE.replace '\xFFFF')
+decoder UTF16LE = T.decodeUtf16LEWith (TE.replace '\xFFFF')
+decoder ISO_8859_1 = T.decodeLatin1 .
+ BS.map (\c -> if isLatin1 c then c else '?')
------------------------------------------------------------------------------
diff --git a/src/Text/XmlHtml/HTML/Meta.hs b/src/Text/XmlHtml/HTML/Meta.hs
index 094afd2..f624bb6 100644
--- a/src/Text/XmlHtml/HTML/Meta.hs
+++ b/src/Text/XmlHtml/HTML/Meta.hs
@@ -8,7 +8,9 @@ module Text.XmlHtml.HTML.Meta
, isRawText
, endOmittableLast
, endOmittableNext
+ , explicitAttributes
, predefinedRefs
+ , reversePredefinedRefs
) where
#if !MIN_VERSION_base(4,8,0)
@@ -113,6 +115,16 @@ endOmittableNext = M.fromList [
]
------------------------------------------------------------------------------
+-- | Tags and attributes which should always be rendered with an explicit
+-- value, even when the value is empty. This is required by some web browsers
+-- for tags that are typically non-empty.
+{-# NOINLINE explicitAttributes #-}
+explicitAttributes :: HashMap Text (HashSet Text)
+explicitAttributes = M.fromList [
+ ("a", S.fromList ["href"])
+ ]
+
+------------------------------------------------------------------------------
-- | Predefined character entity references as defined by the HTML5 spec.
{-# NOINLINE predefinedRefs #-}
predefinedRefs :: HashMap Text Text
@@ -2534,3 +2546,10 @@ reftab58 =
("zwj", "\x0200D"),
("zwnj", "\x0200C") ]
+
+------------------------------------------------------------------------------
+-- Reverse lookup of Html entities
+reversePredefinedRefs :: HashMap Text Text
+reversePredefinedRefs = M.fromList . map (\(x,y) -> (y,x)) $
+ M.toList predefinedRefs
+
diff --git a/src/Text/XmlHtml/HTML/Render.hs b/src/Text/XmlHtml/HTML/Render.hs
index cd517b8..d4a238e 100644
--- a/src/Text/XmlHtml/HTML/Render.hs
+++ b/src/Text/XmlHtml/HTML/Render.hs
@@ -18,6 +18,7 @@ import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.HashSet as S
+import qualified Data.HashMap.Strict as M
#if !MIN_VERSION_base(4,8,0)
import Data.Monoid
@@ -98,7 +99,7 @@ element e t tb a c
| tb `S.member` voidTags && null c =
fromText e "<"
`mappend` fromText e t
- `mappend` (mconcat $ map (attribute e) a)
+ `mappend` (mconcat $ map (attribute e tb) a)
`mappend` fromText e " />"
| tb `S.member` voidTags =
error $ T.unpack t ++ " must be empty"
@@ -108,7 +109,7 @@ element e t tb a c
not ("</" `T.append` t `T.isInfixOf` s) =
fromText e "<"
`mappend` fromText e t
- `mappend` (mconcat $ map (attribute e) a)
+ `mappend` (mconcat $ map (attribute e tb) a)
`mappend` fromText e ">"
`mappend` fromText e s
`mappend` fromText e "</"
@@ -122,7 +123,7 @@ element e t tb a c
| otherwise =
fromText e "<"
`mappend` fromText e t
- `mappend` (mconcat $ map (attribute e) a)
+ `mappend` (mconcat $ map (attribute e tb) a)
`mappend` fromText e ">"
`mappend` (mconcat $ map (node e) c)
`mappend` fromText e "</"
@@ -131,21 +132,24 @@ element e t tb a c
------------------------------------------------------------------------------
-attribute :: Encoding -> (Text, Text) -> Builder
-attribute e (n,v)
- | v == "" =
+attribute :: Encoding -> Text -> (Text, Text) -> Builder
+attribute e tb (n,v)
+ | v == "" && not explicit =
fromText e " "
`mappend` fromText e n
- | not ("\'" `T.isInfixOf` v) =
+ | v /= "" && not ("\'" `T.isInfixOf` v) =
fromText e " "
`mappend` fromText e n
`mappend` fromText e "=\'"
`mappend` escaped "&" e v
`mappend` fromText e "\'"
- | otherwise =
+ | otherwise =
fromText e " "
`mappend` fromText e n
`mappend` fromText e "=\""
`mappend` escaped "&\"" e v
`mappend` fromText e "\""
-
+ where nbase = T.toLower $ snd $ T.breakOnEnd ":" n
+ explicit = case M.lookup tb explicitAttributes of
+ Nothing -> False
+ Just ns -> nbase `S.member` ns
diff --git a/test/src/Text/XmlHtml/Tests.hs b/test/src/Text/XmlHtml/Tests.hs
index a321e96..32f018b 100644
--- a/test/src/Text/XmlHtml/Tests.hs
+++ b/test/src/Text/XmlHtml/Tests.hs
@@ -561,7 +561,10 @@ xmlRenderingTests = [
testIt "renderEmptyText " renderEmptyText,
testIt "singleQuoteInAttr " singleQuoteInAttr,
testIt "doubleQuoteInAttr " doubleQuoteInAttr,
- testIt "bothQuotesInAttr " bothQuotesInAttr
+ testIt "bothQuotesInAttr " bothQuotesInAttr,
+ testIt "ndashEscapesInLatin " ndashEscapesInLatin,
+ testIt "smileyEscapesInLatin " smileyEscapesInLatin,
+ testIt "numericalEscapes " numericalEscapes
]
renderByteOrderMark :: Bool
@@ -644,6 +647,26 @@ bothQuotesInAttr =
]))
== utf8Decl `B.append` "<foo bar=\"test\'&quot;ing\"/>"
+ndashEscapesInLatin :: Bool
+ndashEscapesInLatin =
+ toByteString (renderXmlFragment ISO_8859_1 ([
+ TextNode "Hello–world"
+ ]))
+ == "Hello&ndash;world"
+
+smileyEscapesInLatin :: Bool
+smileyEscapesInLatin =
+ toByteString (renderXmlFragment ISO_8859_1 ([
+ TextNode "Hello ☺"
+ ]))
+ == "Hello &#9786;"
+
+numericalEscapes :: Bool
+numericalEscapes =
+ fmap (toByteString . renderXmlFragment ISO_8859_1 . docContent)
+ (parseXML "test" "Hello &#174;")
+ == Right "Hello &REG;"
+
------------------------------------------------------------------------------
-- HTML Repeats of XML Rendering Tests ---------------------------------------
@@ -749,6 +772,8 @@ htmlRenderingQuirkTests = [
testIt "renderHTMLRaw2 " renderHTMLRaw2,
testIt "renderHTMLRaw3 " renderHTMLRaw3,
testIt "renderHTMLRaw4 " renderHTMLRaw4,
+ testIt "renderHTMLEmptyAttr " renderHTMLEmptyAttr,
+ testIt "renderHTMLEmptyAttr2 " renderHTMLEmptyAttr2,
testIt "renderHTMLAmpAttr1 " renderHTMLAmpAttr1,
testIt "renderHTMLAmpAttr2 " renderHTMLAmpAttr2,
testIt "renderHTMLAmpAttr3 " renderHTMLAmpAttr3,
@@ -818,6 +843,20 @@ renderHTMLRaw4 = isBottom $
]
]))
+renderHTMLEmptyAttr :: Bool
+renderHTMLEmptyAttr =
+ toByteString (render (HtmlDocument UTF8 Nothing [
+ Element "input" [("checked", "")] []
+ ]))
+ == "<input checked />"
+
+renderHTMLEmptyAttr2 :: Bool
+renderHTMLEmptyAttr2 =
+ toByteString (render (HtmlDocument UTF8 Nothing [
+ Element "a" [("href", "")] []
+ ]))
+ == "<a href=\"\"></a>"
+
renderHTMLAmpAttr1 :: Bool
renderHTMLAmpAttr1 =
toByteString (render (HtmlDocument UTF8 Nothing [
diff --git a/xmlhtml.cabal b/xmlhtml.cabal
index 6c6e263..55b86d6 100644
--- a/xmlhtml.cabal
+++ b/xmlhtml.cabal
@@ -1,5 +1,5 @@
Name: xmlhtml
-Version: 0.2.3.5
+Version: 0.2.4
Synopsis: XML parser and renderer with HTML 5 quirks mode
Description: Contains renderers and parsers for both XML and HTML 5
document fragments, which share data structures so that
@@ -841,26 +841,22 @@ Library
ghc-options: -Wall -fwarn-tabs -fno-warn-orphans
Test-suite testsuite
- hs-source-dirs: src test/src
+ hs-source-dirs: test/src
type: exitcode-stdio-1.0
main-is: TestSuite.hs
build-depends:
- HUnit >= 1.2 && <1.4,
- QuickCheck >= 2.3.0.2,
+ HUnit >= 1.2 && <1.7,
base,
blaze-builder,
blaze-html,
blaze-markup,
bytestring,
- containers,
- directory >= 1.0 && <1.3,
- parsec,
+ directory >= 1.0 && <1.4,
test-framework >= 0.8.0.3 && <0.9,
test-framework-hunit >= 0.3 && <0.4,
- test-framework-quickcheck2 >= 0.3 && <0.4,
text,
- unordered-containers
+ xmlhtml
ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -threaded
-fno-warn-unused-do-bind