summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorocramz <>2020-03-09 08:01:00 (GMT)
committerhdiff <hdiff@hdiff.luite.com>2020-03-09 08:01:00 (GMT)
commit1dd6172e1af9fa10dad1aae168987686f9dc9d97 (patch)
tree2d147de8154374e58b4d22061ab90b247796d7f4
parente9ee106d6e482af5bf802c0106e6508c71ca5556 (diff)
version 0.40.4
-rw-r--r--CHANGELOG.markdown26
-rw-r--r--app/Bench.hs45
-rw-r--r--bench/Memory.hs22
-rw-r--r--bench/Speed.hs113
-rw-r--r--bench/SpeedBigFiles.hs122
-rw-r--r--src/Xeno/DOM.hs297
-rw-r--r--src/Xeno/DOM/Internal.hs123
-rw-r--r--src/Xeno/DOM/Robust.hs155
-rw-r--r--src/Xeno/Errors.hs1
-rw-r--r--src/Xeno/SAX.hs344
-rw-r--r--src/Xeno/Types.hs24
-rw-r--r--test/Main.hs106
-rw-r--r--xeno.cabal27
13 files changed, 1005 insertions, 400 deletions
diff --git a/CHANGELOG.markdown b/CHANGELOG.markdown
index 96ffdb4..e47674b 100644
--- a/CHANGELOG.markdown
+++ b/CHANGELOG.markdown
@@ -1,4 +1,16 @@
- 0.3.5.2
+ 0.4
+ * A number of optimizations and some changes in ergonomics. Thanks to Dmitry Krylov (dmalkr) and Michal Gajda (mgajda) !
+ * breaking API changes :
+ * The parameters to function 'Xeno.SAX.process' are now wrapped in a Process type
+ * Speed optimizations :
+ * function 'Xeno.DOM.predictGrowSize'
+ * Xeno.DOM.Robust
+ * Benchmark improvements :
+ * Added benchmarks for ByteStringZeroTerminated
+ * Added benchmarks for big files (bench/SpeedBigFiles.hs)
+ * Benchmarks run non-threaded
+
+ 0.3.5.2
* Fix dependency lower bounds (GHC 8.0.1 is the earliest version currently supported)
0.3.5
@@ -9,15 +21,15 @@
* Fixed typos in the examples (unhammer)
0.3.2
- Fixed DOM parsing from bytestrings with non-zero offset (#11, qrilka)
+ * Fixed DOM parsing from bytestrings with non-zero offset (#11, qrilka)
0.3
- Fixed name parsing (for attributes and tags) so it conforms with the XML spec (qrilka)
- Fixed parsing failure when root tag is preceded by white space (though without checking for white space characters specifically) (qrilka)
- Added contribution guidelines (ocramz)
+ * Fixed name parsing (for attributes and tags) so it conforms with the XML spec (qrilka)
+ * Fixed parsing failure when root tag is preceded by white space (though without checking for white space characters specifically) (qrilka)
+ * Added contribution guidelines (ocramz)
0.2
- Added CDATA support (Rembane)
+ * Added CDATA support (Rembane)
0.1
- First Hackage release
+ * First Hackage release
diff --git a/app/Bench.hs b/app/Bench.hs
new file mode 100644
index 0000000..79daa6a
--- /dev/null
+++ b/app/Bench.hs
@@ -0,0 +1,45 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE QuasiQuotes #-}
+import Control.Monad
+import Data.Time.Clock
+import System.IO.Posix.MMap
+import System.Mem
+import Xeno.DOM
+import qualified Data.ByteString as BS
+
+
+main :: IO ()
+main = do
+ let prefix = "ex-data/"
+ files' = map (prefix ++)
+ [ {- 921 Mb -} "1htq.xml"
+ , {- 190 Mb -} "enwiki-20190901-abstract10.xml"
+ , {- 1.6 Gb -} "enwiki-20190901-pages-logging1.xml"
+ , {- 4.0 Gb -} "enwiki-20190901-pages-meta-current24.xml-p30503451p32003451.xml"
+ -- , {- 21 Gb -} "enwiki-20190901-pages-meta-history2.xml"
+ ]
+ files = concat $ replicate 5 files'
+ --
+ deltas <- forM files $ \fn -> do
+ putStrLn $ "Processing file '" ++ show fn ++ "'"
+ --
+ -- NOTE: The file needs to be cached in memory BEFORE starting the test.
+ -- It can be done with `vmtouch` utility for example (`vmtouch -vtL *`).
+ --
+ bs <- unsafeMMapFile fn
+ -- bs <- BS.readFile fn
+ putStrLn $ " size: " ++ show (BS.length bs `div` (1024*1024)) ++ " Mb"
+ performGC
+ start <- getCurrentTime
+ -- SAX:
+ -- let res = validate bs
+ -- putStrLn [qc| process result: {res}|]
+ -- DOM:
+ (\(Right !_node) -> putStrLn " processed!") (parse bs)
+ finish <- getCurrentTime
+ let delta = finish `diffUTCTime` start
+ putStrLn $ " processing time: " ++ show delta
+ return delta
+ --
+ putStrLn "------"
+ putStrLn $ "Total: " ++ show (sum deltas)
diff --git a/bench/Memory.hs b/bench/Memory.hs
index 04d5f8d..3c40b20 100644
--- a/bench/Memory.hs
+++ b/bench/Memory.hs
@@ -10,6 +10,7 @@ import qualified Data.ByteString as S
import qualified Text.XML.Hexml as Hexml
import Weigh
import qualified Xeno.DOM
+import qualified Xeno.DOM.Robust
import qualified Xeno.SAX
main :: IO ()
@@ -18,15 +19,18 @@ main = do
f31kb <- S.readFile "data/text-31kb.xml"
f211kb <- S.readFile "data/fabricated-211kb.xml"
mainWith
- (do func "4kb/hexml/dom" Hexml.parse f4kb
- func "4kb/xeno/sax" Xeno.SAX.validate f4kb
- func "4kb/xeno/dom" Xeno.DOM.parse f4kb
- func "31kb/hexml/dom" Hexml.parse f31kb
- func "31kb/xeno/sax" Xeno.SAX.validate f31kb
- func "31kb/xeno/dom" Xeno.DOM.parse f31kb
- func "211kb/hexml/dom" Hexml.parse f211kb
- func "211kb/xeno/sax" Xeno.SAX.validate f211kb
- func "211kb/xeno/dom" Xeno.DOM.parse f211kb)
+ (do func "4kb_hexml_dom" Hexml.parse f4kb
+ func "4kb_xeno_sax" Xeno.SAX.validate f4kb
+ func "4kb_xeno_dom" Xeno.DOM.parse f4kb
+ func "4kb_xeno_dom-with-recovery" Xeno.DOM.Robust.parse f4kb
+ func "31kb_hexml_dom" Hexml.parse f31kb
+ func "31kb_xeno_sax" Xeno.SAX.validate f31kb
+ func "31kb_xeno_dom" Xeno.DOM.parse f31kb
+ func "31kb_xeno_dom-with-recovery" Xeno.DOM.Robust.parse f31kb
+ func "211kb_hexml_dom" Hexml.parse f211kb
+ func "211kb_xeno_sax" Xeno.SAX.validate f211kb
+ func "211kb_xeno_dom" Xeno.DOM.parse f211kb
+ func "211kb_xeno_dom-with-recovery" Xeno.DOM.Robust.parse f211kb)
instance NFData Hexml.Node where
rnf !_ = ()
diff --git a/bench/Speed.hs b/bench/Speed.hs
index f96f1d4..46bbea3 100644
--- a/bench/Speed.hs
+++ b/bench/Speed.hs
@@ -18,90 +18,55 @@ import qualified Text.XML.Expat.SAX as Hexpat
import qualified Text.XML.Expat.Tree as HexpatTree
import qualified Text.XML.Hexml as Hexml
import Text.XML.Light as XML
-import Text.XML.Light.Input as XML
import qualified Xeno.SAX
+import qualified Xeno.Types
import qualified Xeno.DOM
+import qualified Xeno.DOM.Robust
#ifdef LIBXML2
import qualified Text.XML.LibXML.Parser as Libxml2
#endif
+
+readFileZ :: FilePath -> IO (ByteString, Xeno.Types.ByteStringZeroTerminated)
+readFileZ fn = do
+ !s <- S.readFile fn
+ let !sz = Xeno.Types.BSZT (s `S.snoc` 0)
+ return (s, sz)
+
+
main :: IO ()
-main =
- defaultMain
- [ env
- (S.readFile "data/books-4kb.xml")
- (\input ->
- bgroup
- "4KB"
- [ bench "hexml-dom" (whnf Hexml.parse input)
- , bench "xeno-sax" (whnf Xeno.SAX.validate input)
- , bench "xeno-dom" (whnf Xeno.DOM.parse input)
- , bench
- "hexpat-sax"
- (whnf
- ((Hexpat.parseThrowing Hexpat.defaultParseOptions :: L.ByteString -> [Hexpat.SAXEvent ByteString ByteString]) .
- L.fromStrict)
- input)
- , bench
- "hexpat-dom"
- (whnf
- ((HexpatTree.parse' HexpatTree.defaultParseOptions :: ByteString -> Either HexpatTree.XMLParseError (HexpatTree.Node ByteString ByteString)))
- input)
- , bench "xml-dom" (nf XML.parseXMLDoc input)
+main = defaultMain $
+ (flip map) [ ("4KB", "data/books-4kb.xml")
+ , ("31KB", "data/text-31kb.xml")
+ , ("211KB", "data/fabricated-211kb.xml")
+ ]
+ $ \(group, fn) ->
+ env (readFileZ fn)
+ (\ ~(!input, !inputz) -> bgroup group
+ [ bench "hexml-dom" (whnf Hexml.parse input)
+ , bench "xeno-sax" (whnf Xeno.SAX.validate input)
+ , bench "xeno-sax-z" (whnf Xeno.SAX.validate inputz)
+ , bench "xeno-sax-ex" (whnf Xeno.SAX.validateEx input)
+ , bench "xeno-sax-ex-z" (whnf Xeno.SAX.validateEx inputz)
+ , bench "xeno-dom" (whnf Xeno.DOM.parse input)
+ , bench "xeno-dom-with-recovery" (whnf Xeno.DOM.Robust.parse input)
+ , bench
+ "hexpat-sax"
+ (whnf
+ ((Hexpat.parseThrowing Hexpat.defaultParseOptions :: L.ByteString -> [Hexpat.SAXEvent ByteString ByteString]) .
+ L.fromStrict)
+ input)
+ , bench
+ "hexpat-dom"
+ (whnf
+ ((HexpatTree.parse' HexpatTree.defaultParseOptions :: ByteString -> Either HexpatTree.XMLParseError (HexpatTree.Node ByteString ByteString)))
+ input)
+ , bench "xml-dom" (nf XML.parseXMLDoc input)
#ifdef LIBXML2
- , bench "libxml2-dom" (whnfIO (Libxml2.parseMemory input))
+ , bench "libxml2-dom" (whnfIO (Libxml2.parseMemory input))
#endif
- ])
- , env
- (S.readFile "data/text-31kb.xml")
- (\input ->
- bgroup
- "31KB"
- [ bench "hexml-dom" (whnf Hexml.parse input)
- , bench "xeno-sax" (whnf Xeno.SAX.validate input)
- , bench "xeno-dom" (whnf Xeno.DOM.parse input)
- , bench
- "hexpat-sax"
- (nf
- ((Hexpat.parseThrowing Hexpat.defaultParseOptions :: L.ByteString -> [Hexpat.SAXEvent ByteString ByteString]) .
- L.fromStrict)
- input)
- , bench
- "hexpat-dom"
- (whnf
- ((HexpatTree.parse' HexpatTree.defaultParseOptions :: ByteString -> Either HexpatTree.XMLParseError (HexpatTree.Node ByteString ByteString)))
- input)
- , bench "xml-dom" (nf XML.parseXMLDoc input)
-#ifdef LIBXML2
- , bench "libxml2-dom" (whnfIO (Libxml2.parseMemory input))
+ ])
-#endif
- ])
- , env
- (S.readFile "data/fabricated-211kb.xml")
- (\input ->
- bgroup
- "211KB"
- [ bench "hexml-dom" (whnf Hexml.parse input)
- , bench "xeno-sax" (whnf Xeno.SAX.validate input)
- , bench "xeno-dom" (whnf Xeno.DOM.parse input)
- , bench
- "hexpat-sax"
- (nf
- ((Hexpat.parseThrowing Hexpat.defaultParseOptions :: L.ByteString -> [Hexpat.SAXEvent ByteString ByteString]) .
- L.fromStrict)
- input)
- , bench
- "hexpat-dom"
- (whnf
- ((HexpatTree.parse' HexpatTree.defaultParseOptions :: ByteString -> Either HexpatTree.XMLParseError (HexpatTree.Node ByteString ByteString)))
- input)
- , bench "xml-dom" (nf XML.parseXMLDoc input)
-#ifdef LIBXML2
- , bench "libxml2-dom" (whnfIO (Libxml2.parseMemory input))
-#endif
- ])
- ]
deriving instance Generic Content
deriving instance Generic Element
diff --git a/bench/SpeedBigFiles.hs b/bench/SpeedBigFiles.hs
new file mode 100644
index 0000000..c651458
--- /dev/null
+++ b/bench/SpeedBigFiles.hs
@@ -0,0 +1,122 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE StandaloneDeriving #-}
+{-# OPTIONS_GHC -fno-warn-orphans -Wno-unused-imports #-}
+
+-- | Benchmark speed with big files
+
+module Main where
+
+
+import Codec.Compression.BZip
+import Control.DeepSeq
+import Criterion
+import Criterion.Main
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Lazy as L
+import Data.List (delete)
+import GHC.Generics
+import System.FilePath.Posix
+import qualified Text.XML.Expat.SAX as Hexpat
+import qualified Text.XML.Expat.Tree as HexpatTree
+import qualified Text.XML.Hexml as Hexml
+import Text.XML.Light as XML
+import Text.XML.Light.Input as XML
+import qualified Xeno.Types
+import qualified Xeno.SAX
+import qualified Xeno.DOM
+import qualified Xeno.DOM.Robust
+import qualified Data.ByteString as S
+#ifdef LIBXML2
+import qualified Text.XML.LibXML.Parser as Libxml2
+#endif
+
+
+main :: IO ()
+main = defaultMain
+ [ benchFile allTests "46MB" "enwiki-20190901-pages-articles14.xml-p7697599p7744799.bz2"
+ , benchFile allTests "624MB" "enwiki-20190901-pages-articles-multistream1.xml-p10p30302.bz2"
+ , benchFile allTests "921MB" "1HTQ.xml.bz2"
+ , benchFile allTests "1.6Gb" "enwiki-20190901-pages-meta-current6.xml-p565314p892912.bz2"
+ , benchFile allExceptHexml "4Gb" "enwiki-20190901-pages-meta-current24.xml-p30503451p32003451.bz2"
+ -- , benchFile allExceptHexml "21Gb" "enwiki-20190901-pages-meta-history2.xml-p31255p31720.bz2"
+ ]
+
+
+allTests :: [String]
+allTests = [ "hexml-dom"
+ , "xeno-sax"
+ , "xeno-sax-z"
+ -- , "xeno-sax-ex"
+ -- , "xeno-sax-ex-z"
+ , "xeno-dom"
+ , "xeno-dom-with-recovery"
+ -- XXX: the "hexpat" and "xml-dom" libraries don't work with big files; they require too much memory
+ -- , "hexpat-sax"
+ -- , "hexpat-dom"
+ -- , "xml-dom"
+ -- , "libxml2-dom"
+ ]
+
+
+allExceptHexml :: [String]
+allExceptHexml = "hexml-dom" `delete` allTests
+
+
+benchFile :: [String] -> String -> FilePath -> Benchmark
+benchFile enabledTests size fn =
+ env (readBZip2File fn)
+ (\ ~(input, inputz) -> bgroup size $ benchMethods enabledTests input inputz)
+
+
+benchMethods :: [String] -> ByteString -> Xeno.Types.ByteStringZeroTerminated -> [Benchmark]
+benchMethods enabledTests input inputz =
+ runBench "hexml-dom" (whnf Hexml.parse input)
+ ++ runBench "xeno-sax" (whnf Xeno.SAX.validate input)
+ ++ runBench "xeno-sax-z" (whnf Xeno.SAX.validate inputz)
+ ++ runBench "xeno-sax-ex " (whnf Xeno.SAX.validateEx input)
+ ++ runBench "xeno-sax-ex-z" (whnf Xeno.SAX.validateEx inputz)
+ ++ runBench "xeno-dom" (whnf Xeno.DOM.parse input)
+ ++ runBench "xeno-dom-with-recovery" (whnf Xeno.DOM.Robust.parse input)
+ ++ runBench
+ "hexpat-sax"
+ (whnf
+ ((Hexpat.parseThrowing Hexpat.defaultParseOptions :: L.ByteString -> [Hexpat.SAXEvent ByteString ByteString]) .
+ L.fromStrict)
+ input)
+ ++ runBench
+ "hexpat-dom"
+ (whnf
+ ((HexpatTree.parse' HexpatTree.defaultParseOptions :: ByteString -> Either HexpatTree.XMLParseError (HexpatTree.Node ByteString ByteString)))
+ input)
+ ++ runBench "xml-dom" (nf XML.parseXMLDoc input)
+#ifdef LIBXML2
+ ++ runBench "libxml2-dom" (whnfIO (Libxml2.parseMemory input))
+#endif
+ where
+ runBench name act
+ | name `elem` enabledTests = [bench name act]
+ | otherwise = []
+
+
+readBZip2File :: FilePath -> IO (ByteString, Xeno.Types.ByteStringZeroTerminated)
+readBZip2File fn = do
+ file <- L.readFile ("data" </> "ex" </> fn)
+ let !bs = L.toStrict $ decompress file
+ !bsz = Xeno.Types.BSZT $ bs `S.snoc` 0
+ return (bs, bsz)
+
+
+deriving instance Generic Content
+deriving instance Generic Element
+deriving instance Generic CData
+deriving instance Generic CDataKind
+deriving instance Generic QName
+deriving instance Generic Attr
+instance NFData Content
+instance NFData Element
+instance NFData CData
+instance NFData CDataKind
+instance NFData QName
+instance NFData Attr
diff --git a/src/Xeno/DOM.hs b/src/Xeno/DOM.hs
index 78feadb..25369db 100644
--- a/src/Xeno/DOM.hs
+++ b/src/Xeno/DOM.hs
@@ -1,8 +1,8 @@
-{-# LANGUAGE BangPatterns #-}
-{-# LANGUAGE DeriveAnyClass #-}
-{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE DeriveDataTypeable #-}
+{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
-
-- | DOM parser and API for XML.
module Xeno.DOM
@@ -15,120 +15,23 @@ module Xeno.DOM
, children
) where
-import Control.DeepSeq
import Control.Monad.ST
import Control.Spork
-import Data.ByteString (ByteString)
-import qualified Data.ByteString as S
+import Data.ByteString (ByteString)
import Data.ByteString.Internal (ByteString(PS))
+import qualified Data.ByteString as S
import Data.Mutable
import Data.STRef
-import Data.Vector.Unboxed ((!))
import qualified Data.Vector.Unboxed as UV
import qualified Data.Vector.Unboxed.Mutable as UMV
import Xeno.SAX
import Xeno.Types
-
--- | Some XML nodes.
-data Node = Node !ByteString !Int !(UV.Vector Int)
- deriving (Eq)
-
-instance NFData Node where
- rnf !_ = ()
-
-instance Show Node where
- show n =
- "(Node " ++
- show (name n) ++
- " " ++ show (attributes n) ++ " " ++ show (contents n) ++ ")"
-
--- | Content of a node.
-data Content
- = Element {-# UNPACK #-}!Node
- | Text {-# UNPACK #-}!ByteString
- | CData {-# UNPACK #-}!ByteString
- deriving (Eq, Show)
-
--- | Get just element children of the node (no text).
-children :: Node -> [Node]
-children (Node str start offsets) = collect firstChild
- where
- collect i
- | i < endBoundary =
- case offsets ! i of
- 0x00 -> Node str i offsets : collect (offsets ! (i + 4))
- 0x01 -> collect (i + 3)
- _ -> []
- | otherwise = []
- firstChild = go (start + 5)
- where
- go i
- | i < endBoundary =
- case offsets ! i of
- 0x02 -> go (i + 5)
- _ -> i
- | otherwise = i
- endBoundary = offsets ! (start + 4)
-
--- | Contents of a node.
-contents :: Node -> [Content]
-contents (Node str start offsets) = collect firstChild
- where
- collect i
- | i < endBoundary =
- case offsets ! i of
- 0x00 ->
- Element
- (Node str i offsets) :
- collect (offsets ! (i + 4))
- 0x01 ->
- Text (substring str (offsets ! (i + 1)) (offsets ! (i + 2))) :
- collect (i + 3)
- 0x03 ->
- CData (substring str (offsets ! (i + 1)) (offsets ! (i + 2))) :
- collect (i + 3)
- _ -> []
- | otherwise = []
- firstChild = go (start + 5)
- where
- go i | i < endBoundary =
- case offsets ! i of
- 0x02 -> go (i + 5)
- _ -> i
- | otherwise = i
- endBoundary = offsets ! (start + 4)
-
--- | Attributes of a node.
-attributes :: Node -> [(ByteString,ByteString)]
-attributes (Node str start offsets) = collect (start + 5)
- where
- collect i
- | i < endBoundary =
- case offsets ! i of
- 0x02 ->
- ( substring str (offsets ! (i + 1)) (offsets ! (i + 2))
- , substring str (offsets ! (i + 3)) (offsets ! (i + 4))) :
- collect (i + 5)
- _ -> []
- | otherwise = []
- endBoundary = offsets ! (start + 4)
-
--- | Name of the element.
-name :: Node -> ByteString
-name (Node str start offsets) =
- case offsets ! start of
- 0x00 -> substring str (offsets ! (start + 2)) (offsets ! (start + 3))
- _ -> mempty
+import Xeno.DOM.Internal
-- | Parse a complete Nodes document.
parse :: ByteString -> Either XenoException Node
parse str =
- case spork node of
- Left e -> Left e
- Right r ->
- case findRootNode r of
- Just n -> Right n
- Nothing -> Left XenoExpectRootNode
+ maybe (Left XenoExpectRootNode) Right . findRootNode =<< spork node
where
findRootNode r = go 0
where
@@ -137,99 +40,111 @@ parse str =
-- skipping text assuming that it contains only white space
-- characters
Just 0x1 -> go (n+3)
- _ -> Nothing
+ _ -> Nothing
PS _ offset0 _ = str
node =
+ let !initialSize = max 1000 (S.length str `div` 8) in
runST
- (do nil <- UMV.new 1000
+ (do nil <- UMV.unsafeNew initialSize
vecRef <- newSTRef nil
sizeRef <- fmap asURef (newRef 0)
parentRef <- fmap asURef (newRef 0)
- process
- (\(PS _ name_start name_len) -> do
- let tag = 0x00
- tag_end = -1
- index <- readRef sizeRef
- v' <-
- do v <- readSTRef vecRef
- if index + 5 < UMV.length v
- then pure v
- else do
- v' <- UMV.grow v (UMV.length v)
- writeSTRef vecRef v'
- return v'
- tag_parent <- readRef parentRef
- do writeRef parentRef index
- writeRef sizeRef (index + 5)
- do UMV.write v' index tag
- UMV.write v' (index + 1) tag_parent
- UMV.write v' (index + 2) (name_start - offset0)
- UMV.write v' (index + 3) name_len
- UMV.write v' (index + 4) tag_end)
- (\(PS _ key_start key_len) (PS _ value_start value_len) -> do
- index <- readRef sizeRef
- v' <-
- do v <- readSTRef vecRef
- if index + 5 < UMV.length v
- then pure v
- else do
- v' <- UMV.grow v (UMV.length v)
- writeSTRef vecRef v'
- return v'
- let tag = 0x02
- do writeRef sizeRef (index + 5)
- do UMV.write v' index tag
- UMV.write v' (index + 1) (key_start - offset0)
- UMV.write v' (index + 2) key_len
- UMV.write v' (index + 3) (value_start - offset0)
- UMV.write v' (index + 4) value_len)
- (\_ -> return ())
- (\(PS _ text_start text_len) -> do
- let tag = 0x01
- index <- readRef sizeRef
- v' <-
- do v <- readSTRef vecRef
- if index + 3 < UMV.length v
- then pure v
- else do
- v' <- UMV.grow v (UMV.length v)
- writeSTRef vecRef v'
- return v'
- do writeRef sizeRef (index + 3)
- do UMV.write v' index tag
- UMV.write v' (index + 1) (text_start - offset0)
- UMV.write v' (index + 2) text_len)
- (\_ -> do
- v <- readSTRef vecRef
- -- Set the tag_end slot of the parent.
- parent <- readRef parentRef
- index <- readRef sizeRef
- UMV.write v (parent + 4) index
- -- Pop the stack and return to the parent element.
- previousParent <- UMV.read v (parent + 1)
- writeRef parentRef previousParent)
- (\(PS _ cdata_start cdata_len) -> do
- let tag = 0x03
- index <- readRef sizeRef
- v' <-
- do v <- readSTRef vecRef
- if index + 3 < UMV.length v
- then pure v
- else do
- v' <- UMV.grow v (UMV.length v)
- writeSTRef vecRef v'
- return v'
- do writeRef sizeRef (index + 3)
- do UMV.write v' index tag
- UMV.write v' (index + 1) (cdata_start - offset0)
- UMV.write v' (index + 2) cdata_len)
- str
+ process Process {
+ openF = \(PS _ name_start name_len) -> do
+ let tag = 0x00
+ tag_end = -1
+ index <- readRef sizeRef
+ v' <-
+ do v <- readSTRef vecRef
+ if index + 5 < UMV.length v
+ then pure v
+ else do
+ v' <- UMV.unsafeGrow v (predictGrowSize name_start name_len (index + 5) (UMV.length v))
+ writeSTRef vecRef v'
+ return v'
+ tag_parent <- readRef parentRef
+ do writeRef parentRef index
+ writeRef sizeRef (index + 5)
+ do UMV.unsafeWrite v' index tag
+ UMV.unsafeWrite v' (index + 1) tag_parent
+ UMV.unsafeWrite v' (index + 2) (name_start - offset0)
+ UMV.unsafeWrite v' (index + 3) name_len
+ UMV.unsafeWrite v' (index + 4) tag_end
+ , attrF = \(PS _ key_start key_len) (PS _ value_start value_len) -> do
+ index <- readRef sizeRef
+ v' <-
+ do v <- readSTRef vecRef
+ if index + 5 < UMV.length v
+ then pure v
+ else do
+ v' <- UMV.unsafeGrow v (predictGrowSize value_start value_len (index + 5) (UMV.length v))
+ writeSTRef vecRef v'
+ return v'
+ let tag = 0x02
+ do writeRef sizeRef (index + 5)
+ do UMV.unsafeWrite v' index tag
+ UMV.unsafeWrite v' (index + 1) (key_start - offset0)
+ UMV.unsafeWrite v' (index + 2) key_len
+ UMV.unsafeWrite v' (index + 3) (value_start - offset0)
+ UMV.unsafeWrite v' (index + 4) value_len
+ , endOpenF = \_ -> return ()
+ , textF = \(PS _ text_start text_len) -> do
+ let tag = 0x01
+ index <- readRef sizeRef
+ v' <-
+ do v <- readSTRef vecRef
+ if index + 3 < UMV.length v
+ then pure v
+ else do
+ v' <- UMV.unsafeGrow v (predictGrowSize text_start text_len (index + 3) (UMV.length v))
+ writeSTRef vecRef v'
+ return v'
+ do writeRef sizeRef (index + 3)
+ do UMV.unsafeWrite v' index tag
+ UMV.unsafeWrite v' (index + 1) (text_start - offset0)
+ UMV.unsafeWrite v' (index + 2) text_len
+ , closeF = \_ -> do
+ v <- readSTRef vecRef
+ -- Set the tag_end slot of the parent.
+ parent <- readRef parentRef
+ index <- readRef sizeRef
+ UMV.unsafeWrite v (parent + 4) index
+ -- Pop the stack and return to the parent element.
+ previousParent <- UMV.unsafeRead v (parent + 1)
+ writeRef parentRef previousParent
+ , cdataF = \(PS _ cdata_start cdata_len) -> do
+ let tag = 0x03
+ index <- readRef sizeRef
+ v' <- do
+ v <- readSTRef vecRef
+ if index + 3 < UMV.length v
+ then pure v
+ else do
+ v' <- UMV.unsafeGrow v (predictGrowSize cdata_start cdata_len (index + 3) (UMV.length v))
+ writeSTRef vecRef v'
+ return v'
+ writeRef sizeRef (index + 3)
+ UMV.unsafeWrite v' index tag
+ UMV.unsafeWrite v' (index + 1) (cdata_start - offset0)
+ UMV.unsafeWrite v' (index + 2) cdata_len
+ } str
wet <- readSTRef vecRef
arr <- UV.unsafeFreeze wet
size <- readRef sizeRef
return (UV.unsafeSlice 0 size arr))
-
--- | Get a substring of the BS.
-substring :: ByteString -> Int -> Int -> ByteString
-substring s' start len = S.take len (S.drop start s')
-{-# INLINE substring #-}
+ where
+ -- Growing a large vector is slow, so we need to do it less times.
+ -- We can predict final array size after processing some part (i.e. 1/4) of input XML.
+ --
+ -- predictGrowSize _bsStart _bsLen _index vecLen = round $ fromIntegral vecLen * (1.25 :: Double)
+ predictGrowSize bsStart bsLen index vecLen =
+ let processedLen = bsStart + bsLen - offset0
+ -- 1. Using integral operations, such as
+ -- "predictedTotalSize = (index * S.length str) `div` processedLen"
+ -- cause overflow, so we use float.
+ -- 2. Slightly enlarge predicted size to compensate for the copy on vector grow
+ -- if prediction is incorrect
+ k = (1.25 :: Double) * fromIntegral (S.length str) / fromIntegral processedLen
+ predictedTotalSize = round $ fromIntegral index * k
+ growSize = predictedTotalSize - vecLen
+ in growSize
diff --git a/src/Xeno/DOM/Internal.hs b/src/Xeno/DOM/Internal.hs
new file mode 100644
index 0000000..660f5f6
--- /dev/null
+++ b/src/Xeno/DOM/Internal.hs
@@ -0,0 +1,123 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE DeriveDataTypeable #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+-- | Efficient DOM data structure
+module Xeno.DOM.Internal
+ ( Node(..)
+ , Content(..)
+ , name
+ , attributes
+ , contents
+ , children
+ ) where
+
+import Control.DeepSeq
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as S
+import Data.Data (Data, Typeable)
+import Data.Vector.Unboxed ((!))
+import qualified Data.Vector.Unboxed as UV
+
+--import Debug.Trace
+--trace _ a = a
+
+-- | Some XML nodes.
+data Node = Node !ByteString !Int !(UV.Vector Int)
+ deriving (Eq, Data, Typeable)
+
+instance NFData Node where
+ rnf !_ = ()
+
+instance Show Node where
+ show n =
+ "(Node " ++
+ show (name n) ++
+ " " ++ show (attributes n) ++ " " ++ show (contents n) ++ ")"
+
+-- | Content of a node.
+data Content
+ = Element {-# UNPACK #-}!Node
+ | Text {-# UNPACK #-}!ByteString
+ | CData {-# UNPACK #-}!ByteString
+ deriving (Eq, Show, Data, Typeable)
+
+instance NFData Content where
+ rnf !_ = ()
+
+-- | Get just element children of the node (no text).
+children :: Node -> [Node]
+children (Node str start offsets) = collect firstChild
+ where
+ collect i
+ | i < endBoundary =
+ case offsets ! i of
+ 0x00 -> Node str i offsets : collect (offsets ! (i + 4))
+ 0x01 -> collect (i + 3)
+ _off -> [] -- trace ("Offsets " <> show i <> " is " <> show off) []
+ | otherwise = []
+ firstChild = go (start + 5)
+ where
+ go i
+ | i < endBoundary =
+ case offsets ! i of
+ 0x02 -> go (i + 5)
+ _ -> i
+ | otherwise = i
+ endBoundary = offsets ! (start + 4)
+
+-- | Contents of a node.
+contents :: Node -> [Content]
+contents (Node str start offsets) = collect firstChild
+ where
+ collect i
+ | i < endBoundary =
+ case offsets ! i of
+ 0x00 ->
+ Element
+ (Node str i offsets) :
+ collect (offsets ! (i + 4))
+ 0x01 ->
+ Text (substring str (offsets ! (i + 1)) (offsets ! (i + 2))) :
+ collect (i + 3)
+ 0x03 ->
+ CData (substring str (offsets ! (i + 1)) (offsets ! (i + 2))) :
+ collect (i + 3)
+ _ -> []
+ | otherwise = []
+ firstChild = go (start + 5)
+ where
+ go i | i < endBoundary =
+ case offsets ! i of
+ 0x02 -> go (i + 5)
+ _ -> i
+ | otherwise = i
+ endBoundary = offsets ! (start + 4)
+
+-- | Attributes of a node.
+attributes :: Node -> [(ByteString,ByteString)]
+attributes (Node str start offsets) = collect (start + 5)
+ where
+ collect i
+ | i < endBoundary =
+ case offsets ! i of
+ 0x02 ->
+ ( substring str (offsets ! (i + 1)) (offsets ! (i + 2))
+ , substring str (offsets ! (i + 3)) (offsets ! (i + 4))) :
+ collect (i + 5)
+ _ -> []
+ | otherwise = []
+ endBoundary = offsets ! (start + 4)
+
+-- | Name of the element.
+name :: Node -> ByteString
+name (Node str start offsets) =
+ case offsets ! start of
+ 0x00 -> substring str (offsets ! (start + 2)) (offsets ! (start + 3))
+ _ -> error "Node cannot have empty name" -- mempty
+
+-- | Get a substring of the BS.
+substring :: ByteString -> Int -> Int -> ByteString
+substring s' start len = S.take len (S.drop start s')
+{-# INLINE substring #-}
diff --git a/src/Xeno/DOM/Robust.hs b/src/Xeno/DOM/Robust.hs
new file mode 100644
index 0000000..cbacea4
--- /dev/null
+++ b/src/Xeno/DOM/Robust.hs
@@ -0,0 +1,155 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE DeriveDataTypeable #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+-- | DOM parser and API for XML.
+-- Slightly slower DOM parsing,
+-- but adds missing close tags.
+module Xeno.DOM.Robust
+ ( parse
+ , Node
+ , Content(..)
+ , name
+ , attributes
+ , contents
+ , children
+ ) where
+
+import Control.Monad.ST
+import Control.Spork
+import Data.ByteString.Internal(ByteString(..))
+import Data.STRef
+import qualified Data.Vector.Unboxed as UV
+import qualified Data.Vector.Unboxed.Mutable as UMV
+import Data.Mutable(asURef, newRef, readRef, writeRef)
+import Xeno.SAX
+import Xeno.Types
+import Xeno.DOM.Internal(Node(..), Content(..), name, attributes, contents, children)
+
+-- | Parse a complete Nodes document.
+parse :: ByteString -> Either XenoException Node
+parse inp =
+ case spork node of
+ Left e -> Left e
+ Right r ->
+ case findRootNode r of
+ Just n -> Right n
+ Nothing -> Left XenoExpectRootNode
+ where
+ findRootNode r = go 0
+ where
+ go n = case r UV.!? n of
+ Just 0x0 -> Just (Node str n r)
+ -- skipping text assuming that it contains only white space
+ -- characters
+ Just 0x1 -> go (n+3)
+ _ -> Nothing
+ PS _ offset0 _ = str
+ str = skipDoctype inp
+ node =
+ runST
+ (do nil <- UMV.new 1000
+ vecRef <- newSTRef nil
+ sizeRef <- fmap asURef $ newRef 0
+ parentRef <- fmap asURef $ newRef 0
+ process Process {
+ openF = \(PS _ name_start name_len) -> do
+ let tag = 0x00
+ tag_end = -1
+ index <- readRef sizeRef
+ v' <-
+ do v <- readSTRef vecRef
+ if index + 5 < UMV.length v
+ then pure v
+ else do
+ v' <- UMV.grow v (UMV.length v)
+ writeSTRef vecRef v'
+ return v'
+ tag_parent <- readRef parentRef
+ do writeRef parentRef index
+ writeRef sizeRef (index + 5)
+ UMV.write v' index tag
+ UMV.write v' (index + 1) tag_parent
+ UMV.write v' (index + 2) (name_start - offset0)
+ UMV.write v' (index + 3) name_len
+ UMV.write v' (index + 4) tag_end
+ , attrF = \(PS _ key_start key_len) (PS _ value_start value_len) -> do
+ index <- readRef sizeRef
+ v' <-
+ do v <- readSTRef vecRef
+ if index + 5 < UMV.length v
+ then pure v
+ else do
+ v' <- UMV.grow v (UMV.length v)
+ writeSTRef vecRef v'
+ return v'
+ let tag = 0x02
+ do writeRef sizeRef (index + 5)
+ do UMV.write v' index tag
+ UMV.write v' (index + 1) (key_start - offset0)
+ UMV.write v' (index + 2) key_len
+ UMV.write v' (index + 3) (value_start - offset0)
+ UMV.write v' (index + 4) value_len
+ , endOpenF = \_ -> return ()
+ , textF = \(PS _ text_start text_len) -> do
+ let tag = 0x01
+ index <- readRef sizeRef
+ v' <-
+ do v <- readSTRef vecRef
+ if index + 3 < UMV.length v
+ then pure v
+ else do
+ v' <- UMV.grow v (UMV.length v)
+ writeSTRef vecRef v'
+ return v'
+ do writeRef sizeRef (index + 3)
+ do UMV.write v' index tag
+ UMV.write v' (index + 1) (text_start - offset0)
+ UMV.write v' (index + 2) text_len
+ , closeF = \closeTag@(PS s _ _) -> do
+ v <- readSTRef vecRef
+ -- Set the tag_end slot of the parent.
+ index <- readRef sizeRef
+ untilM $ do
+ parent <- readRef parentRef
+ correctTag <- if parent == 0
+ then return True -- no more tags to close!!!
+ else do
+ parent_name <- UMV.read v (parent + 2)
+ parent_len <- UMV.read v (parent + 3)
+ let openTag = PS s (parent_name+offset0) parent_len
+ return $ openTag == closeTag
+ UMV.write v (parent + 4) index
+ -- Pop the stack and return to the parent element.
+ previousParent <- UMV.read v (parent + 1)
+ writeRef parentRef previousParent
+ return correctTag -- continue closing tags, until matching one is found
+ , cdataF = \(PS _ cdata_start cdata_len) -> do
+ let tag = 0x03
+ index <- readRef sizeRef
+ v' <-
+ do v <- readSTRef vecRef
+ if index + 3 < UMV.length v
+ then pure v
+ else do
+ v' <- UMV.grow v (UMV.length v)
+ writeSTRef vecRef v'
+ return v'
+ do writeRef sizeRef (index + 3)
+ do UMV.write v' index tag
+ UMV.write v' (index + 1) (cdata_start - offset0)
+ UMV.write v' (index + 2) cdata_len
+ } str
+ wet <- readSTRef vecRef
+ arr <- UV.unsafeFreeze wet
+ size <- readRef sizeRef
+ return (UV.unsafeSlice 0 size arr))
+
+untilM :: Monad m => m Bool -> m ()
+untilM loop = do
+ cond <- loop
+ case cond of
+ True -> return ()
+ False -> untilM loop
+
diff --git a/src/Xeno/Errors.hs b/src/Xeno/Errors.hs
index 0920eb2..027be5b 100644
--- a/src/Xeno/Errors.hs
+++ b/src/Xeno/Errors.hs
@@ -1,6 +1,7 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Strict #-}
{-# LANGUAGE ViewPatterns #-}
+{-# LANGUAGE CPP #-}
-- | Simplifies raising and presenting localized exceptions to the user.
module Xeno.Errors(printExceptions
,displayException
diff --git a/src/Xeno/SAX.hs b/src/Xeno/SAX.hs
index fe2e5d4..443d09a 100644
--- a/src/Xeno/SAX.hs
+++ b/src/Xeno/SAX.hs
@@ -1,29 +1,83 @@
-{-# LANGUAGE MultiWayIf #-}
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE BinaryLiterals #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE MultiWayIf #-}
+{-# LANGUAGE NamedFieldPuns #-}
+{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
-{-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE BangPatterns #-}
-- | SAX parser and API for XML.
module Xeno.SAX
( process
+ , Process(..)
+ , StringLike(..)
, fold
, validate
+ , validateEx
, dump
+ , skipDoctype
) where
import Control.Exception
+import Control.Monad.ST
import Control.Monad.State.Strict
import Control.Spork
+import Data.Bits
import Data.ByteString (ByteString)
import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
import qualified Data.ByteString.Unsafe as SU
+import Data.Char(isSpace)
import Data.Functor.Identity
-import Data.Monoid
+import Data.Semigroup
+import Data.STRef
import Data.Word
import Xeno.Types
+
+class StringLike str where
+ s_index' :: str -> Int -> Word8
+ elemIndexFrom' :: Word8 -> str -> Int -> Maybe Int
+ drop' :: Int -> str -> str
+ substring' :: str -> Int -> Int -> ByteString
+ toBS :: str -> ByteString
+
+instance StringLike ByteString where
+ s_index' = s_index
+ {-# INLINE s_index' #-}
+ elemIndexFrom' = elemIndexFrom
+ {-# INLINE elemIndexFrom' #-}
+ drop' = S.drop
+ {-# INLINE drop' #-}
+ substring' = substring
+ {-# INLINE substring' #-}
+ toBS = id
+ {-# INLINE toBS #-}
+
+instance StringLike ByteStringZeroTerminated where
+ s_index' (BSZT ps) n = ps `SU.unsafeIndex` n
+ {-# INLINE s_index' #-}
+ elemIndexFrom' w (BSZT bs) i = elemIndexFrom w bs i
+ {-# INLINE elemIndexFrom' #-}
+ drop' i (BSZT bs) = BSZT $ S.drop i bs
+ {-# INLINE drop' #-}
+ substring' (BSZT bs) s t = substring bs s t
+ {-# INLINE substring' #-}
+ toBS (BSZT bs) = bs
+ {-# INLINE toBS #-}
+
+-- | Parameters to the 'process' function
+data Process a =
+ Process {
+ openF :: !(ByteString -> a) -- ^ Open tag.
+ , attrF :: !(ByteString -> ByteString -> a) -- ^ Tag attribute.
+ , endOpenF :: !(ByteString -> a) -- ^ End open tag.
+ , textF :: !(ByteString -> a) -- ^ Text.
+ , closeF :: !(ByteString -> a) -- ^ Close tag.
+ , cdataF :: !(ByteString -> a) -- ^ CDATA.
+ }
+
--------------------------------------------------------------------------------
-- Helpful interfaces to the parser
@@ -37,46 +91,89 @@ import Xeno.Types
--
-- > > validate "<b"
-- > False
-validate :: ByteString -> Bool
+validate :: (StringLike str) => str -> Bool
validate s =
case spork
(runIdentity
(process
- (\_ -> pure ())
- (\_ _ -> pure ())
- (\_ -> pure ())
- (\_ -> pure ())
- (\_ -> pure ())
- (\_ -> pure ())
+ Process {
+ openF = \_ -> pure ()
+ , attrF = \_ _ -> pure ()
+ , endOpenF = \_ -> pure ()
+ , textF = \_ -> pure ()
+ , closeF = \_ -> pure ()
+ , cdataF = \_ -> pure ()
+ }
s)) of
Left (_ :: XenoException) -> False
Right _ -> True
+-- It must be inlined or specialised to ByteString/ByteStringZeroTerminated
+{-# INLINE validate #-}
+{-# SPECIALISE validate :: ByteString -> Bool #-}
+{-# SPECIALISE validate :: ByteStringZeroTerminated -> Bool #-}
+
+
+-- | Parse the XML and checks tags nesting.
+--
+validateEx :: (StringLike str) => str -> Bool
+validateEx s =
+ case spork
+ (runST $ do
+ tags <- newSTRef []
+ (process
+ Process {
+ openF = \tag -> modifySTRef' tags (tag:)
+ , attrF = \_ _ -> pure ()
+ , endOpenF = \_ -> pure ()
+ , textF = \_ -> pure ()
+ , closeF = \tag ->
+ modifySTRef' tags $ \case
+ [] -> fail $ "Unexpected close tag \"" ++ show tag ++ "\""
+ (expectedTag:tags') ->
+ if expectedTag == tag
+ then tags'
+ else fail $ "Unexpected close tag. Expected \"" ++ show expectedTag ++ "\", but got \"" ++ show tag ++ "\""
+ , cdataF = \_ -> pure ()
+ }
+ s)
+ readSTRef tags >>= \case
+ [] -> return ()
+ tags' -> fail $ "Not all tags closed: " ++ show tags'
+ ) of
+ Left (_ :: XenoException) -> False
+ Right _ -> True
+{-# INLINE validateEx #-}
+{-# SPECIALISE validateEx :: ByteString -> Bool #-}
+{-# SPECIALISE validateEx :: ByteStringZeroTerminated -> Bool #-}
+
-- | Parse the XML and pretty print it to stdout.
dump :: ByteString -> IO ()
dump str =
evalStateT
(process
- (\name -> do
+ Process {
+ openF = \name -> do
level <- get
- lift (S8.putStr (S8.replicate level ' ' <> "<" <> name <> "")))
- (\key value -> lift (S8.putStr (" " <> key <> "=\"" <> value <> "\"")))
- (\_ -> do
+ lift (S8.putStr (S8.replicate level ' ' <> "<" <> name <> ""))
+ , attrF = \key value -> lift (S8.putStr (" " <> key <> "=\"" <> value <> "\""))
+ , endOpenF = \_ -> do
level <- get
let !level' = level + 2
put level'
- lift (S8.putStrLn (">")))
- (\text -> do
+ lift (S8.putStrLn (">"))
+ , textF = \text -> do
level <- get
- lift (S8.putStrLn (S8.replicate level ' ' <> S8.pack (show text))))
- (\name -> do
+ lift (S8.putStrLn (S8.replicate level ' ' <> S8.pack (show text)))
+ , closeF = \name -> do
level <- get
let !level' = level - 2
put level'
- lift (S8.putStrLn (S8.replicate level' ' ' <> "</" <> name <> ">")))
- (\cdata -> do
+ lift (S8.putStrLn (S8.replicate level' ' ' <> "</" <> name <> ">"))
+ , cdataF = \cdata -> do
level <- get
- lift (S8.putStrLn (S8.replicate level ' ' <> "CDATA: " <> S8.pack (show cdata))))
+ lift (S8.putStrLn (S8.replicate level ' ' <> "CDATA: " <> S8.pack (show cdata)))
+ }
str)
(0 :: Int)
@@ -94,14 +191,14 @@ fold
fold openF attrF endOpenF textF closeF cdataF s str =
spork
(execState
- (process
- (\name -> modify (\s' -> openF s' name))
- (\key value -> modify (\s' -> attrF s' key value))
- (\name -> modify (\s' -> endOpenF s' name))
- (\text -> modify (\s' -> textF s' text))
- (\name -> modify (\s' -> closeF s' name))
- (\cdata -> modify (\s' -> cdataF s' cdata))
- str)
+ (process Process {
+ openF = \name -> modify (\s' -> openF s' name)
+ , attrF = \key value -> modify (\s' -> attrF s' key value)
+ , endOpenF = \name -> modify (\s' -> endOpenF s' name)
+ , textF = \text -> modify (\s' -> textF s' text)
+ , closeF = \name -> modify (\s' -> closeF s' name)
+ , cdataF = \cdata -> modify (\s' -> cdataF s' cdata)
+ } str)
s)
--------------------------------------------------------------------------------
@@ -109,79 +206,75 @@ fold openF attrF endOpenF textF closeF cdataF s str =
-- | Process events with callbacks in the XML input.
process
- :: Monad m
- => (ByteString -> m ()) -- ^ Open tag.
- -> (ByteString -> ByteString -> m ()) -- ^ Tag attribute.
- -> (ByteString -> m ()) -- ^ End open tag.
- -> (ByteString -> m ()) -- ^ Text.
- -> (ByteString -> m ()) -- ^ Close tag.
- -> (ByteString -> m ()) -- ^ CDATA.
- -> ByteString -> m ()
-process openF attrF endOpenF textF closeF cdataF str = findLT 0
+ :: (Monad m, StringLike str)
+ => Process (m ())
+ -> str
+ -> m ()
+process !(Process {openF, attrF, endOpenF, textF, closeF, cdataF}) str = findLT 0
where
findLT index =
- case elemIndexFrom openTagChar str index of
+ case elemIndexFrom' openTagChar str index of
Nothing -> unless (S.null text) (textF text)
- where text = S.drop index str
+ where text = toBS $ drop' index str
Just fromLt -> do
unless (S.null text) (textF text)
checkOpenComment (fromLt + 1)
- where text = substring str index fromLt
+ where text = substring' str index fromLt
-- Find open comment, CDATA or tag name.
checkOpenComment index =
- if | s_index this 0 == bangChar -- !
- && s_index this 1 == commentChar -- -
- && s_index this 2 == commentChar -> -- -
+ if | s_index' this 0 == bangChar -- !
+ && s_index' this 1 == commentChar -- -
+ && s_index' this 2 == commentChar -> -- -
findCommentEnd (index + 3)
- | s_index this 0 == bangChar -- !
- && s_index this 1 == openAngleBracketChar -- [
- && s_index this 2 == 67 -- C
- && s_index this 3 == 68 -- D
- && s_index this 4 == 65 -- A
- && s_index this 5 == 84 -- T
- && s_index this 6 == 65 -- A
- && s_index this 7 == openAngleBracketChar -> -- [
+ | s_index' this 0 == bangChar -- !
+ && s_index' this 1 == openAngleBracketChar -- [
+ && s_index' this 2 == 67 -- C
+ && s_index' this 3 == 68 -- D
+ && s_index' this 4 == 65 -- A
+ && s_index' this 5 == 84 -- T
+ && s_index' this 6 == 65 -- A
+ && s_index' this 7 == openAngleBracketChar -> -- [
findCDataEnd (index + 8) (index + 8)
| otherwise ->
findTagName index
where
- this = S.drop index str
+ this = drop' index str
findCommentEnd index =
- case elemIndexFrom commentChar str index of
+ case elemIndexFrom' commentChar str index of
Nothing -> throw $ XenoParseError index "Couldn't find the closing comment dash."
Just fromDash ->
- if s_index this 0 == commentChar && s_index this 1 == closeTagChar
+ if s_index' this 0 == commentChar && s_index' this 1 == closeTagChar
then findLT (fromDash + 2)
else findCommentEnd (fromDash + 1)
- where this = S.drop index str
+ where this = drop' index str
findCDataEnd cdata_start index =
- case elemIndexFrom closeAngleBracketChar str index of
+ case elemIndexFrom' closeAngleBracketChar str index of
Nothing -> throw $ XenoParseError index "Couldn't find closing angle bracket for CDATA."
Just fromCloseAngleBracket ->
- if s_index str (fromCloseAngleBracket + 1) == closeAngleBracketChar
+ if s_index' str (fromCloseAngleBracket + 1) == closeAngleBracketChar
then do
- cdataF (substring str cdata_start fromCloseAngleBracket)
+ cdataF (substring' str cdata_start fromCloseAngleBracket)
findLT (fromCloseAngleBracket + 3) -- Start after ]]>
else
-- We only found one ], that means that we need to keep searching.
findCDataEnd cdata_start (fromCloseAngleBracket + 1)
findTagName index0 =
let spaceOrCloseTag = parseName str index
- in if | s_index str index0 == questionChar ->
- case elemIndexFrom closeTagChar str spaceOrCloseTag of
+ in if | s_index' str index0 == questionChar ->
+ case elemIndexFrom' closeTagChar str spaceOrCloseTag of
Nothing -> throw $ XenoParseError index "Couldn't find the end of the tag."
Just fromGt -> do
findLT (fromGt + 1)
- | s_index str spaceOrCloseTag == closeTagChar ->
- do let tagname = substring str index spaceOrCloseTag
- if s_index str index0 == slashChar
+ | s_index' str spaceOrCloseTag == closeTagChar ->
+ do let tagname = substring' str index spaceOrCloseTag
+ if s_index' str index0 == slashChar
then closeF tagname
else do
openF tagname
endOpenF tagname
findLT (spaceOrCloseTag + 1)
| otherwise ->
- do let tagname = substring str index spaceOrCloseTag
+ do let tagname = substring' str index spaceOrCloseTag
openF tagname
result <- findAttributes spaceOrCloseTag
endOpenF tagname
@@ -192,22 +285,22 @@ process openF attrF endOpenF textF closeF cdataF str = findLT 0
findLT (closingPair + 2)
where
index =
- if s_index str index0 == slashChar
+ if s_index' str index0 == slashChar
then index0 + 1
else index0
findAttributes index0 =
- if s_index str index == slashChar &&
- s_index str (index + 1) == closeTagChar
+ if s_index' str index == slashChar &&
+ s_index' str (index + 1) == closeTagChar
then pure (Left index)
- else if s_index str index == closeTagChar
+ else if s_index' str index == closeTagChar
then pure (Right index)
else let afterAttrName = parseName str index
- in if s_index str afterAttrName == equalChar
+ in if s_index' str afterAttrName == equalChar
then let quoteIndex = afterAttrName + 1
- usedChar = s_index str quoteIndex
+ usedChar = s_index' str quoteIndex
in if usedChar == quoteChar ||
usedChar == doubleQuoteChar
- then case elemIndexFrom
+ then case elemIndexFrom'
usedChar
str
(quoteIndex + 1) of
@@ -216,33 +309,33 @@ process openF attrF endOpenF textF closeF cdataF str = findLT 0
(XenoParseError index "Couldn't find the matching quote character.")
Just endQuoteIndex -> do
attrF
- (substring str index afterAttrName)
- (substring
+ (substring' str index afterAttrName)
+ (substring'
str
(quoteIndex + 1)
(endQuoteIndex))
findAttributes (endQuoteIndex + 1)
else throw
(XenoParseError index("Expected ' or \", got: " <> S.singleton usedChar))
- else throw (XenoParseError index ("Expected =, got: " <> S.singleton (s_index str afterAttrName) <> " at character index: " <> (S8.pack . show) afterAttrName))
+ else throw (XenoParseError index ("Expected =, got: " <> S.singleton (s_index' str afterAttrName) <> " at character index: " <> (S8.pack . show) afterAttrName))
where
index = skipSpaces str index0
{-# INLINE process #-}
-{-# SPECIALISE process ::
- (ByteString -> Identity ()) ->
- (ByteString -> ByteString -> Identity ()) ->
- (ByteString -> Identity ()) ->
- (ByteString -> Identity ()) ->
- (ByteString -> Identity ()) ->
- (ByteString -> Identity ()) -> ByteString -> Identity ()
+{-# SPECIALISE process :: Process (Identity ()) -> ByteString -> Identity ()
+ #-}
+{-# SPECIALISE process :: Process (State s ()) -> ByteString -> State s ()
+ #-}
+{-# SPECIALISE process :: Process (ST s ()) -> ByteString -> ST s ()
+ #-}
+{-# SPECIALISE process :: Process (IO ()) -> ByteString -> IO ()
+ #-}
+{-# SPECIALISE process :: Process (Identity ()) -> ByteStringZeroTerminated -> Identity ()
+ #-}
+{-# SPECIALISE process :: Process (State s ()) -> ByteStringZeroTerminated -> State s ()
#-}
-{-# SPECIALISE process ::
- (ByteString -> IO ()) ->
- (ByteString -> ByteString -> IO ()) ->
- (ByteString -> IO ()) ->
- (ByteString -> IO ()) ->
- (ByteString -> IO ()) ->
- (ByteString -> IO ()) -> ByteString -> IO ()
+{-# SPECIALISE process :: Process (ST s ()) -> ByteStringZeroTerminated -> ST s ()
+ #-}
+{-# SPECIALISE process :: Process (IO ()) -> ByteStringZeroTerminated -> IO ()
#-}
--------------------------------------------------------------------------------
@@ -253,13 +346,13 @@ s_index :: ByteString -> Int -> Word8
s_index ps n
| n < 0 = throw (XenoStringIndexProblem n ps)
| n >= S.length ps = throw (XenoStringIndexProblem n ps)
- | otherwise = ps `SU.unsafeIndex` n
+ | otherwise = ps `SU.unsafeIndex` n
{-# INLINE s_index #-}
-- | A fast space skipping function.
-skipSpaces :: ByteString -> Int -> Int
+skipSpaces :: (StringLike str) => str -> Int -> Int
skipSpaces str i =
- if isSpaceChar (s_index str i)
+ if isSpaceChar (s_index' str i)
then skipSpaces str (i + 1)
else i
{-# INLINE skipSpaces #-}
@@ -270,16 +363,17 @@ substring s start end = S.take (end - start) (S.drop start s)
{-# INLINE substring #-}
-- | Basically @findIndex (not . isNameChar)@, but doesn't allocate.
-parseName :: ByteString -> Int -> Int
+parseName :: (StringLike str) => str -> Int -> Int
parseName str index =
- if not (isNameChar1 (s_index str index))
+ if not (isNameChar1 (s_index' str index))
then index
else parseName' str (index + 1)
+{-# INLINE parseName #-}
-- | Basically @findIndex (not . isNameChar)@, but doesn't allocate.
-parseName' :: ByteString -> Int -> Int
+parseName' :: (StringLike str) => str -> Int -> Int
parseName' str index =
- if not (isNameChar (s_index str index))
+ if not (isNameChar (s_index' str index))
then index
else parseName' str (index + 1)
{-# INLINE parseName' #-}
@@ -296,7 +390,12 @@ elemIndexFrom c str offset = fmap (+ offset) (S.elemIndex c (S.drop offset str))
-- Character types
isSpaceChar :: Word8 -> Bool
-isSpaceChar c = c == 32 || (c <= 10 && c >= 9) || c == 13
+isSpaceChar = testBit (0b100000000000000000010011000000000 :: Int) . fromIntegral
+-- | | || bits:
+-- | | |+-- 9
+-- | | +--- 10
+-- | +------ 13
+-- +------------------------- 32
{-# INLINE isSpaceChar #-}
-- | Is the character a valid first tag/attribute name constituent?
@@ -306,12 +405,44 @@ isNameChar1 c =
(c >= 97 && c <= 122) || (c >= 65 && c <= 90) || c == 95 || c == 58
{-# INLINE isNameChar1 #-}
+-- isNameCharOriginal :: Word8 -> Bool
+-- isNameCharOriginal c =
+-- (c >= 97 && c <= 122) || (c >= 65 && c <= 90) || c == 95 || c == 58 ||
+-- c == 45 || c == 46 || (c >= 48 && c <= 57)
+-- {-# INLINE isNameCharOriginal #-}
+--
+-- TODO Strangely, highMaskIsNameChar and lowMaskIsNameChar are not computed quickly this way... FIX IT
+-- highMaskIsNameChar, lowMaskIsNameChar :: Word64
+-- (highMaskIsNameChar, lowMaskIsNameChar) =
+-- foldl (\(hi,low) char -> (hi `setBit` (char - 64), low `setBit` char)) -- NB: `setBit` can process overflowed values (where char < 64; -- TODO fix it
+-- (0::Word64, 0::Word64)
+-- (map fromIntegral (filter isNameCharOriginal [0..128]))
+-- {-# INLINE highMaskIsNameChar #-}
+-- {-# INLINE lowMaskIsNameChar #-}
+
-- | Is the character a valid tag/attribute name constituent?
-- isNameChar1 + '-', '.', '0'-'9'
isNameChar :: Word8 -> Bool
-isNameChar c =
- (c >= 97 && c <= 122) || (c >= 65 && c <= 90) || c == 95 || c == 58 ||
- c == 45 || c == 46 || (c >= 48 && c <= 57)
+isNameChar char = (lowMaskIsNameChar `testBit` char'low) || (highMaskIsNameChar `testBit` char'high)
+ -- TODO 1) change code to use W# instead of Word64
+ -- 2) Document `char - 64` -- there is underflow, but `testBit` can process this!
+ where
+ char'low = fromIntegral char
+ char'high = fromIntegral (char - 64)
+ highMaskIsNameChar :: Word64
+ highMaskIsNameChar = 0b11111111111111111111111111010000111111111111111111111111110
+ -- ------------+------------- | ------------+-------------
+ -- | | | bits:
+ -- | | +-- 65-90
+ -- | +------------------- 95
+ -- +---------------------------------- 97-122
+ lowMaskIsNameChar :: Word64
+ lowMaskIsNameChar = 0b11111111111011000000000000000000000000000000000000000000000
+ -- -----+----- ||
+ -- | || bits:
+ -- | |+-- 45
+ -- | +--- 46
+ -- +---------- 48-58
{-# INLINE isNameChar #-}
-- | Char for '\''.
@@ -357,3 +488,14 @@ openAngleBracketChar = 91
-- | Close angle bracket character.
closeAngleBracketChar :: Word8
closeAngleBracketChar = 93
+
+-- | Skip initial DOCTYPE declaration
+skipDoctype :: ByteString -> ByteString
+skipDoctype arg =
+ if "<!DOCTYPE" `S8.isPrefixOf` bs
+ then let (_, rest)=">" `S8.breakSubstring` bs
+ in skipBlanks $ S8.drop 1 rest
+ else bs
+ where
+ bs = skipBlanks arg
+ skipBlanks = S8.dropWhile isSpace
diff --git a/src/Xeno/Types.hs b/src/Xeno/Types.hs
index eab1d49..312ad2c 100644
--- a/src/Xeno/Types.hs
+++ b/src/Xeno/Types.hs
@@ -1,5 +1,8 @@
-{-# LANGUAGE DeriveGeneric #-}
-{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE DeriveDataTypeable #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE FlexibleInstances #-}
-- | Shared types.
@@ -7,15 +10,26 @@ module Xeno.Types where
import Control.DeepSeq
import Control.Exception
-import Data.ByteString (ByteString)
-import Data.Typeable
+import Data.ByteString.Char8 (ByteString, pack)
+import Data.Data
import GHC.Generics
+#if MIN_VERSION_base(4,9,0)
+import Control.Monad.Fail
+
+-- It is recommended to use the more specific `failHere` instead
+instance MonadFail (Either Xeno.Types.XenoException) where
+ fail = Left . XenoParseError 0 . pack
+#endif
+
data XenoException
= XenoStringIndexProblem { stringIndex :: Int, inputString :: ByteString }
| XenoParseError { inputIndex :: Int, message :: ByteString }
| XenoExpectRootNode
- deriving (Show, Typeable, NFData, Generic)
+ deriving (Show, Data, Typeable, NFData, Generic)
instance Exception XenoException where displayException = show
+-- | ByteString which is guaranteed to have '\NUL' at the end
+newtype ByteStringZeroTerminated = BSZT ByteString deriving (Generic, NFData)
+
diff --git a/test/Main.hs b/test/Main.hs
index 8fb0bbd..6d6e224 100644
--- a/test/Main.hs
+++ b/test/Main.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
-- | Simple test suite.
@@ -8,17 +9,35 @@ import Data.Either (isRight)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import Test.Hspec
-import Xeno.SAX (validate)
-import Xeno.DOM (Content(..), parse, name, contents, attributes, children)
+import Xeno.SAX (validate, skipDoctype)
+import Xeno.DOM (Node, Content(..), parse, name, contents, attributes, children)
+import qualified Xeno.DOM.Robust as RDOM
import Xeno.Types
-
+import qualified Debug.Trace as Debug(trace)
main :: IO ()
main = hspec spec
+
spec :: SpecWith ()
spec = do
describe "Xeno.DOM tests" $ do
+ it "test 1" $ do
+ xml <- BS.readFile "data/books-4kb.xml"
+ let (Right dom) = parse xml
+ (name dom) `shouldBe` "catalog"
+ (length $ contents dom) `shouldBe` 25
+ (length $ children dom) `shouldBe` 12
+ (length $ allChildrens dom) `shouldBe` 84
+ (length $ concatMap attributes $ allChildrens dom) `shouldBe` 12
+ (concatMap attributes $ allChildrens dom) `shouldBe`
+ [("id","bk101"),("id","bk102"),("id","bk103"),("id","bk104")
+ ,("id","bk105"),("id","bk106"),("id","bk107"),("id","bk108")
+ ,("id","bk109"),("id","bk110"),("id","bk111"),("id","bk112")]
+ (map name $ allChildrens dom) `shouldBe`
+ (replicate 12 "book" ++ (concat $
+ replicate 12 ["author","title","genre","price","publish_date","description"]))
+ describe "Xeno.DOM tests" $ do
it "DOM from bytestring substring" $ do
let substr = BS.drop 5 "5<8& <valid>xml<here/></valid>"
parsedRoot = fromRightE $ parse substr
@@ -30,10 +49,10 @@ spec = do
let doc =
parse
"<root><test id=\"1\" extra=\"2\" />\n<test id=\"2\" /><b><test id=\"3\" /></b><test id=\"4\" /><test /></root>"
-
+
it "children test" $
map name (children $ fromRightE doc) `shouldBe` ["test", "test", "b", "test", "test"]
-
+
it "attributes" $
attributes (head (children $ fromRightE doc)) `shouldBe` [("id", "1"), ("extra", "2")]
@@ -41,7 +60,7 @@ spec = do
let docWithPrologue = "<?xml version=\"1.1\"?>\n<greeting>Hello, world!</greeting>"
parsedRoot = fromRightE $ Xeno.DOM.parse docWithPrologue
name parsedRoot `shouldBe` "greeting"
-
+
describe
"hexml tests"
(do mapM_
@@ -50,13 +69,73 @@ spec = do
mapM_
(\(v, i) -> it (show i) (shouldBe (either (Left . show) (Right . id) (contents <$> parse i)) v))
cdata_tests
-
+
-- If this works without crashing we're happy.
- let nsdoc = "<ns:tag os:attr=\"Namespaced attribute value\">Content.</ns:tag>"
+ let nsdoc = ("<ns:tag os:attr=\"Namespaced attribute value\">Content.</ns:tag>" :: ByteString)
it
"namespaces" $
validate nsdoc `shouldBe` True
)
+ describe "robust XML tests" $ do
+ it "DOM from bytestring substring" $ do
+ let substr = BS.drop 5 "5<8& <valid>xml<here/></valid>"
+ parsedRoot = fromRightE $ RDOM.parse substr
+ name parsedRoot `shouldBe` "valid"
+
+ it "Leading whitespace characters are accepted by parse" $
+ isRight (RDOM.parse "\n<a></a>") `shouldBe` True
+
+ let doc =
+ RDOM.parse
+ "<root><test id=\"1\" extra=\"2\" />\n<test id=\"2\" /><b><test id=\"3\" /></b><test id=\"4\" /><test /></root>"
+
+ it "children test" $
+ map name (children $ fromRightE doc) `shouldBe` ["test", "test", "b", "test", "test"]
+
+ it "attributes" $
+ attributes (head (children $ fromRightE doc)) `shouldBe` [("id", "1"), ("extra", "2")]
+
+ it "xml prologue test" $ do
+ let docWithPrologue = "<?xml version=\"1.1\"?>\n<greeting>Hello, world!</greeting>"
+ parsedRoot = fromRightE $ RDOM.parse docWithPrologue
+ name parsedRoot `shouldBe` "greeting"
+ it "html doctype test" $ do
+ let docWithPrologue = "<!DOCTYPE html>\n<greeting>Hello, world!</greeting>"
+ parsedRoot = fromRightE $ RDOM.parse docWithPrologue
+ name parsedRoot `shouldBe` "greeting"
+
+ describe
+ "hexml tests"
+ (do mapM_
+ (\(v, i) -> it (show i) (shouldBe (validate i) v))
+ (hexml_examples_sax ++ extra_examples_sax)
+ mapM_
+ (\(v, i) -> it (show i) (shouldBe (either (Left . show) (Right . id) (contents <$> parse i)) v))
+ cdata_tests
+
+ -- If this works without crashing we're happy.
+ let nsdoc = ("<ns:tag os:attr=\"Namespaced attribute value\">Content.</ns:tag>" :: ByteString)
+ it
+ "namespaces" $
+ validate nsdoc `shouldBe` True
+ )
+ it "recovers unclosed tag" $ do
+ let parsed = RDOM.parse "<a attr='a'><img></a>"
+ Debug.trace (show parsed) $ do
+ name (fromRightE parsed) `shouldBe` "a"
+ RDOM.attributes (fromRightE parsed) `shouldBe` [("attr", "a")]
+ map name (RDOM.children $ fromRightE parsed) `shouldBe` ["img"]
+ it "ignores too many closing tags" $ do
+ let parsed = RDOM.parse "<a></a></b></c>"
+ isRight parsed `shouldBe` True
+ describe "skipDoctype" $ do
+ it "strips initial doctype declaration" $ do
+ skipDoctype "<!DOCTYPE html><?xml version=\"1.0\" encoding=\"UTF-8\"?>Hello" `shouldBe` "<?xml version=\"1.0\" encoding=\"UTF-8\"?>Hello"
+ it "strips doctype after spaces" $ do
+ skipDoctype " \n<!DOCTYPE html><?xml version=\"1.0\" encoding=\"UTF-8\"?>Hello" `shouldBe` "<?xml version=\"1.0\" encoding=\"UTF-8\"?>Hello"
+ it "does not strip anything after or inside element" $ do
+ let insideElt = "<xml><?xml version=\"1.0\" encoding=\"UTF-8\"?>Hello</xml>"
+ skipDoctype insideElt `shouldBe` insideElt
hexml_examples_sax :: [(Bool, ByteString)]
hexml_examples_sax =
@@ -92,9 +171,18 @@ cdata_tests =
fromRightE :: Either XenoException a -> a
fromRightE = either (error . show) id
-
mapLeft :: Applicative f => (a -> f b) -> Either a b -> f b
mapLeft f = either f pure
mapRight :: Applicative f => (b -> f a) -> Either a b -> f a
mapRight = either pure
+
+allChildrens :: Node -> [Node]
+allChildrens n = allChildrens' [n]
+ where
+ allChildrens' :: [Node] -> [Node]
+ allChildrens' [] = []
+ allChildrens' ns =
+ let nextNodes = concatMap children ns
+ in nextNodes ++ (allChildrens' nextNodes)
+
diff --git a/xeno.cabal b/xeno.cabal
index 1841870..cfe5d78 100644
--- a/xeno.cabal
+++ b/xeno.cabal
@@ -1,5 +1,5 @@
name: xeno
-version: 0.3.5.2
+version: 0.4
synopsis: A fast event-based XML parser in pure Haskell
description: A fast, low-memory use, event-based XML parser in pure Haskell.
build-type: Simple
@@ -26,7 +26,7 @@ flag libxml2
library
hs-source-dirs: src
ghc-options: -Wall -O2
- exposed-modules: Xeno.SAX, Xeno.DOM, Xeno.Types, Xeno.Errors
+ exposed-modules: Xeno.SAX, Xeno.DOM, Xeno.DOM.Internal, Xeno.DOM.Robust, Xeno.Types, Xeno.Errors
other-modules: Control.Spork
build-depends: base >= 4.7 && < 5
, bytestring >= 0.10.8
@@ -55,7 +55,7 @@ benchmark xeno-speed-bench
build-depends: base, xeno, hexml, criterion, bytestring, deepseq, ghc-prim, xml, hexpat
if flag(libxml2)
build-depends: libxml
- ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N -O2
+ ghc-options: -Wall -rtsopts -O2
if flag(libxml2)
cpp-options: -DLIBXML2
-- ghc-options: -DLIBXML2 -- Hackage started complaining about this
@@ -66,5 +66,24 @@ benchmark xeno-memory-bench
hs-source-dirs: bench
main-is: Memory.hs
build-depends: base, xeno, weigh, bytestring, deepseq, hexml
- ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N -O2
+ ghc-options: -Wall -threaded -O2 -rtsopts -with-rtsopts=-N
+ default-language: Haskell2010
+
+benchmark xeno-speed-big-files-bench
+ type: exitcode-stdio-1.0
+ hs-source-dirs: bench
+ main-is: SpeedBigFiles.hs
+ build-depends: base, xeno, hexml, criterion, bytestring, deepseq, ghc-prim, xml, hexpat, bzlib, filepath
+ if flag(libxml2)
+ build-depends: libxml
+ ghc-options: -Wall -O2 -rtsopts "-with-rtsopts=-H8G -AL1G -A256m -M25G"
+ if flag(libxml2)
+ cpp-options: -DLIBXML2
+ default-language: Haskell2010
+
+executable xeno-bench
+ main-is: Bench.hs
+ hs-source-dirs: app
+ build-depends: base, xeno, weigh, bytestring, deepseq, hexml, bytestring-mmap, time
+ ghc-options: -O2 -threaded -rtsopts "-with-rtsopts=-N"
default-language: Haskell2010