1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
|
{-# OPTIONS_GHC -fno-warn-missing-signatures #-}
{-# LANGUAGE OverloadedLists #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE TypeFamilies #-}
-- | Streaming parser for the OPML 2.0 standard.
--
-- The parser tries to be as lenient as possible. All functions may throw an 'OpmlException'.
module Text.OPML.Conduit.Parse
( -- * Parsers
parseOpml
, parseOpmlHead
, parseOpmlOutline
-- * Exceptions
, OpmlException(..)
) where
-- {{{ Imports
import Conduit hiding (throwM)
import Control.Applicative hiding (many)
import Control.Exception.Safe as Exception
import Control.Monad
import Control.Monad.Fix
import Data.CaseInsensitive hiding (map)
import Data.Either
import Data.List.NonEmpty (NonEmpty, nonEmpty)
import Data.Maybe
import Data.Monoid
import Data.Monoid.Textual hiding (map)
import Data.Text as Text (Text, null, strip,
unpack)
import Data.Text.Encoding
import Data.Time.Clock
import Data.Time.LocalTime
import Data.Time.RFC822
import Data.Tree
import Data.Version
import Data.XML.Types
import Lens.Simple
import Numeric
import Prelude hiding (last)
import Refined hiding (NonEmpty)
import Text.OPML.Types
import Text.ParserCombinators.ReadP (readP_to_S)
import Text.XML.Stream.Parse
import URI.ByteString
-- }}}
-- | Errors raised while interpreting the textual values found in an OPML document.
--
-- Constructors that carry a 'Text' hold the offending input verbatim.
data OpmlException
  = MissingText             -- ^ An outline is missing the @text@ attribute.
  | InvalidBool Text        -- ^ The value is not a recognised boolean.
  | InvalidDecimal Text     -- ^ The value is not a decimal integer.
  | InvalidTime Text        -- ^ The value is not a valid timestamp.
  | InvalidURI URIParseError -- ^ The value could not be parsed as a URI.
  | InvalidVersion Text     -- ^ The value is not a valid version number.
  deriving (Eq, Show)
-- | Human-readable rendering of each 'OpmlException'.
instance Exception OpmlException where
  displayException err =
    case err of
      MissingText      -> "An outline is missing the 'text' attribute."
      InvalidBool t    -> describe "Invalid boolean: " t
      InvalidDecimal t -> describe "Invalid decimal: " t
      InvalidURI e     -> "Invalid URI: " ++ show e
      InvalidTime t    -> describe "Invalid time: " t
      InvalidVersion t -> describe "Invalid version: " t
    where
      -- Prefix the offending input with a fixed label.
      describe label offending = label ++ unpack offending
-- | Interpret a text value as a 'URI', using 'laxURIParserOptions' on its UTF-8 encoding.
--
-- Throws 'InvalidURI' with the underlying parse error on failure.
asURI :: (MonadThrow m) => Text -> m URI
asURI raw =
  case parseURI laxURIParserOptions (encodeUtf8 raw) of
    Left failure -> throwM (InvalidURI failure)
    Right uri    -> return uri
-- | Interpret a text value as a 'Version'.
--
-- The whole input must be consumed by 'parseVersion', and exactly one such
-- complete parse must exist; otherwise 'InvalidVersion' is thrown with the
-- original input.
asVersion :: MonadThrow m => Text -> m Version
asVersion v =
  case [parsed | (parsed, "") <- readP_to_S parseVersion (unpack v)] of
    [version] -> return version
    _         -> throwM (InvalidVersion v)
-- | Interpret a text value as a (possibly negative) decimal integer.
--
-- Surrounding whitespace is stripped before parsing, so that element content
-- such as @\"61 \"@ or @\"\\n 61\\n\"@ is accepted rather than rejected because
-- of leftover trailing input.
--
-- Throws 'InvalidDecimal', carrying the original (unstripped) input, when no
-- parse consumes the whole value.
asDecimal :: (MonadThrow m, Integral a) => Text -> m a
asDecimal t =
  -- Keep only the parses that leave no unconsumed input.
  case filter (Prelude.null . snd) . readSigned readDec . unpack $ strip t of
    (result, _):_ -> return result
    _             -> throwM $ InvalidDecimal t
-- | Interpret a comma-separated list of decimal integers.
--
-- Each item is stripped of surrounding whitespace and empty items are skipped;
-- the remaining items are parsed with 'asDecimal', so the first malformed item
-- makes the whole parse fail.
asExpansionState :: (MonadThrow m, Integral a) => Text -> m [a]
asExpansionState t = mapM asDecimal entries
  where
    entries = [entry | entry <- strip <$> split (== ',') t, not (Text.null entry)]
-- | Interpret a text value as a timestamp via 'parseTimeRFC822', converted to UTC.
--
-- Throws 'InvalidTime' with the original input when parsing fails.
asTime :: (MonadThrow m) => Text -> m UTCTime
asTime t =
  case parseTimeRFC822 t of
    Just zoned -> return (zonedTimeToUTC zoned)
    Nothing    -> throwM (InvalidTime t)
-- | Interpret a text value as a boolean.
--
-- The standard only accepts @true@ and @false@; this parser is more lenient
-- and compares case-insensitively. Any other value throws 'InvalidBool'.
asBool :: (MonadThrow m) => Text -> m Bool
asBool t =
  case mk t of
    "true"  -> return True
    "false" -> return False
    _       -> throwM (InvalidBool t)
-- | Interpret a @category@ attribute value.
--
-- The value is split on @,@ into categories, and each category is split on
-- @/@ into path segments. Segments that fail refinement are dropped, and a
-- category left with no segment at all is dropped entirely.
asCategories :: Text -> [NonEmpty (Refined (Not Null) Text)]
asCategories raw = mapMaybe toPath (split (== ',') raw)
  where
    toPath category = nonEmpty (rights (map refine (split (== '/') category)))
-- | Parse the element matched by @name@, ignoring its attributes, and read its
-- text content with 'asTime'.
dateTag :: (MonadThrow m) => NameMatcher a -> ConduitM Event o m (Maybe UTCTime)
dateTag name = tagIgnoreAttrs name (asTime =<< content)
-- | Parse the element matched by @name@, ignoring its attributes, and read its
-- text content with 'asURI'.
uriTag :: (MonadThrow m) => NameMatcher a -> ConduitM Event o m (Maybe URI)
uriTag name = tagIgnoreAttrs name $ do
  raw <- content
  asURI raw
-- | Parse an @\<expansionState\>@ element, ignoring its attributes, and read its
-- text content with 'asExpansionState'.
expansionStateTag :: (MonadThrow m, Integral a) => ConduitM Event o m (Maybe [a])
expansionStateTag = tagIgnoreAttrs "expansionState" $ do
  raw <- content
  asExpansionState raw
-- | Parse the element matched by @name@, ignoring its attributes, and return
-- its text content unchanged.
textTag :: (MonadThrow m) => NameMatcher a -> ConduitM Event o m (Maybe Text)
textTag name = name `tagIgnoreAttrs` content
-- | Parse the element matched by @name@, ignoring its attributes, and read its
-- text content with 'asDecimal'.
decimalTag :: (Integral i, MonadThrow m) => NameMatcher a -> ConduitM Event o m (Maybe i)
decimalTag name = tagIgnoreAttrs name (asDecimal =<< content)
-- | Forward the part of each upstream value selected by the given fold.
--
-- For every input, the target extracted with '^?' (if any) is yielded
-- downstream; inputs with no target are silently dropped. The conduit ends
-- when upstream is exhausted.
projectC :: Monad m => Fold a a' b b' -> ConduitT a b m ()
projectC prism = awaitForever $ \item -> mapM_ yield (item ^? prism)
-- | One recognised child element of @\<head\>@, tagged by the element it came
-- from. Used as the intermediate stream item while assembling an 'OpmlHead'.
data HeadPiece
  = HeadCreated UTCTime
  | HeadModified UTCTime
  | HeadDocs URI
  | HeadExpansionState [Int]
  | HeadOwnerEmail Text
  | HeadOwnerId URI
  | HeadOwnerName Text
  | HeadTitle Text
  | HeadVertScrollState Int
  | HeadWindowBottom Int
  | HeadWindowLeft Int
  | HeadWindowRight Int
  | HeadWindowTop Int

-- Template Haskell splice providing the per-constructor traversals
-- (@_HeadCreated@, @_HeadTitle@, ...) used with 'projectC'.
makeTraversals ''HeadPiece
-- | Parse the @\<head\>@ section.
-- This function is more lenient than what the standard demands on the following points:
--
-- - each sub-element may be repeated; only the first occurrence is taken into account,
--   except for @\<expansionState\>@, whose occurrences are all concatenated;
-- - each unknown sub-element is ignored;
-- - a missing title, owner name or owner email defaults to 'mempty'.
parseOpmlHead :: (MonadCatch m) => ConduitM Event o m (Maybe OpmlHead)
parseOpmlHead = tagIgnoreAttrs "head" (pieces .| assemble)
  where
    -- Stream of recognised children, followed by a drain of whatever is left.
    pieces = manyYield' (choose knownChildren) <* many ignoreAnyTreeContent

    -- Every field consumes the same stream of pieces and keeps the ones it
    -- is interested in; the order below is the field order of 'OpmlHead'.
    assemble = getZipConduit $
      OpmlHead
        <$> ZipConduit (projectC _HeadTitle .| headDefC mempty)
        <*> ZipConduit (projectC _HeadCreated .| headC)
        <*> ZipConduit (projectC _HeadModified .| headC)
        <*> ZipConduit (projectC _HeadOwnerName .| headDefC mempty)
        <*> ZipConduit (projectC _HeadOwnerEmail .| headDefC mempty)
        <*> ZipConduit (projectC _HeadOwnerId .| headC)
        <*> ZipConduit (projectC _HeadDocs .| headC)
        <*> ZipConduit (projectC _HeadExpansionState .| concatC .| sinkList)
        <*> ZipConduit (projectC _HeadVertScrollState .| headC)
        <*> ZipConduit (projectC _HeadWindowBottom .| headC)
        <*> ZipConduit (projectC _HeadWindowLeft .| headC)
        <*> ZipConduit (projectC _HeadWindowRight .| headC)
        <*> ZipConduit (projectC _HeadWindowTop .| headC)

    knownChildren =
      [ wrap HeadCreated (dateTag "dateCreated")
      , wrap HeadModified (dateTag "dateModified")
      , wrap HeadDocs (uriTag "docs")
      , wrap HeadExpansionState expansionStateTag
      , wrap HeadOwnerEmail (textTag "ownerEmail")
      , wrap HeadOwnerId (uriTag "ownerId")
      , wrap HeadOwnerName (textTag "ownerName")
      , wrap HeadTitle (textTag "title")
      , wrap HeadVertScrollState (decimalTag "vertScrollState")
      , wrap HeadWindowBottom (decimalTag "windowBottom")
      , wrap HeadWindowLeft (decimalTag "windowLeft")
      , wrap HeadWindowRight (decimalTag "windowRight")
      , wrap HeadWindowTop (decimalTag "windowTop")
      ]

    -- Tag the optional result of an element parser with a 'HeadPiece' constructor.
    wrap ctor parser = fmap ctor <$> parser
-- | Parse an @\<outline\>@ element into a tree.
--
-- The value of the @type@ attribute is not case-sensitive, that is
-- @type=\"LINK\"@ has the same meaning as @type=\"link\"@. It selects the kind
-- of outline that is produced:
--
-- - @include@ and @link@ require a @url@ attribute and give an 'OpmlOutlineLink';
-- - @rss@ requires an @xmlUrl@ attribute and gives an 'OpmlOutlineSubscription';
-- - any other value, or no @type@ at all, gives an 'OpmlOutlineGeneric' whose
--   nested outlines are parsed recursively.
--
-- Only generic outlines parse child outlines; the other two kinds are built
-- with an empty list of children.
parseOpmlOutline :: (MonadCatch m) => ConduitM Event o m (Maybe (Tree OpmlOutline))
parseOpmlOutline = tag' "outline" attributes handler
  where
    -- Attributes are read in a fixed order: type, base attributes, the
    -- kind-specific ones, and finally everything else is discarded.
    attributes = do
      otype <- optional (requireAttr "type")
      base <- baseAttr
      (subscription, link) <- case mk <$> otype of
        Just "include" -> linkOnly
        Just "link"    -> linkOnly
        Just "rss"     -> (,Nothing) . Just <$> subscriptionAttr
        _              -> pure (Nothing, Nothing)
      ignoreAttrs
      return (otype, base, subscription, link)

    linkOnly = (Nothing,) . Just <$> linkAttr

    baseAttr =
      (,,,,)
        <$> (requireAttr "text" >>= refineThrow)
        <*> optional (requireAttr "isComment" >>= asBool)
        <*> optional (requireAttr "isBreakpoint" >>= asBool)
        <*> optional (requireAttr "created" >>= asTime)
        <*> optional (asCategories <$> requireAttr "category")

    linkAttr = requireAttr "url"

    subscriptionAttr =
      (,,,,,)
        <$> (requireAttr "xmlUrl" >>= asURI)
        <*> optional (requireAttr "htmlUrl" >>= asURI)
        <*> optional (requireAttr "description")
        <*> optional (requireAttr "language")
        <*> optional (requireAttr "title")
        <*> optional (requireAttr "version")

    -- Subscription data takes precedence over a link, which takes precedence
    -- over the generic fallback.
    handler (otype, base, subscription, link) =
      case (subscription, link) of
        (Just feed, _) ->
          return $ Node (OpmlOutlineSubscription (toBase base) (toSubscription feed)) []
        (_, Just url) -> do
          uri <- asURI url
          return $ Node (OpmlOutlineLink (toBase base) uri) []
        _ ->
          Node (OpmlOutlineGeneric (toBase base) (fromMaybe mempty otype))
            <$> (manyYield' parseOpmlOutline .| sinkList)

    toBase (txt, comment, breakpoint, created, category) =
      OutlineBase txt comment breakpoint created (fromMaybe mempty category)

    toSubscription (uri, html, desc, lang, title, version) =
      OutlineSubscription
        uri
        html
        (fromMaybe mempty desc)
        (fromMaybe mempty lang)
        (fromMaybe mempty title)
        (fromMaybe mempty version)
-- | One recognised child of the top-level @\<opml\>@ element. Used as the
-- intermediate stream item while assembling an 'Opml' document.
data OpmlDocPiece
  = DocHead OpmlHead
  | DocBody [Tree OpmlOutline]

-- Template Haskell splice providing the @_DocHead@ and @_DocBody@ traversals
-- used with 'projectC'.
makeTraversals ''OpmlDocPiece
-- | Parse the top-level @\<opml\>@ element.
--
-- The @version@ attribute is mandatory and read with 'asVersion'; all other
-- attributes are ignored. The children are scanned for @\<head\>@ and
-- @\<body\>@ sections: 'mkOpmlHead' is used when there is no head, and
-- 'mempty' when there is no body.
parseOpml :: (MonadCatch m) => ConduitM Event o m (Maybe Opml)
parseOpml = tag' "opml" versionAttr document
  where
    versionAttr = do
      version <- requireAttr "version" >>= asVersion
      ignoreAttrs
      return version

    document version = sections .| assemble version

    -- Stream of recognised sections, followed by a drain of whatever is left.
    sections = manyYield' (choose [headSection, bodySection]) <* many ignoreAnyTreeContent

    headSection = fmap DocHead <$> parseOpmlHead

    bodySection = fmap DocBody <$> bodyOutlines

    bodyOutlines = tagIgnoreAttrs "body" (manyYield' parseOpmlOutline .| sinkList)

    assemble version = getZipConduit $
      Opml version
        <$> ZipConduit (projectC _DocHead .| headDefC mkOpmlHead)
        <*> ZipConduit (projectC _DocBody .| headDefC mempty)
|