summaryrefslogtreecommitdiff
path: root/Text/OPML/Conduit/Parse.hs
blob: 0194e31e73fef8085ae4fd07f914c45ccae672f9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
{-# OPTIONS_GHC -fno-warn-missing-signatures #-}
{-# LANGUAGE OverloadedLists    #-}
{-# LANGUAGE OverloadedStrings  #-}
{-# LANGUAGE RankNTypes         #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TemplateHaskell    #-}
{-# LANGUAGE TupleSections      #-}
{-# LANGUAGE TypeFamilies       #-}
-- | Streaming parser for the OPML 2.0 standard.
--
-- The parser tries to be as lenient as possible. All functions may throw an 'OpmlException'.
module Text.OPML.Conduit.Parse
  ( -- * Parsers
    parseOpml
  , parseOpmlHead
  , parseOpmlOutline
    -- * Exceptions
  , OpmlException(..)
  ) where

-- {{{ Imports
import           Conduit                      hiding (throwM)
import           Control.Applicative          hiding (many)
import           Control.Exception.Safe       as Exception
import           Control.Monad
import           Control.Monad.Fix
import           Data.CaseInsensitive         hiding (map)
import           Data.Either
import           Data.List.NonEmpty           (NonEmpty, nonEmpty)
import           Data.Maybe
import           Data.Monoid
import           Data.Monoid.Textual          hiding (map)
import           Data.Text                    as Text (Text, null, strip,
                                                       unpack)
import           Data.Text.Encoding
import           Data.Time.Clock
import           Data.Time.LocalTime
import           Data.Time.RFC822
import           Data.Tree
import           Data.Version
import           Data.XML.Types
import           Lens.Simple
import           Numeric
import           Prelude                      hiding (last)
import           Refined                      hiding (NonEmpty)
import           Text.OPML.Types
import           Text.ParserCombinators.ReadP (readP_to_S)
import           Text.XML.Stream.Parse

import           URI.ByteString
-- }}}

-- | Errors raised while interpreting the values found in an OPML document.
data OpmlException
  = -- | An outline is missing the @text@ attribute.
    -- NOTE(review): no call site in this module throws this constructor; confirm it is still needed.
    MissingText
  | -- | The given text is neither @true@ nor @false@ (compared case-insensitively).
    InvalidBool Text
  | -- | The given text could not be read as a decimal integer.
    InvalidDecimal Text
  | -- | The given text could not be parsed as an RFC 822 timestamp.
    InvalidTime Text
  | -- | The URI parser rejected the input; the underlying error is attached.
    InvalidURI URIParseError
  | -- | The given text could not be parsed as a version number.
    InvalidVersion Text
  deriving (Eq, Show)

-- | Human-readable rendering of each 'OpmlException'.
instance Exception OpmlException where
  displayException err = case err of
    MissingText      -> "An outline is missing the 'text' attribute."
    InvalidBool t    -> "Invalid boolean: " ++ unpack t
    InvalidDecimal t -> "Invalid decimal: " ++ unpack t
    InvalidURI e     -> "Invalid URI: " ++ show e
    InvalidTime t    -> "Invalid time: " ++ unpack t
    InvalidVersion t -> "Invalid version: " ++ unpack t

-- | Parse a URI from its UTF-8 encoding using 'laxURIParserOptions'.
-- Throws 'InvalidURI' carrying the parser's error on failure.
asURI :: (MonadThrow m) => Text -> m URI
asURI t = case parseURI laxURIParserOptions (encodeUtf8 t) of
  Left err  -> throwM (InvalidURI err)
  Right uri -> return uri

-- | Read a version number with 'parseVersion'.
-- Succeeds only when exactly one parse consumes the whole input;
-- otherwise throws 'InvalidVersion' with the original text.
asVersion :: MonadThrow m => Text -> m Version
asVersion v = case [version | (version, "") <- readP_to_S parseVersion (unpack v)] of
  [version] -> return version
  _         -> throwM (InvalidVersion v)

-- | Read an optionally signed decimal integer.
--
-- Surrounding whitespace is stripped before reading. Without this, leading
-- whitespace was accepted (the underlying lexer skips it) while trailing
-- whitespace left an unconsumed remainder and was rejected.
--
-- Throws 'InvalidDecimal' with the original, unstripped text when no parse
-- consumes the whole stripped input.
asDecimal :: (MonadThrow m, Integral a) => Text -> m a
asDecimal t = case filter (Prelude.null . snd) . readSigned readDec . unpack $ strip t of
  (result, _):_ -> return result
  _             -> throwM $ InvalidDecimal t

-- | Read a comma-separated list of decimal integers.
-- Each entry is stripped of surrounding whitespace and empty entries are skipped;
-- any remaining entry that 'asDecimal' rejects makes the whole parse fail.
asExpansionState :: (MonadThrow m, Integral a) => Text -> m [a]
asExpansionState t = mapM asDecimal entries
  where
    entries = filter (not . Text.null) (strip <$> split (== ',') t)

-- | Parse an RFC 822 timestamp and convert it to UTC.
-- Throws 'InvalidTime' with the original text when parsing fails.
asTime :: (MonadThrow m) => Text -> m UTCTime
asTime t = case parseTimeRFC822 t of
  Just zoned -> return (zonedTimeToUTC zoned)
  Nothing    -> throwM (InvalidTime t)

-- | Read a boolean.
-- The standard only accepts @true@ and @false@; this parser is more lenient
-- and compares case-insensitively. Anything else throws 'InvalidBool'.
asBool :: (MonadThrow m) => Text -> m Bool
asBool t = case mk t of
  "true"  -> return True
  "false" -> return False
  _       -> throwM (InvalidBool t)

-- | Split a @category@ attribute into its categories.
-- Categories are separated by @,@ and each category is a @/@-separated path.
-- Path segments that fail refinement are dropped, and a category left with
-- no segments is dropped altogether.
asCategories :: Text -> [NonEmpty (Refined (Not Null) Text)]
asCategories raw = mapMaybe toCategory (split (== ',') raw)
  where
    toCategory entry = nonEmpty (rights (map refine (split (== '/') entry)))

-- | Parse the element matched by @name@, ignoring its attributes,
-- and read its text content as a timestamp with 'asTime'.
dateTag :: (MonadThrow m) => NameMatcher a -> ConduitM Event o m (Maybe UTCTime)
dateTag name = tagIgnoreAttrs name (asTime =<< content)

-- | Parse the element matched by @name@, ignoring its attributes,
-- and read its text content as a URI with 'asURI'.
uriTag :: (MonadThrow m) => NameMatcher a -> ConduitM Event o m (Maybe URI)
uriTag name = tagIgnoreAttrs name (asURI =<< content)

-- | Parse an @\<expansionState\>@ element, ignoring its attributes,
-- and read its text content with 'asExpansionState'.
expansionStateTag :: (MonadThrow m, Integral a) => ConduitM Event o m (Maybe [a])
expansionStateTag = tagIgnoreAttrs "expansionState" (asExpansionState =<< content)

-- | Parse the element matched by the given name, ignoring its attributes,
-- and return its text content unchanged.
textTag :: (MonadThrow m) => NameMatcher a -> ConduitM Event o m (Maybe Text)
textTag = flip tagIgnoreAttrs content

-- | Parse the element matched by @name@, ignoring its attributes,
-- and read its text content as an integer with 'asDecimal'.
decimalTag :: (Integral i, MonadThrow m) => NameMatcher a -> ConduitM Event o m (Maybe i)
decimalTag name = tagIgnoreAttrs name (asDecimal =<< content)

-- | Filter-and-map conduit driven by a fold.
-- For every upstream item, yield the first target of the fold if there is one
-- and drop the item otherwise. Stops when upstream is exhausted.
projectC :: Monad m => Fold a a' b b' -> ConduitT a b m ()
projectC selector = loop
  where
    loop = do
      next <- await
      case next of
        Nothing   -> return ()
        Just item -> do
          case item ^? selector of
            Just hit -> yield hit
            Nothing  -> return ()
          loop


-- Intermediate representation of one parsed child element of @<head>@.
-- 'parseOpmlHead' emits a stream of these and then projects each constructor
-- onto the corresponding field of the final record.
data HeadPiece = HeadCreated UTCTime         -- from <dateCreated>
               | HeadModified UTCTime        -- from <dateModified>
               | HeadDocs URI                -- from <docs>
               | HeadExpansionState [Int]    -- from <expansionState>
               | HeadOwnerEmail Text         -- from <ownerEmail>
               | HeadOwnerId URI             -- from <ownerId>
               | HeadOwnerName Text          -- from <ownerName>
               | HeadTitle Text              -- from <title>
               | HeadVertScrollState Int     -- from <vertScrollState>
               | HeadWindowBottom Int        -- from <windowBottom>
               | HeadWindowLeft Int          -- from <windowLeft>
               | HeadWindowRight Int         -- from <windowRight>
               | HeadWindowTop Int           -- from <windowTop>

-- Generates one traversal per constructor (@_HeadCreated@, @_HeadTitle@, ...),
-- which 'parseOpmlHead' feeds to 'projectC'.
makeTraversals ''HeadPiece


-- | Parse the @\<head\>@ section.
-- This function is more lenient than what the standard demands on the following points:
--
-- - each sub-element may be repeated, in which case only the first occurrence is taken into account
--   (except @\<expansionState\>@, whose occurrences are all concatenated);
-- - each unknown sub-element is ignored.
--
-- A missing @\<title\>@, @\<ownerName\>@ or @\<ownerEmail\>@ defaults to 'mempty'.
-- Attributes on @\<head\>@ and on its recognised sub-elements are ignored.
parseOpmlHead :: (MonadCatch m) => ConduitM Event o m (Maybe OpmlHead)
parseOpmlHead = tagIgnoreAttrs "head" $ (manyYield' (choose piece) <* many ignoreAnyTreeContent) .| zipConduit where
  -- Fan the stream of 'HeadPiece's out to one sink per field.
  -- The sinks are applied to 'OpmlHead' positionally, so their order here must
  -- match the constructor's field order (defined in "Text.OPML.Types").
  zipConduit = getZipConduit $ OpmlHead
    <$> ZipConduit (projectC _HeadTitle .| headDefC mempty)
    <*> ZipConduit (projectC _HeadCreated .| headC)
    <*> ZipConduit (projectC _HeadModified .| headC)
    <*> ZipConduit (projectC _HeadOwnerName .| headDefC mempty)
    <*> ZipConduit (projectC _HeadOwnerEmail .| headDefC mempty)
    <*> ZipConduit (projectC _HeadOwnerId .| headC)
    <*> ZipConduit (projectC _HeadDocs .| headC)
    <*> ZipConduit (projectC _HeadExpansionState .| concatC .| sinkList)
    <*> ZipConduit (projectC _HeadVertScrollState .| headC)
    <*> ZipConduit (projectC _HeadWindowBottom .| headC)
    <*> ZipConduit (projectC _HeadWindowLeft .| headC)
    <*> ZipConduit (projectC _HeadWindowRight .| headC)
    <*> ZipConduit (projectC _HeadWindowTop .| headC)
  -- Recognised sub-elements, each wrapped in its 'HeadPiece' constructor.
  piece = [ fmap HeadCreated <$> dateTag "dateCreated"
          , fmap HeadModified <$> dateTag "dateModified"
          , fmap HeadDocs <$> uriTag "docs"
          , fmap HeadExpansionState <$> expansionStateTag
          , fmap HeadOwnerEmail <$> textTag "ownerEmail"
          , fmap HeadOwnerId <$> uriTag "ownerId"
          , fmap HeadOwnerName <$> textTag "ownerName"
          , fmap HeadTitle <$> textTag "title"
          , fmap HeadVertScrollState <$> decimalTag "vertScrollState"
          , fmap HeadWindowBottom <$> decimalTag "windowBottom"
          , fmap HeadWindowLeft <$> decimalTag "windowLeft"
          , fmap HeadWindowRight <$> decimalTag "windowRight"
          , fmap HeadWindowTop <$> decimalTag "windowTop"
          ]


-- | Parse an @\<outline\>@ section.
-- The value of the @type@ attribute is not case-sensitive, that is @type=\"LINK\"@ has the same meaning as @type=\"link\"@.
--
-- The @type@ attribute selects the kind of outline produced:
--
-- - @include@ and @link@ require a @url@ attribute and produce an 'OpmlOutlineLink';
-- - @rss@ requires an @xmlUrl@ attribute and produces an 'OpmlOutlineSubscription';
-- - anything else (or no @type@) produces an 'OpmlOutlineGeneric', whose nested
--   @\<outline\>@ elements are parsed recursively as its children.
--
-- Link and subscription outlines are always returned with an empty child list.
-- The @text@ attribute is required in every case; attributes not listed here are ignored.
parseOpmlOutline :: (MonadCatch m) => ConduitM Event o m (Maybe (Tree OpmlOutline))
parseOpmlOutline = tag' "outline" attributes handler where
  -- Yields a 4-tuple: (raw type, base attributes, subscription attributes, link url).
  -- At most one of the last two components is 'Just'.
  attributes = do
    otype <- optional $ requireAttr "type"
    case mk <$> otype of
      Just "include" -> (,,,) otype <$> baseAttr <*> pure Nothing <*> (Just <$> linkAttr) <* ignoreAttrs
      Just "link" -> (,,,) otype <$> baseAttr <*> pure Nothing <*> (Just <$> linkAttr) <* ignoreAttrs
      Just "rss" -> (,,,) otype <$> baseAttr <*> (Just <$> subscriptionAttr) <*> pure Nothing <* ignoreAttrs
      _          -> (,,,) otype <$> baseAttr <*> pure Nothing <*> pure Nothing <* ignoreAttrs
  -- Attributes common to every outline; only @text@ is mandatory.
  baseAttr = (,,,,) <$> (requireAttr "text" >>= refineThrow)
                    <*> optional (requireAttr "isComment" >>= asBool)
                    <*> optional (requireAttr "isBreakpoint" >>= asBool)
                    <*> optional (requireAttr "created" >>= asTime)
                    <*> optional (asCategories <$> requireAttr "category")
  -- Kept as raw text here; converted with 'asURI' in the link handler below.
  linkAttr = requireAttr "url"
  subscriptionAttr = (,,,,,) <$> (requireAttr "xmlUrl" >>= asURI)
                             <*> optional (requireAttr "htmlUrl" >>= asURI)
                             <*> optional (requireAttr "description")
                             <*> optional (requireAttr "language")
                             <*> optional (requireAttr "title")
                             <*> optional (requireAttr "version")
  -- Equations are tried in order: subscription, then link, then generic.
  handler (_, b, Just s, _) = Node <$> (OpmlOutlineSubscription <$> baseHandler b <*> pure (subscriptionHandler s)) <*> pure []
  handler (_, b, _, Just l) = Node <$> (OpmlOutlineLink <$> baseHandler b <*> asURI l) <*> pure []
  -- The generic outline keeps the @type@ value as written (empty when absent).
  handler (otype, b, _, _) = Node <$> (OpmlOutlineGeneric <$> baseHandler b <*> pure (fromMaybe mempty otype))
                                  <*> (manyYield' parseOpmlOutline .| sinkList)
  -- Missing optional text/category values default to 'mempty'.
  baseHandler (txt, comment, breakpoint, created, category) = return $ OutlineBase txt comment breakpoint created (fromMaybe mempty category)
  subscriptionHandler (uri, html, desc, lang, title, version) = OutlineSubscription uri html (fromMaybe mempty desc) (fromMaybe mempty lang) (fromMaybe mempty title) (fromMaybe mempty version)


-- Intermediate representation of one parsed child of the top-level @<opml>@
-- element: either the @<head>@ section or the list of outlines in @<body>@.
data OpmlDocPiece = DocHead OpmlHead | DocBody [Tree OpmlOutline]

-- Generates the @_DocHead@ and @_DocBody@ traversals used by 'parseOpml'.
makeTraversals ''OpmlDocPiece


-- | Parse the top-level @\<opml\>@ element.
--
-- The @version@ attribute is required and read with 'asVersion'; all other
-- attributes are ignored. If @\<head\>@ is absent, 'mkOpmlHead' is used;
-- if @\<body\>@ is absent, the outline list is empty. When either section is
-- repeated, only its first occurrence is kept.
parseOpml :: (MonadCatch m) => ConduitM Event o m (Maybe Opml)
parseOpml = tag' "opml" versionAttr document where
  versionAttr = do
    version <- requireAttr "version" >>= asVersion
    ignoreAttrs
    return version
  document version = (manyYield' (choose sections) <* many ignoreAnyTreeContent) .| assemble version
  -- Route each parsed section to the matching field of 'Opml'.
  assemble version = getZipConduit $ Opml version
    <$> ZipConduit (projectC _DocHead .| headDefC mkOpmlHead)
    <*> ZipConduit (projectC _DocBody .| headDefC mempty)
  bodySection = tagIgnoreAttrs "body" $ manyYield' parseOpmlOutline .| sinkList
  sections = [ fmap DocHead <$> parseOpmlHead
             , fmap DocBody <$> bodySection
             ]