summaryrefslogtreecommitdiff
path: root/src/Text/FromHTML.hs
blob: 4caa6e20e7cc6d8db164386f412b4dc7fdb13ae3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
{-|
Module      : Text.FromHTML
Description : Simple library for transformation of HTML to other formats
Copyright   : (c) Marek Suchánek, 2018
License     : MIT
Maintainer  : marek.suchanek@fit.cvut.cz
Stability   : experimental
Portability : POSIX

Simplified API for transformation of HTML to other formats with Pandoc
and wkhtmltopdf in Haskell code. It requires @wkhtmltopdf@ and @pandoc@
to be installed locally.
-}
module Text.FromHTML
   ( fromHTML
   , ExportType(..)
   ) where

import qualified Data.Char as C
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
import qualified Data.ByteString as B

import           Control.Exception
import           GHC.IO.Encoding
import           System.Exit
import           System.Process.ByteString

-- | Target formats that 'fromHTML' can produce.
--
-- 'HTML' returns the input unchanged and 'PDF' is rendered with
-- @wkhtmltopdf@; every other format is delegated to @pandoc@, using the
-- lower-cased constructor name as the output format name.
--
-- The order of constructors determines the derived 'Enum' and 'Bounded'
-- instances ('HTML' is 'minBound', 'PDF' is 'maxBound').
data ExportType
    = HTML
    | LaTeX
    | RTF
    | RST
    | Markdown
    | AsciiDoc
    | Docx
    | ODT
    | DokuWiki
    | MediaWiki
    | EPUB2
    | EPUB3
    | PDF
    deriving (Show, Read, Enum, Bounded, Eq)

-- | Raw bytes handed to a conversion (the HTML document, UTF-8 encoded by
-- 'str2BS' in 'fromHTML').
type Input = B.ByteString
-- | Raw bytes produced by a conversion: either the converted document or an
-- error description.
type Output = B.ByteString
-- | A conversion step: consumes the input bytes and yields 'Left' with an
-- error description on failure or 'Right' with the converted document.
type Command = Input -> IO (Either Output Output)

-- | Encode a 'String' as a strict UTF-8 'B.ByteString'.
str2BS :: String -> B.ByteString
str2BS text = E.encodeUtf8 (T.pack text)

-- | Convert an HTML document, given as a 'String', to the requested format.
--
-- The document is UTF-8 encoded first. 'HTML' needs no conversion and is
-- returned as-is, 'PDF' goes through @wkhtmltopdf@ and every other format
-- goes through @pandoc@. The result is 'Right' with the converted bytes or
-- 'Left' with an error description.
fromHTML :: ExportType -> String -> IO (Either Output Output)
fromHTML target html =
    case target of
        HTML -> return (Right payload)  -- HTML is already provided!
        PDF  -> wkhtmltopdf payload
        _    -> pandoc target payload
  where
    payload = str2BS html

-- | Convert HTML to PDF by running the external @wkhtmltopdf@ process.
--
-- The document is read from standard input and the PDF is written to
-- standard output (the two trailing @-@ arguments). Pages get 25mm margins
-- on every side and a centred footer showing the page number.
--
-- The arguments are handed to the process as-is, without going through a
-- shell, so values must NOT carry shell-style quoting: an embedded @"@ would
-- become part of the footer text or of the font name.
wkhtmltopdf :: Command
wkhtmltopdf = perform "wkhtmltopdf"
    [ "--quiet", "--disable-smart-shrinking"
    , "--footer-center", "[page]"            -- replaced by the page number
    , "--footer-font-name", "Noto Serif"
    , "--footer-spacing", "10"
    , "--footer-font-size", "10"
    , "-B", "25mm", "-L", "25mm", "-R", "25mm", "-T", "25mm"
    , "--encoding", "utf-8"
    , "-", "-"                               -- stdin -> stdout
    ]

-- | Convert HTML to the given format by running the external @pandoc@
-- process.
--
-- Pandoc is asked for a standalone document (@-s@), reading HTML
-- (@-f html@) and writing the format named by 'exportType2PD' to standard
-- output (@-o -@).
pandoc :: ExportType -> Command
pandoc expt =
    perform "pandoc" ["-s", "-f", "html", "-t", exportType2PD expt, "-o", "-"]

-- | Run an external command, turning any exception into a 'Left' result.
--
-- Every exception raised while running the command ('SomeException', not
-- only 'IOException') is caught and reported as its 'show' text prefixed
-- with @Exception: @.
perform :: String -> [String] -> Command
perform cmd args input = handle report (performUnsafe cmd args input)
  where
    report :: SomeException -> IO (Either Output Output)
    report e = return (Left ("Exception: " <> str2BS (show e)))

-- | Run an external command without catching exceptions.
--
-- The input bytes are passed to the process. On a successful exit its
-- standard output is returned as 'Right'; otherwise the result is 'Left'
-- with the shown exit code followed by @: @ and the process's standard
-- error.
--
-- Note that this sets the locale encoding to UTF-8 before every run.
performUnsafe :: String -> [String] -> Command
performUnsafe cmd args input = do
    -- Force UTF-8 instead of relying on whatever locale the host provides.
    setLocaleEncoding utf8
    outcome <- readProcessWithExitCode cmd args input
    return (interpret outcome)
  where
    interpret :: (ExitCode, Output, Output) -> Either Output Output
    interpret (ExitSuccess, out, _) = Right out
    interpret (failure, _, errOut)  = Left (str2BS (show failure) <> ": " <> errOut)

-- | Name of the pandoc output format for an 'ExportType': the constructor
-- name, as rendered by 'show', converted to lower case.
exportType2PD :: ExportType -> String
exportType2PD expt = [C.toLower ch | ch <- show expt]