summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnthonyCowley <>2017-12-06 17:19:00 (GMT)
committerhdiff <hdiff@hdiff.luite.com>2017-12-06 17:19:00 (GMT)
commitc5c855fcb6a0cbb40964de63a2e63a2cdc782dde (patch)
treee700c40becc27a7967b3b339dfcb0d83b85c3740
parent2612fc05123dad0cbd7fc1049c925043ec9469fb (diff)
version 0.3.00.3.0
-rw-r--r--CHANGELOG.md11
-rw-r--r--Frames.cabal19
-rw-r--r--benchmarks/BenchDemo.hs2
-rw-r--r--benchmarks/InsuranceBench.hs10
-rw-r--r--benchmarks/panda.py5
-rw-r--r--demo/Kata04.hs28
-rw-r--r--demo/Main.hs14
-rw-r--r--demo/MissingData.hs4
-rw-r--r--demo/Plot.hs4
-rw-r--r--demo/Plot2.hs11
-rw-r--r--src/Frames.hs64
-rw-r--r--src/Frames/CSV.hs228
-rw-r--r--src/Frames/RecF.hs30
13 files changed, 290 insertions, 140 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6dbea23..a4593ad 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,14 @@
+# 0.3.0
+
+- Pervasive use of `pipes` for CSV data loading
+
+This provides better exception handling (file handles should be closed more reliably), and offers an interface point for customized handling of input texts. An example of this latter point is working with particular file encodings.
+
+A breaking change is that operations that previously returned `IO` values now return `MonadSafe` constrained values.
+
+- Adaptation of `Data.Vinyl.Curry.runcurry` to the Frames `Record` type
+This simply strips the column name information from a row before applying the function from `vinyl`.
+
# 0.2.1
- Refactored to use the `CoRec` type provided by `vinyl` >= 0.6.0
diff --git a/Frames.cabal b/Frames.cabal
index c776001..35c0bc2 100644
--- a/Frames.cabal
+++ b/Frames.cabal
@@ -1,5 +1,5 @@
name: Frames
-version: 0.2.1.1
+version: 0.3.0
synopsis: Data frames For working with tabular data files
description: User-friendly, type safe, runtime efficient tooling for
working with tabular data deserialized from
@@ -59,7 +59,12 @@ library
vector,
readable >= 0.3.1,
pipes >= 4.1 && < 5,
- vinyl >= 0.6 && < 0.7
+ pipes-bytestring >= 2.1.6 && < 2.2,
+ pipes-group >= 1.0.8 && < 1.1,
+ pipes-parse >= 3.0 && < 3.1,
+ pipes-safe >= 2.2.6 && < 2.3,
+ pipes-text >= 0.0.2.5 && < 0.1,
+ vinyl >= 0.6 && < 0.8
hs-source-dirs: src
default-language: Haskell2010
ghc-options: -Wall
@@ -85,6 +90,7 @@ executable plot
microlens, vector, text,
template-haskell,
pipes >= 4.1.5 && < 4.4,
+ pipes-safe,
Chart >= 1.5 && < 1.9,
Chart-diagrams >= 1.5 && < 1.9,
diagrams-rasterific >= 1.3 && < 1.5,
@@ -117,7 +123,7 @@ executable demo
if flag(demos)
build-depends: base, list-t, microlens, transformers, Frames,
vector, text, template-haskell, ghc-prim, readable,
- pipes >= 4.1.5 && < 4.4
+ pipes
hs-source-dirs: demo
default-language: Haskell2010
ghc-options: -O2
@@ -130,8 +136,7 @@ executable tutorial
if flag(demos)
build-depends: base, Frames,
microlens, vector, text, template-haskell, readable,
- foldl >= 1.1.0 && < 1.3,
- pipes >= 4.1.5 && < 4.4
+ foldl, pipes
hs-source-dirs: demo
default-language: Haskell2010
@@ -142,7 +147,7 @@ executable benchdemo
main-is: BenchDemo.hs
if flag(demos)
build-depends: base, Frames,
- foldl >= 1.1.0 && < 1.3,
+ foldl >= 1.1.0 && < 1.4,
pipes >= 4.1.5 && < 4.4
hs-source-dirs: benchmarks
default-language: Haskell2010
@@ -158,7 +163,7 @@ executable missing
buildable: False
main-is: MissingData.hs
if flag(demos)
- build-depends: base, Frames, vinyl, pipes
+ build-depends: base, Frames, vinyl, pipes, pipes-safe
hs-source-dirs: demo
default-language: Haskell2010
diff --git a/benchmarks/BenchDemo.hs b/benchmarks/BenchDemo.hs
index 36397bd..aaee3ad 100644
--- a/benchmarks/BenchDemo.hs
+++ b/benchmarks/BenchDemo.hs
@@ -11,7 +11,7 @@ import Pipes.Prelude (fold)
tableTypes "Ins" "data/FL2.csv"
main :: IO ()
-main = do (lat,lng,n) <- F.purely fold f (readTable "data/FL2.csv")
+main = do (lat,lng,n) <- runSafeT $ F.purely fold f (readTable "data/FL2.csv")
print $ lat / n
print $ lng / n
where f :: F.Fold Ins (Double,Double,Double)
diff --git a/benchmarks/InsuranceBench.hs b/benchmarks/InsuranceBench.hs
index de76d40..06b8e5b 100644
--- a/benchmarks/InsuranceBench.hs
+++ b/benchmarks/InsuranceBench.hs
@@ -13,7 +13,7 @@ tableTypes "Ins" "data/FL2.csv"
type TinyIns = Record [PolicyID, PointLatitude, PointLongitude]
-tblP :: P.Producer Ins IO ()
+tblP :: P.Producer Ins (SafeT IO) ()
tblP = readTable "data/FL2.csv"
-- Strict pair
@@ -22,13 +22,15 @@ data P a = P !a !a
-- | Perform two consecutive folds of streamed-in data.
pipeBench :: IO (P Double)
pipeBench = do (n,sumLat) <-
+ runSafeT $
P.fold (\ !(!i, !s) r -> (i+1, s+rget pointLatitude r))
(0::Int,0)
id
tbl
- sumLong <- P.fold (\s r -> (s + rget pointLongitude r)) 0 id tbl
+ sumLong <- runSafeT $
+ P.fold (\s r -> (s + rget pointLongitude r)) 0 id tbl
return $! P (sumLat / fromIntegral n) (sumLong / fromIntegral n)
- where tbl = P.for tblP (P.yield . rcast) :: P.Producer TinyIns IO ()
+ where tbl = P.for tblP (P.yield . rcast) :: P.Producer TinyIns (SafeT IO) ()
-- | Perform two consecutive folds after first streaming all data into
-- an in-memory representation.
@@ -75,4 +77,4 @@ main :: IO ()
main = defaultMain [ bench "pipes" $ whnfIO pipeBench
, bench "pipes in-core" $ whnfIO pipeBenchInCore
, bench "pipes in-core subset" $ whnfIO pipeBenchInCore'
- , bench "pipes AoS" $ whnfIO pipeBenchAoS ]
+ , bench "pipes AoS subset" $ whnfIO pipeBenchAoS ]
diff --git a/benchmarks/panda.py b/benchmarks/panda.py
index 1fb09bd..20e7dff 100644
--- a/benchmarks/panda.py
+++ b/benchmarks/panda.py
@@ -1,7 +1,10 @@
# Demonstration of streaming data processing. Try building with
# cabal, then running with in bash with something like,
-#
+#
# $ /usr/bin/time -l python benchmarks/panda.py 2>&1 | head -n 4
+#
+# Or
+# nix-shell -p 'python.withPackages (p: [p.pandas])' --run '/usr/bin/time -l python benchmarks/panda.py 2>&1 | head -n 4'
from pandas import DataFrame, read_csv
import pandas as pd
diff --git a/demo/Kata04.hs b/demo/Kata04.hs
index 8b4eb92..487cf99 100644
--- a/demo/Kata04.hs
+++ b/demo/Kata04.hs
@@ -1,11 +1,11 @@
-{-# LANGUAGE DataKinds, FlexibleContexts, OverloadedStrings,
- TemplateHaskell, TypeFamilies, TypeOperators,
- MultiParamTypeClasses, ScopedTypeVariables,
- FlexibleInstances #-}
+{-# LANGUAGE AllowAmbiguousTypes, DataKinds, EmptyCase,
+ FlexibleContexts, FlexibleInstances,
+ MultiParamTypeClasses, OverloadedStrings,
+ ScopedTypeVariables, TemplateHaskell, TypeApplications,
+ TypeFamilies, TypeOperators #-}
module Main where
import qualified Data.Foldable as F
import Data.Ord (comparing)
-import Data.Proxy
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Data.Vinyl.Functor (Identity(..))
@@ -33,24 +33,26 @@ type family Find i xs where
Find ('S i) (x ': xs) = Find i xs
class GetFieldByIndex i rs where
- getFieldByIndex :: proxy i -> Record rs -> Find i rs
+ getFieldByIndex :: Record rs -> Find i rs
instance GetFieldByIndex 'Z ((s :-> r) ': rs) where
- getFieldByIndex _ (Identity x :& _) = x
+ getFieldByIndex (Identity x :& _) = x
instance GetFieldByIndex i rs => GetFieldByIndex ('S i) ((s :-> r) ': rs) where
- getFieldByIndex _ (_ :& xs) = getFieldByIndex (Proxy :: Proxy i) xs
+ getFieldByIndex (_ :& xs) = getFieldByIndex @i xs
-getTemperatureRange' :: (GetFieldByIndex (S Z) rs, GetFieldByIndex (S (S Z)) rs,
- Find (S Z) rs ~ Double, Find (S (S Z)) rs ~ Double )
+getTemperatureRange' :: (GetFieldByIndex ('S 'Z) rs,
+ GetFieldByIndex ('S ('S 'Z)) rs,
+ Find ('S 'Z) rs ~ Double,
+ Find ('S ('S 'Z)) rs ~ Double )
=> Record rs -> Double
getTemperatureRange' row = mx - mn
- where mx = getFieldByIndex (Proxy::Proxy ('S 'Z)) row
- mn = getFieldByIndex (Proxy::Proxy ('S ('S 'Z))) row
+ where mx = getFieldByIndex @('S 'Z) row
+ mn = getFieldByIndex @('S ('S 'Z)) row
partOne' :: IO T.Text
partOne' = do tbl <- inCoreAoS (readTable "data/weather.csv") :: IO (Frame Row)
- return . getFieldByIndex (Proxy::Proxy Z) $
+ return . getFieldByIndex @'Z $
F.maximumBy (comparing getTemperatureRange') tbl
main :: IO ()
diff --git a/demo/Main.hs b/demo/Main.hs
index 1ee7e8a..f03b26d 100644
--- a/demo/Main.hs
+++ b/demo/Main.hs
@@ -4,26 +4,22 @@ module Main where
import Data.Functor.Identity
import Frames
import Lens.Micro
-import qualified ListT as L
import qualified Pipes as P
import qualified Pipes.Prelude as P
tableTypes "Row" "data/data1.csv"
-listTlist :: Monad m => L.ListT m a -> m [a]
-listTlist = L.toList
-
tbl :: IO [Row]
-tbl = listTlist $ readTable' "data/data1.csv"
+tbl = runSafeT . P.toListM $ readTable "data/data1.csv"
ageDoubler :: (Age ∈ rs) => Record rs -> Record rs
ageDoubler = age %~ (* 2)
tbl2 :: IO [Row]
-tbl2 = listTlist $ readTable' "data/data2.csv"
+tbl2 = runSafeT . P.toListM $ readTable "data/data2.csv"
tbl2a :: IO [ColFun Maybe Row]
-tbl2a = P.toListM $ readTableMaybe "data/data2.csv"
+tbl2a = runSafeT . P.toListM $ readTableMaybe "data/data2.csv"
{-
@@ -53,10 +49,10 @@ REPL examples:
-- characters (\n) for the text library's line parsing to work.
tableTypes "Ins" "data/FL2.csv"
-insuranceTbl :: P.Producer Ins IO ()
+insuranceTbl :: MonadSafe m => P.Producer Ins m ()
insuranceTbl = readTable "data/FL2.csv"
-insMaybe :: P.Producer (ColFun Maybe Ins) IO ()
+insMaybe :: MonadSafe m => P.Producer (ColFun Maybe Ins) m ()
insMaybe = readTableMaybe "data/FL2.csv"
type TinyIns = Record [PolicyID, PointLatitude, PointLongitude]
diff --git a/demo/MissingData.hs b/demo/MissingData.hs
index 27f69e5..3572c1b 100644
--- a/demo/MissingData.hs
+++ b/demo/MissingData.hs
@@ -55,7 +55,7 @@ tableTypes "Row" "data/missing.csv"
-- | Fill in missing columns with a default 'Row' value synthesized
-- from 'Default' instances.
-holesFilled :: Producer Row IO ()
+holesFilled :: MonadSafe m => Producer Row m ()
holesFilled = readTableMaybe "data/missing.csv" >-> P.map (fromJust . holeFiller)
where holeFiller :: Rec Maybe (RecordColumns Row) -> Maybe Row
holeFiller = recMaybe
@@ -65,7 +65,7 @@ holesFilled = readTableMaybe "data/missing.csv" >-> P.map (fromJust . holeFiller
fromJust = maybe (error "Frames holesFilled failure") id
showFilledHoles :: IO ()
-showFilledHoles = pipePreview holesFilled 10 cat
+showFilledHoles = runSafeT (pipePreview holesFilled 10 cat)
main :: IO ()
main = return ()
diff --git a/demo/Plot.hs b/demo/Plot.hs
index 4d5a133..73b5dd5 100644
--- a/demo/Plot.hs
+++ b/demo/Plot.hs
@@ -14,7 +14,7 @@ import Statistics.Sample.KernelDensity (kde)
tableTypes "Trigly" "data/trigly_d.csv"
-- Load the data. Invalid records use zeros as a placeholder.
-triglyData :: P.Producer Trigly IO ()
+triglyData :: MonadSafe m => P.Producer Trigly m ()
triglyData = readTable "data/trigly_d.csv" P.>-> P.filter ((> 0) . view lBDLDL)
-- Adapted from a Chart example
@@ -49,7 +49,7 @@ mkPlots xs = do layout_title .= "Distributions"
main :: IO ()
main = do env <- defaultEnv bitmapAlignmentFns 640 480
let chart2diagram = fst . runBackendR env . toRenderable . execEC
- ldlData <- P.toListM $ triglyData P.>-> P.map rcast
+ ldlData <- runSafeT . P.toListM $ triglyData P.>-> P.map rcast
let d = chart2diagram $ mkPlots ldlData
sz = dims2D (width d) (height d)
renderRasterific "plot.png" sz d
diff --git a/demo/Plot2.hs b/demo/Plot2.hs
index 8c9d7fe..489029d 100644
--- a/demo/Plot2.hs
+++ b/demo/Plot2.hs
@@ -14,21 +14,21 @@ import qualified Data.Foldable as F
-- Data from http://archive.ics.uci.edu/ml/datasets/Adult
tableTypes "Income" "data/adult.csv"
-adultData :: Producer Income IO ()
+adultData :: MonadSafe m => Producer Income m ()
adultData = readTable "data/adult.csv"
-fishers :: Producer Income IO ()
+fishers :: MonadSafe m => Producer Income m ()
fishers = adultData >-> P.filter isFisher >-> P.filter makesMoney
where isFisher = ((>0) . T.count "fishing" . T.toCaseFold . view occupation)
makesMoney = (> 0) . view capitalGain
-fisherIncomeData :: Producer (Record [Age, CapitalGain]) IO ()
+fisherIncomeData :: MonadSafe m => Producer (Record [Age, CapitalGain]) m ()
fisherIncomeData = fishers >-> P.map rcast
mkPlot :: IO ()
mkPlot = do env <- defaultEnv bitmapAlignmentFns 640 480
let chart2diagram = fst . runBackendR env . toRenderable . execEC
- xs <- P.toListM fisherIncomeData
+ xs <- runSafeT $ P.toListM fisherIncomeData
let d = chart2diagram $ do
layout_title .= "Farmer/fisher Income vs Age"
layout_x_axis . laxis_title .= "Age (Years)"
@@ -39,7 +39,8 @@ mkPlot = do env <- defaultEnv bitmapAlignmentFns 640 480
-- Manually fused folds
main :: IO ()
-main = do ((age_,inc,n), _) <- P.fold' aux (0,0,0::Double) id fisherIncomeData
+main = do ((age_,inc,n), _) <- runSafeT $
+ P.fold' aux (0,0,0::Double) id fisherIncomeData
putStrLn $ "The average farmer/fisher is "++
show (fromIntegral age_ / n) ++
" and made " ++ show (fromIntegral inc / n)
diff --git a/src/Frames.hs b/src/Frames.hs
index f87fedb..dc7ed37 100644
--- a/src/Frames.hs
+++ b/src/Frames.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE TypeFamilies #-}
-- | User-friendly, type safe, runtime efficient tooling for working
-- with tabular data deserialized from comma-separated values (CSV)
-- files. The type of each row of data is inferred from data, which
@@ -12,29 +13,82 @@ module Frames
, module Frames.CSV
, module Frames.Exploration
, module Frames.Frame
- , module Frames.InCore
+ , inCoreAoS, inCoreAoS', inCore, inCoreSoA
+ , I.toAoS, I.toFrame, I.filterFrame
, module Frames.Melt
, module Frames.Rec
, module Frames.RecF
, module Frames.RecLens
, module Frames.TypeLevel
+ , module Pipes.Safe, runSafeEffect
, Text
) where
+import Control.Monad.IO.Class (MonadIO)
+import Control.Monad.Primitive
import Data.Text (Text)
-import Data.Vinyl ((<+>))
+import Data.Vinyl ((<+>), Rec)
import Data.Vinyl.CoRec (Field, onField, onCoRec)
import Data.Vinyl.Lens hiding (rlens, rget, rput)
import Data.Vinyl.TypeLevel (AllConstrained, AllSatisfied, AllAllSat)
import Frames.Col ((:->)(..))
import Frames.ColumnUniverse
-import Frames.CSV (readTable, readTableMaybe, readTable', declareColumn,
+import Frames.CSV (readTable, readTableMaybe, declareColumn,
+ pipeTable, pipeTableMaybe,
tableType, tableTypes, tableType', tableTypes')
import Frames.Exploration
import Frames.Frame
-import Frames.InCore (toFrame, inCore, inCoreSoA,
- inCoreAoS, inCoreAoS', toAoS, filterFrame)
+import qualified Frames.InCore as I
import Frames.Melt (melt, meltRow)
import Frames.Rec (Record, RecordColumns, (&:), recUncons, recMaybe, showFields)
import Frames.RecF
import Frames.RecLens
import Frames.TypeLevel
+import qualified Pipes as P
+import Pipes.Safe (MonadSafe, runSafeT, runSafeP, SafeT)
+import qualified Pipes.Safe as PS
+
+-- * SafeT helpers
+
+-- | Run a self-contained 'Pipes.Effect' and execute the finalizers
+-- associated with the 'SafeT' transformer.
+runSafeEffect :: (MonadIO m, PS.MonadMask m)
+ => P.Effect (SafeT m) r -> m r
+runSafeEffect = runSafeT . P.runEffect
+
+-- | Stream a finite sequence of rows into an efficient in-memory
+-- representation for further manipulation. Each column of the input
+-- table will be stored optimally based on its type, making use of the
+-- resulting generators a matter of indexing into a densely packed
+-- representation. Returns a 'Frame' that provides a function to index
+-- into the table.
+inCoreAoS :: (PrimMonad m, MonadIO m, PS.MonadMask m, I.RecVec rs)
+ => P.Producer (Record rs) (PS.SafeT m) () -> m (FrameRec rs)
+inCoreAoS = runSafeT . I.inCoreAoS
+
+-- | Like 'inCoreAoS', but applies the provided function to the record
+-- of columns before building the 'Frame'.
+inCoreAoS' :: (PrimMonad m, MonadIO m, PS.MonadMask m, I.RecVec rs)
+ => (Rec ((->) Int) rs -> Rec ((->) Int) ss)
+ -> P.Producer (Record rs) (SafeT m) () -> m (FrameRec ss)
+inCoreAoS' f = runSafeT . I.inCoreAoS' f
+
+-- | Stream a finite sequence of rows into an efficient in-memory
+-- representation for further manipulation. Each column of the input
+-- table will be stored optimally based on its type, making use of the
+-- resulting generator a matter of indexing into a densely packed
+-- representation.
+inCore :: (PrimMonad m, MonadIO m, PS.MonadMask m, I.RecVec rs, Monad n)
+ => P.Producer (Record rs) (SafeT m) () -> m (P.Producer (Record rs) n ())
+inCore = runSafeT . I.inCore
+
+-- | Stream a finite sequence of rows into an efficient in-memory
+-- representation for further manipulation. Each column of the input
+-- table will be stored optimally based on its type, making use of the
+-- resulting generators a matter of indexing into a densely packed
+-- representation. Returns the number of rows and a record of column
+-- indexing functions. See 'toAoS' to convert the result to a 'Frame'
+-- which provides an easier-to-use function that indexes into the
+-- table in a row-major fashion.
+inCoreSoA :: (PrimMonad m, MonadIO m, PS.MonadMask m, I.RecVec rs)
+ => P.Producer (Record rs) (SafeT m) () -> m (Int, Rec ((->) Int) rs)
+inCoreSoA = runSafeT . I.inCoreSoA
diff --git a/src/Frames/CSV.hs b/src/Frames/CSV.hs
index 1369771..95efdc0 100644
--- a/src/Frames/CSV.hs
+++ b/src/Frames/CSV.hs
@@ -1,15 +1,7 @@
-{-# LANGUAGE BangPatterns,
- CPP,
- DataKinds,
- FlexibleInstances,
- KindSignatures,
- LambdaCase,
- MultiParamTypeClasses,
- OverloadedStrings,
- QuasiQuotes,
- RecordWildCards,
- ScopedTypeVariables,
- TemplateHaskell,
+{-# LANGUAGE BangPatterns, CPP, DataKinds, FlexibleInstances,
+ KindSignatures, LambdaCase, MultiParamTypeClasses,
+ OverloadedStrings, QuasiQuotes, RankNTypes,
+ RecordWildCards, ScopedTypeVariables, TemplateHaskell,
TypeOperators #-}
-- | Infer row types from comma-separated values (CSV) data and read
-- that data from files. Template Haskell is used to generate the
@@ -24,8 +16,7 @@ import Data.Monoid (Monoid)
#endif
import Control.Arrow (first, second)
-import Control.Monad (MonadPlus(..), when, void)
-import Control.Monad.IO.Class
+import Control.Monad (when, void)
import Data.Char (isAlpha, isAlphaNum, toLower, toUpper)
import qualified Data.Foldable as F
import Data.List (intercalate)
@@ -33,7 +24,6 @@ import Data.Maybe (isNothing, fromMaybe)
import Data.Monoid ((<>))
import Data.Proxy
import qualified Data.Text as T
-import qualified Data.Text.IO as T
import Data.Vinyl (RElem, Rec)
import Data.Vinyl.TypeLevel (RecAll, RIndex)
import Data.Vinyl.Functor (Identity)
@@ -45,9 +35,18 @@ import Frames.RecF
import Frames.RecLens
import Language.Haskell.TH
import Language.Haskell.TH.Syntax
+import Pipes ((>->))
import qualified Pipes as P
import qualified Pipes.Prelude as P
-import System.IO (Handle, hIsEOF, openFile, IOMode(..), withFile)
+import qualified Pipes.ByteString
+import qualified Pipes.Group
+import qualified Pipes.Parse as P
+import qualified Pipes.Prelude.Text as PT
+import qualified Pipes.Text as PT
+import qualified Pipes.Text.Encoding as PT
+import qualified Pipes.Safe as P
+import qualified Pipes.Safe.Prelude
+import System.IO (IOMode(ReadMode))
type Separator = T.Text
@@ -150,25 +149,26 @@ reassembleRFC4180QuotedParts sep quoteChar = go
-- | Infer column types from a prefix (up to 1000 lines) of a CSV
-- file.
-prefixInference :: (ColumnTypeable a, Monoid a)
- => ParserOptions -> Handle -> IO [a]
-prefixInference opts h = T.hGetLine h >>= go prefixSize . inferCols
- where prefixSize = 1000 :: Int
- inferCols = map inferType . tokenizeRow opts
- go 0 ts = return ts
- go !n ts =
- hIsEOF h >>= \case
- True -> return ts
- False -> T.hGetLine h >>= go (n - 1) . zipWith (<>) ts . inferCols
+prefixInference :: (ColumnTypeable a, Monoid a, Monad m)
+ => ParserOptions
+ -> P.Parser T.Text m [a]
+prefixInference opts = P.draw >>= \case
+ Nothing -> return []
+ Just row1 -> P.foldAll (\ts -> zipWith (<>) ts . inferCols)
+ (inferCols row1)
+ id
+ where inferCols = map inferType . tokenizeRow opts
-- | Extract column names and inferred types from a CSV file.
-readColHeaders :: (ColumnTypeable a, Monoid a)
- => ParserOptions -> FilePath -> IO [(T.Text, a)]
-readColHeaders opts f = withFile f ReadMode $ \h ->
- zip <$> maybe (tokenizeRow opts <$> T.hGetLine h)
- pure
- (headerOverride opts)
- <*> prefixInference opts h
+readColHeaders :: (ColumnTypeable a, Monoid a, Monad m)
+ => ParserOptions -> P.Producer T.Text m () -> m [(T.Text, a)]
+readColHeaders opts = P.evalStateT $
+ do headerRow <- maybe ((tokenizeRow opts
+ . fromMaybe (error "Empty Producer has no header row")) <$> P.draw)
+ pure
+ (headerOverride opts)
+ colTypes <- prefixInference opts
+ return (zip headerRow colTypes)
-- * Loading Data
@@ -184,70 +184,101 @@ instance (Parseable t, ReadRec ts) => ReadRec (s :-> t ': ts) where
readRec [] = frameCons Nothing (readRec [])
readRec (h:t) = frameCons (parse' h) (readRec t)
+-- | Produce the lines of a latin1 (or ISO8859 Part 1) encoded file as
+-- 'T.Text' values. Similar to 'PT.readFileLn' that uses the system
+-- locale for decoding, but built on the 'PT.decodeIso8859_1' decoder.
+readFileLatin1Ln :: P.MonadSafe m => FilePath -> P.Producer T.Text m ()
+readFileLatin1Ln fp = Pipes.Safe.Prelude.withFile fp ReadMode $ \h ->
+ let latinText = void (PT.decodeIso8859_1 (Pipes.ByteString.fromHandle h))
+ latinLines = PT.decode PT.lines latinText
+ in Pipes.Group.concats latinLines
+
-- | Read a 'RecF' from one line of CSV.
readRow :: ReadRec rs => ParserOptions -> T.Text -> Rec Maybe rs
readRow = (readRec .) . tokenizeRow
-- | Produce rows where any given entry can fail to parse.
-readTableMaybeOpt :: (MonadIO m, ReadRec rs)
+readTableMaybeOpt :: (P.MonadSafe m, ReadRec rs)
=> ParserOptions -> FilePath -> P.Producer (Rec Maybe rs) m ()
readTableMaybeOpt opts csvFile =
- do h <- liftIO $ do
- h <- openFile csvFile ReadMode
- when (isNothing $ headerOverride opts) (void $ T.hGetLine h)
- return h
- let go = liftIO (hIsEOF h) >>= \case
- True -> return ()
- False -> liftIO (readRow opts <$> T.hGetLine h) >>= P.yield >> go
- go
+ PT.readFileLn csvFile >-> pipeTableMaybeOpt opts
{-# INLINE readTableMaybeOpt #-}
+-- | Stream lines of CSV data into rows of 'Rec' values where
+-- any given entry can fail to parse.
+pipeTableMaybeOpt :: (Monad m, ReadRec rs)
+ => ParserOptions
+ -> P.Pipe T.Text (Rec Maybe rs) m ()
+pipeTableMaybeOpt opts = do
+ when (isNothing (headerOverride opts)) (() <$ P.await)
+ P.map (readRow opts)
+{-# INLINE pipeTableMaybeOpt #-}
+
-- | Produce rows where any given entry can fail to parse.
-readTableMaybe :: (MonadIO m, ReadRec rs)
+readTableMaybe :: (P.MonadSafe m, ReadRec rs)
=> FilePath -> P.Producer (Rec Maybe rs) m ()
readTableMaybe = readTableMaybeOpt defaultParser
{-# INLINE readTableMaybe #-}
--- | Returns a `MonadPlus` producer of rows for which each column was
--- successfully parsed. This is typically slower than 'readTableOpt'.
-readTableOpt' :: forall m rs.
- (MonadPlus m, MonadIO m, ReadRec rs)
- => ParserOptions -> FilePath -> m (Record rs)
-readTableOpt' opts csvFile =
- do h <- liftIO $ do
- h <- openFile csvFile ReadMode
- when (isNothing $ headerOverride opts) (void $ T.hGetLine h)
- return h
- let go = liftIO (hIsEOF h) >>= \case
- True -> mzero
- False -> let r = recMaybe . readRow opts <$> T.hGetLine h
- in liftIO r >>= maybe go (flip mplus go . return)
- go
-{-# INLINE readTableOpt' #-}
-
--- | Returns a `MonadPlus` producer of rows for which each column was
--- successfully parsed. This is typically slower than 'readTable'.
-readTable' :: forall m rs. (MonadPlus m, MonadIO m, ReadRec rs)
- => FilePath -> m (Record rs)
-readTable' = readTableOpt' defaultParser
-{-# INLINE readTable' #-}
+-- | Stream lines of CSV data into rows of 'Rec' values where any
+-- given entry can fail to parse.
+pipeTableMaybe :: (Monad m, ReadRec rs) => P.Pipe T.Text (Rec Maybe rs) m ()
+pipeTableMaybe = pipeTableMaybeOpt defaultParser
+{-# INLINE pipeTableMaybe #-}
+
+-- -- | Returns a `MonadPlus` producer of rows for which each column was
+-- -- successfully parsed. This is typically slower than 'readTableOpt'.
+-- readTableOpt' :: forall m rs.
+-- (MonadPlus m, MonadIO m, ReadRec rs)
+-- => ParserOptions -> FilePath -> m (Record rs)
+-- readTableOpt' opts csvFile =
+-- do h <- liftIO $ do
+-- h <- openFile csvFile ReadMode
+-- when (isNothing $ headerOverride opts) (void $ T.hGetLine h)
+-- return h
+-- let go = liftIO (hIsEOF h) >>= \case
+-- True -> mzero
+-- False -> let r = recMaybe . readRow opts <$> T.hGetLine h
+-- in liftIO r >>= maybe go (flip mplus go . return)
+-- go
+-- {-# INLINE readTableOpt' #-}
+
+-- -- | Returns a `MonadPlus` producer of rows for which each column was
+-- -- successfully parsed. This is typically slower than 'readTable'.
+-- readTable' :: forall m rs. (P.MonadSafe m, ReadRec rs)
+-- => FilePath -> m (Record rs)
+-- readTable' = readTableOpt' defaultParser
+-- {-# INLINE readTable' #-}
-- | Returns a producer of rows for which each column was successfully
-- parsed.
readTableOpt :: forall m rs.
- (MonadIO m, ReadRec rs)
+ (P.MonadSafe m, ReadRec rs)
=> ParserOptions -> FilePath -> P.Producer (Record rs) m ()
readTableOpt opts csvFile = readTableMaybeOpt opts csvFile P.>-> go
where go = P.await >>= maybe go (\x -> P.yield x >> go) . recMaybe
{-# INLINE readTableOpt #-}
+-- | Pipe lines of CSV text into rows for which each column was
+-- successfully parsed.
+pipeTableOpt :: (ReadRec rs, Monad m)
+ => ParserOptions -> P.Pipe T.Text (Record rs) m ()
+pipeTableOpt opts = pipeTableMaybeOpt opts >-> P.map recMaybe >-> P.concat
+{-# INLINE pipeTableOpt #-}
+
-- | Returns a producer of rows for which each column was successfully
-- parsed.
-readTable :: forall m rs. (MonadIO m, ReadRec rs)
+readTable :: forall m rs. (P.MonadSafe m, ReadRec rs)
=> FilePath -> P.Producer (Record rs) m ()
readTable = readTableOpt defaultParser
{-# INLINE readTable #-}
+-- | Pipe lines of CSV text into rows for which each column was
+-- successfully parsed.
+pipeTable :: (ReadRec rs, Monad m) => P.Pipe T.Text (Record rs) m ()
+pipeTable = pipeTableOpt defaultParser
+{-# INLINE pipeTable #-}
+
-- * Template Haskell
-- | Generate a column type.
@@ -257,14 +288,18 @@ recDec = appT [t|Record|] . go
go ((n,t):cs) =
[t|($(litT $ strTyLit (T.unpack n)) :-> $(t)) ': $(go cs) |]
+
+-- | Capitalize the first letter of a 'T.Text'.
+capitalize1 :: T.Text -> T.Text
+capitalize1 = foldMap (onHead toUpper) . T.split (not . isAlphaNum)
+ where onHead f = maybe mempty (uncurry T.cons . first f) . T.uncons
+
-- | Massage a column name from a CSV file into a valid Haskell type
-- identifier.
sanitizeTypeName :: T.Text -> T.Text
sanitizeTypeName = unreserved . fixupStart
- . T.concat . T.split (not . valid) . toTitle'
+ . T.concat . T.split (not . valid) . capitalize1
where valid c = isAlphaNum c || c == '\'' || c == '_'
- toTitle' = foldMap (onHead toUpper) . T.split (not . isAlphaNum)
- onHead f = maybe mempty (uncurry T.cons) . fmap (first f) . T.uncons
unreserved t
| t `elem` ["Type", "Class"] = "Col" <> t
| otherwise = t
@@ -312,7 +347,7 @@ lowerHead = fmap aux . T.uncons
colDec :: ColumnTypeable a => T.Text -> T.Text -> a -> DecsQ
colDec prefix colName colTy = (:) <$> mkColTDec colTypeQ colTName'
<*> mkColPDec colTName' colTyQ colPName
- where colTName = sanitizeTypeName (prefix <> colName)
+ where colTName = sanitizeTypeName (prefix <> capitalize1 colName)
colPName = fromMaybe "colDec impossible" (lowerHead colTName)
colTName' = mkName $ T.unpack colTName
colTyQ = colType colTy
@@ -353,6 +388,8 @@ data RowGen a = RowGen { columnNames :: [String]
-- can be used to classify a column. This is
-- essentially a type-level list of types. See
-- 'colQ'.
+ , lineReader :: P.Producer T.Text (P.SafeT IO) ()
+ -- ^ A producer of lines of 'T.Text'
}
-- | Shorthand for a 'Proxy' value of 'ColumnUniverse' applied to the
@@ -364,13 +401,13 @@ colQ n = [e| (Proxy :: Proxy (ColumnUniverse $(conT n))) |]
-- get column names from the data file, use the default column
-- separator (a comma), infer column types from the default 'Columns'
-- set of types, and produce a row type with name @Row@.
-rowGen :: RowGen Columns
-rowGen = RowGen [] "" defaultSep "Row" Proxy
+rowGen :: FilePath -> RowGen Columns
+rowGen = RowGen [] "" defaultSep "Row" Proxy . PT.readFileLn
-- | Generate a type for each row of a table. This will be something
-- like @Record ["x" :-> a, "y" :-> b, "z" :-> c]@.
tableType :: String -> FilePath -> DecsQ
-tableType n = tableType' rowGen { rowTypeName = n }
+tableType n fp = tableType' (rowGen fp) { rowTypeName = n }
-- | Like 'tableType', but additionally generates a type synonym for
-- each column, and a proxy value of that type. If the CSV file has
@@ -378,32 +415,42 @@ tableType n = tableType' rowGen { rowTypeName = n }
-- @type Foo = "foo" :-> Int@, for example, @foo = rlens (Proxy :: Proxy
-- Foo)@, and @foo' = rlens' (Proxy :: Proxy Foo)@.
tableTypes :: String -> FilePath -> DecsQ
-tableTypes n = tableTypes' rowGen { rowTypeName = n }
+tableTypes n fp = tableTypes' (rowGen fp) { rowTypeName = n }
-- * Customized Data Set Parsing
+-- | Inspect no more than this many lines when inferring column types.
+prefixSize :: Int
+prefixSize = 1000
+
-- | Generate a type for a row of a table. This will be something like
-- @Record ["x" :-> a, "y" :-> b, "z" :-> c]@. Column type synonyms
-- are /not/ generated (see 'tableTypes'').
tableType' :: forall a. (ColumnTypeable a, Monoid a)
- => RowGen a -> FilePath -> DecsQ
-tableType' (RowGen {..}) csvFile =
+ => RowGen a -> DecsQ
+tableType' (RowGen {..}) =
pure . TySynD (mkName rowTypeName) [] <$>
- (runIO (readColHeaders opts csvFile) >>= recDec')
+ (runIO (P.runSafeT (readColHeaders opts lineSource)) >>= recDec')
where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')
+ lineSource = lineReader >-> P.take prefixSize
+
+-- | Tokenize the first line of a 'P.Producer'.
+colNamesP :: Monad m
+ => ParserOptions -> P.Producer T.Text m () -> m [T.Text]
+colNamesP opts src = either (const []) (tokenizeRow opts . fst) <$> P.next src
-- | Generate a type for a row of a table all of whose columns remain
-- unparsed 'Text' values.
tableTypesText' :: forall a. (ColumnTypeable a, Monoid a)
- => RowGen a -> FilePath -> DecsQ
-tableTypesText' (RowGen {..}) csvFile =
- do colNames <- runIO $ withFile csvFile ReadMode $ \h ->
- maybe (tokenizeRow opts <$> T.hGetLine h)
- pure
- (headerOverride opts)
+ => RowGen a -> DecsQ
+tableTypesText' (RowGen {..}) =
+ do colNames <- runIO . P.runSafeT $
+ maybe (colNamesP opts lineReader)
+ pure
+ (headerOverride opts)
let headers = zip colNames (repeat (inferType " "))
recTy <- tySynD (mkName rowTypeName) [] (recDec' headers)
let optsName = case rowTypeName of
@@ -424,9 +471,9 @@ tableTypesText' (RowGen {..}) csvFile =
-- @type Foo = "foo" :-> Int@, for example, @foo = rlens (Proxy ::
-- Proxy Foo)@, and @foo' = rlens' (Proxy :: Proxy Foo)@.
tableTypes' :: forall a. (ColumnTypeable a, Monoid a)
- => RowGen a -> FilePath -> DecsQ
-tableTypes' (RowGen {..}) csvFile =
- do headers <- runIO $ readColHeaders opts csvFile
+ => RowGen a -> DecsQ
+tableTypes' (RowGen {..}) =
+ do headers <- runIO . P.runSafeT $ readColHeaders opts lineSource
recTy <- tySynD (mkName rowTypeName) [] (recDec' headers)
let optsName = case rowTypeName of
[] -> error "Row type name shouldn't be empty"
@@ -441,6 +488,7 @@ tableTypes' (RowGen {..}) csvFile =
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')
+ lineSource = lineReader >-> P.take prefixSize
mkColDecs colNm colTy = do
let safeName = tablePrefix ++ (T.unpack . sanitizeTypeName $ colNm)
mColNm <- lookupTypeName safeName
@@ -465,5 +513,5 @@ produceCSV recs = do
writeCSV :: (ColumnHeaders ts, AsVinyl ts, Foldable f,
RecAll Identity (UnColumn ts) Show)
=> FilePath -> f (Record ts) -> IO ()
-writeCSV fp recs = withFile fp WriteMode $ \h ->
- P.runEffect $ produceCSV recs P.>-> P.toHandle h
+writeCSV fp recs = P.runSafeT . P.runEffect $
+ produceCSV recs >-> P.map T.pack >-> PT.writeFileLn fp
diff --git a/src/Frames/RecF.hs b/src/Frames/RecF.hs
index 82de417..5ec6315 100644
--- a/src/Frames/RecF.hs
+++ b/src/Frames/RecF.hs
@@ -16,13 +16,15 @@ module Frames.RecF (V.rappend, V.rtraverse, rdel, CanDelete,
frameCons, frameConsA, frameSnoc,
pattern (:&), pattern Nil, AllCols,
UnColumn, AsVinyl(..), mapMono, mapMethod,
+ runcurry, runcurry', runcurryA, runcurryA',
ShowRec, showRec, ColFun, ColumnHeaders,
columnHeaders, reifyDict) where
import Data.List (intercalate)
import Data.Proxy
import qualified Data.Vinyl as V
import Data.Vinyl (Rec(RNil), RecApplicative(rpure))
-import Data.Vinyl.Functor (Identity)
+import qualified Data.Vinyl.Curry as V
+import Data.Vinyl.Functor (Compose, Identity)
import Data.Vinyl.TypeLevel
import Frames.Col
import Frames.TypeLevel
@@ -127,6 +129,32 @@ mapMethod :: forall f c ts.
=> Proxy c -> (forall a. c a => a -> a) -> Rec f ts -> Rec f ts
mapMethod p f = fromVinyl . mapMethodV p f . toVinyl
+-- * Currying Adapted from "Vinyl.Curry"
+
+-- | N-ary version of 'uncurry' over functorial frame rows. See 'V.runcurry'.
+runcurry :: (Functor f, AsVinyl ts)
+ => V.CurriedF f (UnColumn ts) a -> Rec f ts -> a
+runcurry = (. toVinyl) . V.runcurry
+{-# INLINABLE runcurry #-}
+
+-- | N-ary version of 'uncurry' over pure frame rows. See 'V.runcurry''.
+runcurry' :: AsVinyl ts => V.Curried (UnColumn ts) a -> Rec Identity ts -> a
+runcurry' = (. toVinyl) . V.runcurry'
+{-# INLINABLE runcurry' #-}
+
+-- | Lift an N-ary function to work over a row of 'Applicative'
+-- computations. See 'V.runcurryA'.
+runcurryA' :: (Applicative f, AsVinyl ts)
+ => V.Curried (UnColumn ts) a -> Rec f ts -> f a
+runcurryA' = (. toVinyl) . V.runcurryA'
+
+-- | Lift an N-ary function over types in @g@ to work over a record of
+-- 'Compose'd 'Applicative' computations. A more general version of
+-- 'runcurryA''.
+runcurryA :: (Applicative f, Functor g, AsVinyl ts)
+ => V.CurriedF g (UnColumn ts) a -> Rec (Compose f g) ts -> f a
+runcurryA = (. toVinyl) . V.runcurryA
+
-- | A constraint that a field can be deleted from a record.
type CanDelete r rs = (V.RElem r rs (RIndex r rs), RDelete r rs V.⊆ rs)