summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOzgunAtaman <>2011-06-29 01:28:47 (GMT)
committerLuite Stegeman <luite@luite.com>2011-06-29 01:28:47 (GMT)
commit1d6e81527d47f6269ffae270d19a0a9218a46387 (patch)
tree927aca62df24df2aff639779ec35b279ae1cb245
parent3cf9384b2a7083685e7a4463a40376800cb2a713 (diff)
version 0.9.00.9.0
-rw-r--r--README.markdown5
-rw-r--r--csv-enumerator.cabal2
-rw-r--r--src/Data/CSV/Enumerator.hs137
3 files changed, 97 insertions, 47 deletions
diff --git a/README.markdown b/README.markdown
index 3c3a54b..01f19da 100644
--- a/README.markdown
+++ b/README.markdown
@@ -81,11 +81,14 @@ Further examples to be provided at a later time.
### TODO - Next Steps
+* Refactor all operations to use iterCSV as the basic building block --
+ in progress.
+* The CSVeable typeclass can be refactored to have a more minimal definition.
+* Get mapCSVFiles out of the typeclass if possible.
* Need to think about specializing an Exception type for the library and
properly notifying the user when parsing-related problems occur.
* Some operations can be further broken down to their atoms, increasing the
flexibility of the library.
-* The CSVeable typeclass can be refactored to have a more minimal definition.
* Operating on Text in addition to ByteString would be phenomenal.
* A test-suite needs to be added.
* Some benchmarking would be nice.
diff --git a/csv-enumerator.cabal b/csv-enumerator.cabal
index 4795641..88f27d5 100644
--- a/csv-enumerator.cabal
+++ b/csv-enumerator.cabal
@@ -1,5 +1,5 @@
Name: csv-enumerator
-Version: 0.8.2
+Version: 0.9.0
Synopsis: A flexible, fast, enumerator-based CSV parser library for Haskell.
Homepage: http://github.com/ozataman/csv-enumerator
License: BSD3
diff --git a/src/Data/CSV/Enumerator.hs b/src/Data/CSV/Enumerator.hs
index 63ef42f..daf773b 100644
--- a/src/Data/CSV/Enumerator.hs
+++ b/src/Data/CSV/Enumerator.hs
@@ -10,7 +10,6 @@ module Data.CSV.Enumerator
, MapRow
, CSVeable(..)
-
, ParsedRow(..)
  -- * CSV Settings
@@ -18,16 +17,23 @@ module Data.CSV.Enumerator
, defCSVSettings
-- * Reading / Writing CSV Files
+ -- | These are some simple file-related operations for basic use cases.
, readCSVFile
, writeCSVFile
, appendCSVFile
- -- * Folding Over CSV Files
- -- | These enumerators generalize the map* family of functions with a running accumulator.
+ -- * Generic Folds Over CSV Files
+ -- | These operations enable you to do whatever you want with CSV files;
+ -- including interleaved IO, etc.
+ , foldCSVFile
, CSVAction
, funToIter
, funToIterIO
+ -- * Mapping Over CSV Files
+ , mapCSVFile
+ , mapIntoHandle
+
-- * Primitive Iteratees
, collectRows
, outputRowIter
@@ -44,7 +50,7 @@ where
import Control.Applicative hiding (many)
import Control.Exception (bracket, SomeException)
-import Control.Monad (mzero, mplus, foldM, when)
+import Control.Monad (mzero, mplus, foldM, when, liftM)
import Control.Monad.IO.Class (liftIO, MonadIO)
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
@@ -57,7 +63,6 @@ import System.PosixCompat.Files (getFileStatus, fileSize)
import Data.Attoparsec as P hiding (take)
import qualified Data.Attoparsec.Char8 as C8
--- import Data.Attoparsec.Enum
import Data.Attoparsec.Enumerator
import qualified Data.Enumerator as E
import Data.Enumerator (($$), yield, continue)
@@ -81,40 +86,14 @@ class CSVeable r where
-> a
-> E.Iteratee B.ByteString IO a
- -- | Iteratee to push rows into a given file
- fileSink :: CSVSettings
- -> FilePath
- -> (Maybe Handle, Int)
- -> ParsedRow r
- -> E.Iteratee B.ByteString IO (Maybe Handle, Int)
-
- -- | Open & fold over the CSV file. Processing starts on row 2 for MapRow
- -- instance to use first row as column headers.
- foldCSVFile :: FilePath -- ^ File to open as a CSV file
- -> CSVSettings -- ^ CSV settings to use on the input file
- -> CSVAction r a -- ^ Fold action
- -> a -- ^ Initial accumulator
- -> IO (Either SomeException a) -- ^ Error or the resulting accumulator
-
- -- | Take a CSV file, apply function to each of its rows and save the
- -- resulting rows into a new file.
- --
- -- Each row is simply a list of fields.
- mapCSVFile :: FilePath -- ^ Input file
- -> CSVSettings -- ^ CSV Settings
- -> (r -> [r]) -- ^ A function to map a row onto rows
- -> FilePath -- ^ Output file
- -> IO (Either SomeException Int) -- ^ Number of rows processed
- mapCSVFile fi s f fo = do
- res <- foldCSVFile fi s iter (Nothing, 0)
- return $ snd `fmap` res
- where
- iter !acc (ParsedRow (Just !r)) = foldM chain acc (f r)
- iter !acc x = fileSink s fo acc x
-
- chain !acc !r = singleSink r acc
- singleSink !x !acc = fileSink s fo acc (ParsedRow (Just x))
+ -- | Iteratee to push rows into a given file
+ fileSink
+ :: CSVSettings
+ -> FilePath
+ -> (Maybe Handle, Int)
+ -> ParsedRow r
+ -> E.Iteratee B.ByteString IO (Maybe Handle, Int)
----------------------------------------------------------------------------
@@ -152,11 +131,6 @@ instance CSVeable Row where
comboIter acc' = procRow acc' >>= loop
- foldCSVFile fp csvs f acc = E.run iter
- where
- iter = enumFile fp $$ iterCSV csvs f acc
-
-
fileSink csvs fo = iter
where
iter :: (Maybe Handle, Int)
@@ -237,8 +211,6 @@ instance CSVeable MapRow where
toMapCSV !headers !fs = yield (fs >>= (Just . M.fromList . zip headers)) (E.Chunks [])
- foldCSVFile fp csvs f !acc = E.run (enumFile fp $$ iterCSV csvs f acc)
-
fileSink s fo = mapIter
where
@@ -299,6 +271,42 @@ instance CSVeable MapRow where
------------------------------------------------------------------------------
+-- | Open & fold over the CSV file.
+--
+-- Processing starts on row 2 for MapRow instance to use first row as column
+-- headers.
+foldCSVFile
+ :: (CSVeable r)
+ => FilePath -- ^ File to open as a CSV file
+ -> CSVSettings -- ^ CSV settings to use on the input file
+ -> CSVAction r a -- ^ Fold action
+ -> a -- ^ Initial accumulator
+ -> IO (Either SomeException a) -- ^ Error or the resulting accumulator
+foldCSVFile fp csvs f acc = E.run (enumFile fp $$ iterCSV csvs f acc)
+
+
+------------------------------------------------------------------------------
+-- | Take a CSV file, apply function to each of its rows and save the
+-- resulting rows into a new file.
+--
+-- Each row is simply a list of fields.
+mapCSVFile
+ :: (CSVeable r)
+ => FilePath -- ^ Input file
+ -> CSVSettings -- ^ CSV Settings
+ -> (r -> [r]) -- ^ A function to map a row onto rows
+ -> FilePath -- ^ Output file
+ -> IO (Either SomeException Int) -- ^ Number of rows processed
+mapCSVFile fi s f fo = do
+ res <- foldCSVFile fi s iter (Nothing, 0)
+ return $ snd `fmap` res
+ where
+ iter !acc (ParsedRow (Just !r)) = foldM chain acc (f r)
+ iter !acc x = fileSink s fo acc x
+ chain !acc !r = fileSink s fo acc (ParsedRow (Just r))
+
+
+------------------------------------------------------------------------------
readCSVFile :: (CSVeable r) => CSVSettings -- ^ CSV settings
-> FilePath -- ^ FilePath
-> IO (Either SomeException [r]) -- ^ Collected data
@@ -431,6 +439,45 @@ funToIter f = iterf
iterf !acc r = yield (f acc r) (E.Chunks [])
+
+------------------------------------------------------------------------------
+-- | Create an iteratee that can map over a CSV stream and output results to
+-- a handle in an interleaved fashion.
+--
+-- Example use: Let's map over a CSV file coming in through 'stdin' and push
+-- results to 'stdout'.
+--
+-- > f r = return [r] -- a function that just returns the given row
+--
+-- > E.run (E.enumHandle 4096 stdin $$ mapIntoHandle defCSVSettings True stdout f)
+--
+-- This nicely allows us to do things like (assuming you have pv installed):
+--
+-- > pv inputFile.csv | myApp > output.csv
+--
+-- And monitor the ongoing progress of processing.
+mapIntoHandle
+ :: (CSVeable r)
+ => CSVSettings -- ^ 'CSVSettings'
+ -> Bool -- ^ Whether to write headers
+ -> Handle -- ^ Handle to stream results
+ -> (r -> IO [r]) -- ^ Map function
+ -> E.Iteratee ByteString IO Int -- ^ Resulting Iteratee
+mapIntoHandle csvs outh h f = do
+ snd `liftM` iterCSV csvs (funToIterIO f') (False,0)
+ where
+ f' acc EOF = return acc
+ f' acc (ParsedRow Nothing) = return acc
+ f' (False, _) r'@(ParsedRow (Just r)) = do
+ rs <- f r
+ when outh $ writeHeaders csvs h rs
+ f' (True, 0) r'
+ f' (True, !i) (ParsedRow (Just r)) = do
+ rs <- f r
+ outputRows csvs h rs
+ return (True, i+1)
+
+
------------------------------------------------------------------------------
-- | Just collect all rows into an array. This will cancel out the incremental
-- nature of this library.