summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBryanOSullivan <>2011-07-22 19:38:10 (GMT)
committerLuite Stegeman <luite@luite.com>2011-07-22 19:38:10 (GMT)
commit210aebabf37d2a8421ac0a65a0bb5bb2dd3b7b18 (patch)
tree9b780b8f84823ba3a0ae36a944a52e81b3cd8bb3
parent7194379f201db397920e5720f1ca9de2b997987f (diff)
version 0.9.0.00.9.0.0
-rw-r--r--Statistics/Distribution/Binomial.hs17
-rw-r--r--Statistics/Distribution/ChiSquared.hs8
-rw-r--r--Statistics/Distribution/Exponential.hs9
-rw-r--r--Statistics/Distribution/Gamma.hs11
-rw-r--r--Statistics/Distribution/Geometric.hs7
-rw-r--r--Statistics/Distribution/Hypergeometric.hs17
-rw-r--r--Statistics/Distribution/Normal.hs15
-rw-r--r--Statistics/Distribution/Poisson.hs7
-rw-r--r--Statistics/Function.hs25
-rw-r--r--Statistics/KernelDensity.hs2
-rw-r--r--Statistics/Resampling.hs68
-rw-r--r--Statistics/Resampling/Bootstrap.hs45
-rw-r--r--Statistics/Sample.hs8
-rw-r--r--statistics.cabal9
14 files changed, 178 insertions, 70 deletions
diff --git a/Statistics/Distribution/Binomial.hs b/Statistics/Distribution/Binomial.hs
index 38fc6ab..44ac89b 100644
--- a/Statistics/Distribution/Binomial.hs
+++ b/Statistics/Distribution/Binomial.hs
@@ -23,7 +23,6 @@ module Statistics.Distribution.Binomial
, bdProbability
) where
-import Control.Exception (assert)
import Data.Typeable (Typeable)
import qualified Statistics.Distribution as D
import Statistics.Math (choose)
@@ -49,7 +48,7 @@ instance D.Mean BinomialDistribution where
mean = mean
--- This could be slow for bin n
+-- This could be slow for big n
probability :: BinomialDistribution -> Int -> Double
probability (BD n p) k
| k < 0 || k > n = 0
@@ -77,12 +76,16 @@ variance :: BinomialDistribution -> Double
variance (BD n p) = fromIntegral n * p * (1 - p)
{-# INLINE variance #-}
--- | Construct binomial distribution
+-- | Construct binomial distribution. Number of trials must be
+-- positive and probability must be in [0,1] range
binomial :: Int -- ^ Number of trials.
-> Double -- ^ Probability.
-> BinomialDistribution
-binomial n p =
- assert (n > 0) .
- assert (p > 0 && p < 1) $
- BD n p
+binomial n p
+ | n <= 0 =
+ error $ msg ++ "number of trials must be positive. Got " ++ show n
+ | p < 0 || p > 1 =
+ error $ msg++"probability must be in [0,1] range. Got " ++ show p
+ | otherwise = BD n p
+ where msg = "Statistics.Distribution.Binomial.binomial: "
{-# INLINE binomial #-}
diff --git a/Statistics/Distribution/ChiSquared.hs b/Statistics/Distribution/ChiSquared.hs
index ca60be4..e5ab17c 100644
--- a/Statistics/Distribution/ChiSquared.hs
+++ b/Statistics/Distribution/ChiSquared.hs
@@ -34,9 +34,13 @@ chiSquaredNDF :: ChiSquared -> Int
chiSquaredNDF (ChiSquared ndf) = ndf
{-# INLINE chiSquaredNDF #-}
--- | Construct chi-squared distribution. Number of degrees of free
+-- | Construct chi-squared distribution. Number of degrees of freedom
+-- must be positive.
chiSquared :: Int -> ChiSquared
-chiSquared x = ChiSquared x
+chiSquared n
+ | n <= 0 = error $
+ "Statistics.Distribution.ChiSquared.chiSquared: N.D.F. must be positive. Got " ++ show n
+ | otherwise = ChiSquared n
{-# INLINE chiSquared #-}
instance D.Distribution ChiSquared where
diff --git a/Statistics/Distribution/Exponential.hs b/Statistics/Distribution/Exponential.hs
index d2d96df..978448f 100644
--- a/Statistics/Distribution/Exponential.hs
+++ b/Statistics/Distribution/Exponential.hs
@@ -61,14 +61,17 @@ quantile :: ExponentialDistribution -> Double -> Double
quantile (ED l) p = -log (1 - p) / l
{-# INLINE quantile #-}
--- | Create exponential distribution
+-- | Create an exponential distribution.
exponential :: Double -- ^ &#955; (scale) parameter.
-> ExponentialDistribution
-exponential = ED
+exponential l
+ | l <= 0 =
+ error $ "Statistics.Distribution.Exponential.exponential: scale parameter must be positive. Got " ++ show l
+ | otherwise = ED l
{-# INLINE exponential #-}
-- | Create exponential distribution from sample. No tests are made to
--- check whether it really exponential
+-- check whether it truly is exponential.
exponentialFromSample :: Sample -> ExponentialDistribution
exponentialFromSample = ED . S.mean
{-# INLINE exponentialFromSample #-}
diff --git a/Statistics/Distribution/Gamma.hs b/Statistics/Distribution/Gamma.hs
index b42ae36..27c6a54 100644
--- a/Statistics/Distribution/Gamma.hs
+++ b/Statistics/Distribution/Gamma.hs
@@ -35,8 +35,15 @@ data GammaDistribution = GD {
, gdScale :: {-# UNPACK #-} !Double -- ^ Scale parameter, &#977;.
} deriving (Eq, Read, Show, Typeable)
-gammaDistr :: Double -> Double -> GammaDistribution
-gammaDistr = GD
+-- | Create gamma distrivution. Both shape and scale parameters must be positive.
+gammaDistr :: Double -- ^ Shape parameter. /k/
+ -> Double -- ^ Scale parameter, &#977;.
+ -> GammaDistribution
+gammaDistr k theta
+ | k <= 0 = error $ msg ++ "shape must be positive. Got " ++ show k
+ | theta <= 0 = error $ msg ++ "scale must be positive. Got " ++ show theta
+ | otherwise = GD k theta
+ where msg = "Statistics.Distribution.Gamma.gammaDistr: "
{-# INLINE gammaDistr #-}
instance D.Distribution GammaDistribution where
diff --git a/Statistics/Distribution/Geometric.hs b/Statistics/Distribution/Geometric.hs
index 8b940d6..142d5ac 100644
--- a/Statistics/Distribution/Geometric.hs
+++ b/Statistics/Distribution/Geometric.hs
@@ -51,8 +51,11 @@ instance D.Mean GeometricDistribution where
-- | Create geometric distribution
geometric :: Double -- ^ Success rate
-> GeometricDistribution
-geometric x = assert (x >= 0 && x <= 1)
- GD x
+geometric x
+ | x < 0 || x > 1 =
+ error $ "Statistics.Distribution.Geometric.geometric: probability must be in [0,1] range. Got " ++ show x
+ | otherwise =
+ GD x
{-# INLINE geometric #-}
probability :: GeometricDistribution -> Int -> Double
diff --git a/Statistics/Distribution/Hypergeometric.hs b/Statistics/Distribution/Hypergeometric.hs
index d586d95..9e0cc31 100644
--- a/Statistics/Distribution/Hypergeometric.hs
+++ b/Statistics/Distribution/Hypergeometric.hs
@@ -27,9 +27,8 @@ module Statistics.Distribution.Hypergeometric
, hdK
) where
-import Control.Exception (assert)
-import Data.Typeable (Typeable)
-import Statistics.Math (choose)
+import Data.Typeable (Typeable)
+import Statistics.Math (choose)
import qualified Statistics.Distribution as D
data HypergeometricDistribution = HD {
@@ -66,11 +65,13 @@ hypergeometric :: Int -- ^ /m/
-> Int -- ^ /l/
-> Int -- ^ /k/
-> HypergeometricDistribution
-hypergeometric m l k =
- assert (m >= 0 && m <= l) .
- assert (l > 0) .
- assert (k > 0 && k <= l) $
- HD m l k
+hypergeometric m l k
+ | not (l > 0) = error $ msg ++ "l must be positive"
+ | not (m >= 0 && m <= l) = error $ msg ++ "m must lie in [0,l] range"
+ | not (k > 0 && k <= l) = error $ msg ++ "k must lie in (0,l] range"
+ | otherwise = HD m l k
+ where
+ msg = "Statistics.Distribution.Hypergeometric.hypergeometric: "
{-# INLINE hypergeometric #-}
-- Naive implementation
diff --git a/Statistics/Distribution/Normal.hs b/Statistics/Distribution/Normal.hs
index 1efaca2..4fd56be 100644
--- a/Statistics/Distribution/Normal.hs
+++ b/Statistics/Distribution/Normal.hs
@@ -20,7 +20,6 @@ module Statistics.Distribution.Normal
, standard
) where
-import Control.Exception (assert)
import Data.Number.Erf (erfc)
import Data.Typeable (Typeable)
import Statistics.Constants (m_sqrt_2, m_sqrt_2_pi)
@@ -60,12 +59,14 @@ standard = ND { mean = 0.0
normalDistr :: Double -- ^ Mean of distribution
-> Double -- ^ Variance of distribution
-> NormalDistribution
-normalDistr m v = assert (v > 0)
- ND { mean = m
- , variance = v
- , ndPdfDenom = m_sqrt_2_pi * sv
- , ndCdfDenom = m_sqrt_2 * sv
- }
+normalDistr m v
+ | v <= 0 =
+ error $ "Statistics.Distribution.Normal.normalDistr: variance must be positive. Got " ++ show v
+ | otherwise = ND { mean = m
+ , variance = v
+ , ndPdfDenom = m_sqrt_2_pi * sv
+ , ndCdfDenom = m_sqrt_2 * sv
+ }
where sv = sqrt v
-- | Create distribution using parameters estimated from
diff --git a/Statistics/Distribution/Poisson.hs b/Statistics/Distribution/Poisson.hs
index d138c82..d3636a2 100644
--- a/Statistics/Distribution/Poisson.hs
+++ b/Statistics/Distribution/Poisson.hs
@@ -45,9 +45,12 @@ instance D.Mean PoissonDistribution where
mean = poissonLambda
{-# INLINE mean #-}
--- | Create po
+-- | Create poisson distribution.
poisson :: Double -> PoissonDistribution
-poisson = PD
+poisson l
+ | l <= 0 =
+ error $ "Statistics.Distribution.Poisson.poisson: lambda must be positive. Got " ++ show l
+ | otherwise = PD l
{-# INLINE poisson #-}
probability :: PoissonDistribution -> Int -> Double
diff --git a/Statistics/Function.hs b/Statistics/Function.hs
index 2fca6fb..33999ab 100644
--- a/Statistics/Function.hs
+++ b/Statistics/Function.hs
@@ -1,8 +1,7 @@
-{-# LANGUAGE Rank2Types #-}
-{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE CPP, FlexibleContexts, Rank2Types #-}
-- |
-- Module : Statistics.Function
--- Copyright : (c) 2009, 2010 Bryan O'Sullivan
+-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan
-- License : BSD3
--
-- Maintainer : bos@serpentine.com
@@ -18,12 +17,16 @@ module Statistics.Function
, partialSort
, indexed
, indices
+ , nextHighestPowerOfTwo
-- * Vector setup
, create
) where
+#include "MachDeps.h"
+
import Control.Exception (assert)
import Control.Monad.Primitive (PrimMonad)
+import Data.Bits ((.|.), shiftR)
import Data.Vector.Generic (modify, unsafeFreeze)
import qualified Data.Vector.Algorithms.Intro as I
import qualified Data.Vector.Generic as G
@@ -74,3 +77,19 @@ create size itemAt = assert (size >= 0) $
M.write arr k r
loop (k+1) arr
{-# INLINE create #-}
+
+-- | Efficiently compute the next highest power of two for a
+-- non-negative integer. If the given value is already a power of
+-- two, it is returned unchanged. If negative, zero is returned.
+nextHighestPowerOfTwo :: Int -> Int
+nextHighestPowerOfTwo n = o + 1
+ where m = n - 1
+ o = m
+ .|. (m `shiftR` 1)
+ .|. (m `shiftR` 2)
+ .|. (m `shiftR` 4)
+ .|. (m `shiftR` 8)
+ .|. (m `shiftR` 16)
+#if WORD_SIZE_IN_BITS == 64
+ .|. (m `shiftR` 32)
+#endif
diff --git a/Statistics/KernelDensity.hs b/Statistics/KernelDensity.hs
index ed5c724..3b24f99 100644
--- a/Statistics/KernelDensity.hs
+++ b/Statistics/KernelDensity.hs
@@ -115,7 +115,7 @@ epanechnikovKernel f h p v
gaussianKernel :: Kernel
gaussianKernel f h p v = exp (-0.5 * u * u) * g
where u = (v - p) / h
- g = f * m_2_sqrt_pi * m_1_sqrt_2
+ g = f * 0.5 * m_2_sqrt_pi * m_1_sqrt_2
-- | Kernel density estimator, providing a non-parametric way of
-- estimating the PDF of a random variable.
diff --git a/Statistics/Resampling.hs b/Statistics/Resampling.hs
index c4d40d6..961eec1 100644
--- a/Statistics/Resampling.hs
+++ b/Statistics/Resampling.hs
@@ -1,3 +1,5 @@
+{-# LANGUAGE BangPatterns #-}
+
-- |
-- Module : Statistics.Resampling
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
@@ -16,14 +18,16 @@ module Statistics.Resampling
, resample
) where
-import Control.Monad (forM_, liftM)
+import Control.Concurrent (forkIO, newChan, readChan, writeChan)
+import Control.Monad (forM_, liftM, replicateM_)
import Control.Monad.Primitive (PrimMonad, PrimState)
import Data.Vector.Algorithms.Intro (sort)
import Data.Vector.Generic (unsafeFreeze)
-import Data.Vector.Unboxed ((!))
-import Statistics.Function (create, indexed, indices)
+import Data.Word (Word32)
+import GHC.Conc (numCapabilities)
+import Statistics.Function (create, indices)
import Statistics.Types (Estimator, Sample)
-import System.Random.MWC (Gen, uniform)
+import System.Random.MWC (Gen, initialize, uniform, uniformVector)
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Unboxed.Mutable as MU
@@ -34,34 +38,54 @@ newtype Resample = Resample {
fromResample :: U.Vector Double
} deriving (Eq, Show)
--- | Resample a data set repeatedly, with replacement, computing each
--- estimate over the resampled data.
-resample :: (PrimMonad m) => Gen (PrimState m) -> [Estimator] -> Int -> Sample -> m [Resample]
+-- | /O(e*r*s)/ Resample a data set repeatedly, with replacement,
+-- computing each estimate over the resampled data.
+--
+-- This function is expensive; it has to do work proportional to
+-- /e*r*s/, where /e/ is the number of estimation functions, /r/ is
+-- the number of resamples to compute, and /s/ is the number of
+-- original samples.
+--
+-- To improve performance, this function will make use of all
+-- available CPUs. At least with GHC 7.0, parallel performance seems
+-- best if the parallel garbage collector is disabled (RTS option
+-- @-qg@).
+resample :: Gen (PrimState IO)
+ -> [Estimator] -- ^ Estimation functions.
+ -> Int -- ^ Number of resamples to compute.
+ -> Sample -- ^ Original sample.
+ -> IO [Resample]
resample gen ests numResamples samples = do
+ let !numSamples = U.length samples
+ ixs = scanl (+) 0 $
+ zipWith (+) (replicate numCapabilities q)
+ (replicate r 1 ++ repeat 0)
+ where (q,r) = numResamples `quotRem` numCapabilities
results <- mapM (const (MU.new numResamples)) $ ests
- loop 0 (zip ests results)
+ done <- newChan
+ forM_ (zip ixs (tail ixs)) $ \ (start,!end) -> do
+ gen' <- initialize =<< (uniformVector gen 256 :: IO (U.Vector Word32))
+ forkIO $ do
+ let loop k ers | k >= end = writeChan done ()
+ | otherwise = do
+ re <- create numSamples $ \_ -> do
+ r <- uniform gen'
+ return (U.unsafeIndex samples (r `mod` numSamples))
+ forM_ ers $ \(est,arr) ->
+ MU.write arr k . est $ re
+ loop (k+1) ers
+ loop start (zip ests results)
+ replicateM_ numCapabilities $ readChan done
mapM_ sort results
mapM (liftM Resample . unsafeFreeze) results
- where
- loop k ers | k >= numResamples = return ()
- | otherwise = do
- re <- create n $ \_ -> do
- r <- uniform gen
- return (samples ! (abs r `mod` n))
- forM_ ers $ \(est,arr) ->
- MU.write arr k . est $ re
- loop (k+1) ers
- n = U.length samples
-{-# INLINE resample #-}
-- | Compute a statistical estimate repeatedly over a sample, each
-- time omitting a successive element.
jackknife :: Estimator -> Sample -> U.Vector Double
jackknife est sample = U.map f . indices $ sample
where f i = est (dropAt i sample)
-{-# INLINE jackknife #-}
+{- INLINE jackknife #-}
-- | Drop the /k/th element of a vector.
dropAt :: U.Unbox e => Int -> U.Vector e -> U.Vector e
-dropAt n = U.map snd . U.filter notN . indexed
- where notN (i , _) = i /= n
+dropAt n v = U.slice 0 n v U.++ U.slice (n+1) (U.length v - n - 1) v
diff --git a/Statistics/Resampling/Bootstrap.hs b/Statistics/Resampling/Bootstrap.hs
index 22ed7dd..b0b3c84 100644
--- a/Statistics/Resampling/Bootstrap.hs
+++ b/Statistics/Resampling/Bootstrap.hs
@@ -1,6 +1,8 @@
+{-# LANGUAGE DeriveDataTypeable, OverloadedStrings, RecordWildCards #-}
+
-- |
-- Module : Statistics.Resampling.Bootstrap
--- Copyright : (c) 2009 Bryan O'Sullivan
+-- Copyright : (c) 2009, 2011 Bryan O'Sullivan
-- License : BSD3
--
-- Maintainer : bos@serpentine.com
@@ -13,11 +15,18 @@ module Statistics.Resampling.Bootstrap
(
Estimate(..)
, bootstrapBCA
+ , scale
-- * References
-- $references
) where
+import Control.Applicative ((<$>), (<*>), empty)
+import Control.DeepSeq (NFData)
import Control.Exception (assert)
+import Control.Monad.Par (runPar, parMap)
+import Data.Aeson.Types
+import Data.Data (Data)
+import Data.Typeable (Typeable)
import Data.Vector.Unboxed ((!))
import Statistics.Distribution (cumulative, quantile)
import Statistics.Distribution.Normal
@@ -38,7 +47,35 @@ data Estimate = Estimate {
-- the confidence interval).
, estConfidenceLevel :: {-# UNPACK #-} !Double
-- ^ Confidence level of the confidence intervals.
- } deriving (Eq, Show)
+ } deriving (Eq, Show, Typeable, Data)
+
+instance NFData Estimate
+
+instance ToJSON Estimate where
+ toJSON Estimate{..} = object [
+ "estPoint" .= estPoint
+ , "estLowerBound" .= estLowerBound
+ , "estUpperBound" .= estUpperBound
+ , "estConfidenceLevel" .= estConfidenceLevel
+ ]
+
+instance FromJSON Estimate where
+ parseJSON (Object v) = Estimate <$>
+ v .: "estPoint" <*>
+ v .: "estLowerBound" <*>
+ v .: "estUpperBound" <*>
+ v .: "estConfidenceLevel"
+ parseJSON _ = empty
+
+-- | Multiply the point, lower bound, and upper bound in an 'Estimate'
+-- by the given value.
+scale :: Double -- ^ Value to multiply by.
+ -> Estimate -> Estimate
+scale f e@Estimate{..} = e {
+ estPoint = f * estPoint
+ , estLowerBound = f * estLowerBound
+ , estUpperBound = f * estUpperBound
+ }
estimate :: Double -> Double -> Double -> Double -> Estimate
estimate pt lb ub cl =
@@ -60,9 +97,9 @@ bootstrapBCA :: Double -- ^ Confidence level
-> [Estimator] -- ^ Estimators
-> [Resample] -- ^ Resampled data
-> [Estimate]
-bootstrapBCA confidenceLevel sample =
+bootstrapBCA confidenceLevel sample estimators resamples =
assert (confidenceLevel > 0 && confidenceLevel < 1)
- zipWith e
+ runPar $ parMap (uncurry e) (zip estimators resamples)
where
e est (Resample resample)
| U.length sample == 1 = estimate pt pt pt confidenceLevel
diff --git a/Statistics/Sample.hs b/Statistics/Sample.hs
index 8f80b13..b380174 100644
--- a/Statistics/Sample.hs
+++ b/Statistics/Sample.hs
@@ -60,6 +60,8 @@ import qualified Data.Vector.Generic as G
-- Operator ^ will be overriden
import Prelude hiding ((^))
+-- | Range. The difference between the largest and smallest elements
+-- of a sample.
range :: (G.Vector v Double) => v Double -> Double
range s = hi - lo
where (lo , hi) = minMax s
@@ -203,11 +205,9 @@ kurtosis xs = c4 / (c2 * c2) - 3
data V = V {-# UNPACK #-} !Double {-# UNPACK #-} !Double
-sqr :: Double -> Double
-sqr x = x * x
-
robustSumVar :: (G.Vector v Double) => Double -> v Double -> Double
-robustSumVar m samp = G.sum . G.map (sqr . subtract m) $ samp
+robustSumVar m samp = G.sum . G.map (square . subtract m) $ samp
+ where square x = x * x
{-# INLINE robustSumVar #-}
-- | Maximum likelihood estimate of a sample's variance. Also known
diff --git a/statistics.cabal b/statistics.cabal
index a16a4b8..c8a1657 100644
--- a/statistics.cabal
+++ b/statistics.cabal
@@ -1,5 +1,5 @@
name: statistics
-version: 0.8.0.5
+version: 0.9.0.0
synopsis: A library of statistical types, data, and functions
description:
This library provides a number of common functions and types useful
@@ -26,7 +26,7 @@ homepage: http://bitbucket.org/bos/statistics
bug-reports: http://bitbucket.org/bos/statistics/issues
author: Bryan O'Sullivan <bos@serpentine.com>
maintainer: Bryan O'Sullivan <bos@serpentine.com>
-copyright: 2009, 2010 Bryan O'Sullivan
+copyright: 2009, 2010, 2011 Bryan O'Sullivan
category: Math, Statistics
build-type: Simple
cabal-version: >= 1.6
@@ -58,9 +58,12 @@ library
other-modules:
Statistics.Internal
build-depends:
+ aeson,
base < 5,
+ deepseq >= 1.1.0.2,
erf,
- mwc-random >= 0.8.0.3,
+ monad-par >= 0.1.0.1,
+ mwc-random >= 0.8.0.5,
primitive >= 0.3,
time,
vector >= 0.7.0.0,