summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBryanOSullivan <>2011-11-11 04:37:59 (GMT)
committerhdiff <hdiff@luite.com>2011-11-11 04:37:59 (GMT)
commiteebd81440ef5aab6214cc5c97b83fb0ab6565a56 (patch)
tree5d4fcdf2f5f1dcf5163d776106fa522e8896ae6c
parent210aebabf37d2a8421ac0a65a0bb5bb2dd3b7b18 (diff)
version 0.10.0.00.10.0.0
-rw-r--r--Statistics/Constants.hs9
-rw-r--r--Statistics/Distribution.hs55
-rw-r--r--Statistics/Distribution/Binomial.hs11
-rw-r--r--Statistics/Distribution/CauchyLorentz.hs65
-rw-r--r--Statistics/Distribution/ChiSquared.hs22
-rw-r--r--Statistics/Distribution/Exponential.hs29
-rw-r--r--Statistics/Distribution/FDistribution.hs80
-rw-r--r--Statistics/Distribution/Gamma.hs41
-rw-r--r--Statistics/Distribution/Geometric.hs17
-rw-r--r--Statistics/Distribution/Hypergeometric.hs25
-rw-r--r--Statistics/Distribution/Normal.hs54
-rw-r--r--Statistics/Distribution/Poisson.hs41
-rw-r--r--Statistics/Distribution/Poisson/Internal.hs32
-rw-r--r--Statistics/Distribution/StudentT.hs71
-rw-r--r--Statistics/Distribution/Uniform.hs62
-rw-r--r--Statistics/Function.hs35
-rw-r--r--Statistics/Function/Comparison.hs40
-rw-r--r--Statistics/Math.hs281
-rw-r--r--Statistics/Math/RootFinding.hs127
-rw-r--r--Statistics/Quantile.hs7
-rw-r--r--Statistics/Resampling.hs6
-rw-r--r--Statistics/Resampling/Bootstrap.hs18
-rw-r--r--Statistics/Sample.hs16
-rw-r--r--Statistics/Sample/Histogram.hs97
-rw-r--r--Statistics/Sample/KernelDensity.hs108
-rw-r--r--Statistics/Sample/KernelDensity/Simple.hs (renamed from Statistics/KernelDensity.hs)27
-rw-r--r--Statistics/Test/Internal.hs46
-rw-r--r--Statistics/Test/MannWhitneyU.hs238
-rw-r--r--Statistics/Test/NonParametric.hs323
-rw-r--r--Statistics/Test/Types.hs27
-rw-r--r--Statistics/Test/WilcoxonT.hs182
-rw-r--r--Statistics/Transform.hs108
-rw-r--r--examples/kde/KDE.hs23
-rw-r--r--examples/kde/data/faithful.csv273
-rw-r--r--examples/kde/kde.html28
-rw-r--r--examples/kde/kde.tpl28
-rw-r--r--statistics.cabal162
-rw-r--r--tests/tests.hs13
38 files changed, 2357 insertions, 470 deletions
diff --git a/Statistics/Constants.hs b/Statistics/Constants.hs
index fc8f735..460b514 100644
--- a/Statistics/Constants.hs
+++ b/Statistics/Constants.hs
@@ -1,6 +1,6 @@
-- |
-- Module : Statistics.Constants
--- Copyright : (c) 2009 Bryan O'Sullivan
+-- Copyright : (c) 2009, 2011 Bryan O'Sullivan
-- License : BSD3
--
-- Maintainer : bos@serpentine.com
@@ -13,6 +13,7 @@ module Statistics.Constants
(
m_epsilon
, m_huge
+ , m_tiny
, m_1_sqrt_2
, m_2_sqrt_pi
, m_ln_sqrt_2_pi
@@ -29,6 +30,10 @@ m_huge :: Double
m_huge = 1.7976931348623157e308
{-# INLINE m_huge #-}
+m_tiny :: Double
+m_tiny = 2.2250738585072014e-308
+{-# INLINE m_tiny #-}
+
-- | The largest 'Int' /x/ such that 2**(/x/-1) is approximately
-- representable as a 'Double'.
m_max_exp :: Int
@@ -59,7 +64,7 @@ m_epsilon :: Double
m_epsilon = encodeFloat (signif+1) expo - 1.0
where (signif,expo) = decodeFloat (1.0::Double)
--- | @log(sqrt((2*pi)) / 2@
+-- | @log(sqrt((2*pi))@
m_ln_sqrt_2_pi :: Double
m_ln_sqrt_2_pi = 0.9189385332046727417803297364056176398613974736377834128171
{-# INLINE m_ln_sqrt_2_pi #-}
diff --git a/Statistics/Distribution.hs b/Statistics/Distribution.hs
index f4e312a..858823b 100644
--- a/Statistics/Distribution.hs
+++ b/Statistics/Distribution.hs
@@ -16,15 +16,21 @@ module Statistics.Distribution
Distribution(..)
, DiscreteDistr(..)
, ContDistr(..)
+ -- ** Distribution statistics
+ , MaybeMean(..)
, Mean(..)
+ , MaybeVariance(..)
, Variance(..)
-- * Helper functions
, findRoot
, sumProbabilities
) where
+import Control.Applicative ((<$>), Applicative(..))
import qualified Data.Vector.Unboxed as U
+
+
-- | Type class common to all distributions. Only the c.d.f. can be
-- defined for both discrete and continuous distributions.
class Distribution d where
@@ -33,6 +39,16 @@ class Distribution d where
-- i.e. P(/X/&#8804;/x/).
cumulative :: d -> Double -> Double
+ -- | One's complement of the cumulative distribution:
+ --
+ -- > complCumulative d x = 1 - cumulative d x
+ --
+ -- It's useful when one is interested in P(/X/&#8805;/x/) and the
+ -- expression on the right-hand side begins to lose precision. This
+ -- function has a default implementation, but implementors are
+ -- encouraged to provide a more precise one.
+ complCumulative :: d -> Double -> Double
+ complCumulative d x = 1 - cumulative d x
-- | Discrete probability distribution.
class Distribution d => DiscreteDistr d where
@@ -48,18 +64,47 @@ class Distribution d => ContDistr d where
density :: d -> Double -> Double
-- | Inverse of the cumulative distribution function. The value
- -- /x/ for which P(/X/&#8804;/x/) = /p/.
+ -- /x/ for which P(/X/&#8804;/x/) = /p/. If probability is outside
+ -- of [0,1] range function should call 'error'
quantile :: d -> Double -> Double
--- | Type class for distributions with mean.
-class Distribution d => Mean d where
+
+-- | Type class for distributions with mean. 'maybeMean' should return
+-- 'Nothing' if the mean is undefined for the current parameter values.
+class Distribution d => MaybeMean d where
+ maybeMean :: d -> Maybe Double
+
+-- | Type class for distributions with mean. If a distribution has a
+-- finite mean for all valid parameter values, it should be an
+-- instance of this type class.
+class MaybeMean d => Mean d where
mean :: d -> Double
--- | Type class for distributions with variance.
-class Mean d => Variance d where
+
+-- | Type class for distributions with variance. If the variance is
+-- undefined for some parameter values, both 'maybeVariance' and
+-- 'maybeStdDev' should return 'Nothing'.
+--
+-- Minimal complete definition is 'maybeVariance' or 'maybeStdDev'
+class MaybeMean d => MaybeVariance d where
+ maybeVariance :: d -> Maybe Double
+ maybeVariance d = (*) <$> x <*> x where x = maybeStdDev d
+ maybeStdDev :: d -> Maybe Double
+ maybeStdDev = fmap sqrt . maybeVariance
+
+-- | Type class for distributions with variance. If a distribution has
+-- a finite variance for all valid parameter values, it should be an
+-- instance of this type class.
+--
+-- Minimal complete definition is 'variance' or 'stdDev'
+class (Mean d, MaybeVariance d) => Variance d where
variance :: d -> Double
+ variance d = x * x where x = stdDev d
+ stdDev :: d -> Double
+ stdDev = sqrt . variance
+
data P = P {-# UNPACK #-} !Double {-# UNPACK #-} !Double
diff --git a/Statistics/Distribution/Binomial.hs b/Statistics/Distribution/Binomial.hs
index 44ac89b..394477e 100644
--- a/Statistics/Distribution/Binomial.hs
+++ b/Statistics/Distribution/Binomial.hs
@@ -41,11 +41,18 @@ instance D.Distribution BinomialDistribution where
instance D.DiscreteDistr BinomialDistribution where
probability = probability
+instance D.Mean BinomialDistribution where
+ mean = mean
+
instance D.Variance BinomialDistribution where
variance = variance
-instance D.Mean BinomialDistribution where
- mean = mean
+instance D.MaybeMean BinomialDistribution where
+ maybeMean = Just . D.mean
+
+instance D.MaybeVariance BinomialDistribution where
+ maybeStdDev = Just . D.stdDev
+ maybeVariance = Just . D.variance
-- This could be slow for big n
diff --git a/Statistics/Distribution/CauchyLorentz.hs b/Statistics/Distribution/CauchyLorentz.hs
new file mode 100644
index 0000000..6787d35
--- /dev/null
+++ b/Statistics/Distribution/CauchyLorentz.hs
@@ -0,0 +1,65 @@
+{-# LANGUAGE DeriveDataTypeable #-}
+-- |
+-- Module : Statistics.Distribution.CauchyLorentz
+-- Copyright : (c) 2011 Aleksey Khudyakov
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- The Cauchy-Lorentz distribution. It's also known as Lorentz
+-- distribution or Breit–Wigner distribution.
+--
+-- It has neither a mean nor a variance.
+module Statistics.Distribution.CauchyLorentz (
+ CauchyDistribution
+ , cauchyDistribMedian
+ , cauchyDistribScale
+ -- * Constructors
+ , cauchyDistribution
+ , standardCauchy
+ ) where
+
+import Data.Typeable (Typeable)
+import qualified Statistics.Distribution as D
+
+
+-- | Cauchy-Lorentz distribution.
+data CauchyDistribution = CD {
+ -- | Central value of Cauchy-Lorentz distribution which is its
+ -- mode and median. Distribution doesn't have mean so function
+ -- is named after median.
+ cauchyDistribMedian :: {-# UNPACK #-} !Double
+ -- | Scale parameter of Cauchy-Lorentz distribution. It's
+ -- different from variance and specify half width at half
+ -- maximum (HWHM).
+ , cauchyDistribScale :: {-# UNPACK #-} !Double
+ }
+ deriving (Eq,Show,Read,Typeable)
+
+-- | Cauchy distribution
+cauchyDistribution :: Double -- ^ Central point
+ -> Double -- ^ Scale parameter (FWHM)
+ -> CauchyDistribution
+cauchyDistribution m s
+ | s > 0 = CD m s
+ | otherwise =
+ error $ "Statistics.Distribution.CauchyLorentz.cauchyDistribution: FWHM must be positive. Got " ++ show s
+
+standardCauchy :: CauchyDistribution
+standardCauchy = CD 0 1
+
+
+instance D.Distribution CauchyDistribution where
+ cumulative (CD m s) x = 0.5 + atan( (x - m) / s ) / pi
+
+instance D.ContDistr CauchyDistribution where
+ density (CD m s) x = (1 / pi) / (s * (1 + y*y))
+ where y = (x - m) / s
+ quantile (CD m s) p
+ | p > 0 && p < 1 = m + s * tan( pi * (p - 0.5) )
+ | p == 0 = -1 / 0
+ | p == 1 = 1 / 0
+ | otherwise =
+ error $ "Statistics.Distribution.CauchyLorentz.quantile: p must be in [0,1] range. Got: "++show p
diff --git a/Statistics/Distribution/ChiSquared.hs b/Statistics/Distribution/ChiSquared.hs
index e5ab17c..de0849d 100644
--- a/Statistics/Distribution/ChiSquared.hs
+++ b/Statistics/Distribution/ChiSquared.hs
@@ -18,9 +18,8 @@ module Statistics.Distribution.ChiSquared (
, chiSquaredNDF
) where
-import Data.Typeable (Typeable)
-import Statistics.Constants (m_huge)
-import Statistics.Math (incompleteGamma,logGamma)
+import Data.Typeable (Typeable)
+import Statistics.Math (incompleteGamma,invIncompleteGamma,logGamma)
import qualified Statistics.Distribution as D
@@ -58,6 +57,13 @@ instance D.Variance ChiSquared where
variance (ChiSquared ndf) = fromIntegral (2*ndf)
{-# INLINE variance #-}
+instance D.MaybeMean ChiSquared where
+ maybeMean = Just . D.mean
+
+instance D.MaybeVariance ChiSquared where
+ maybeStdDev = Just . D.stdDev
+ maybeVariance = Just . D.variance
+
cumulative :: ChiSquared -> Double -> Double
cumulative chi x
| x <= 0 = 0
@@ -77,8 +83,10 @@ density chi x
{-# INLINE density #-}
quantile :: ChiSquared -> Double -> Double
-quantile d@(ChiSquared ndf) p
- | p == 0 = -1/0
- | p == 1 = 1/0
- | otherwise = D.findRoot d p (fromIntegral ndf) 0 m_huge
+quantile (ChiSquared ndf) p
+ | p == 0 = 0
+ | p == 1 = 1/0
+ | p > 0 && p < 1 = 2 * invIncompleteGamma (fromIntegral ndf / 2) p
+ | otherwise =
+ error $ "Statistics.Distribution.ChiSquared.quantile: p must be in [0,1] range. Got: "++show p
{-# INLINE quantile #-}
diff --git a/Statistics/Distribution/Exponential.hs b/Statistics/Distribution/Exponential.hs
index 978448f..1830a03 100644
--- a/Statistics/Distribution/Exponential.hs
+++ b/Statistics/Distribution/Exponential.hs
@@ -33,32 +33,49 @@ newtype ExponentialDistribution = ED {
} deriving (Eq, Read, Show, Typeable)
instance D.Distribution ExponentialDistribution where
- cumulative = cumulative
+ cumulative = cumulative
+ complCumulative = complCumulative
instance D.ContDistr ExponentialDistribution where
density = density
quantile = quantile
+instance D.Mean ExponentialDistribution where
+ mean (ED l) = 1 / l
+ {-# INLINE mean #-}
+
instance D.Variance ExponentialDistribution where
variance (ED l) = 1 / (l * l)
{-# INLINE variance #-}
-instance D.Mean ExponentialDistribution where
- mean (ED l) = 1 / l
- {-# INLINE mean #-}
+instance D.MaybeMean ExponentialDistribution where
+ maybeMean = Just . D.mean
+
+instance D.MaybeVariance ExponentialDistribution where
+ maybeStdDev = Just . D.stdDev
+ maybeVariance = Just . D.variance
cumulative :: ExponentialDistribution -> Double -> Double
-cumulative (ED l) x | x < 0 = 0
+cumulative (ED l) x | x <= 0 = 0
| otherwise = 1 - exp (-l * x)
{-# INLINE cumulative #-}
+complCumulative :: ExponentialDistribution -> Double -> Double
+complCumulative (ED l) x | x <= 0 = 1
+ | otherwise = exp (-l * x)
+{-# INLINE complCumulative #-}
+
density :: ExponentialDistribution -> Double -> Double
density (ED l) x | x < 0 = 0
| otherwise = l * exp (-l * x)
{-# INLINE density #-}
quantile :: ExponentialDistribution -> Double -> Double
-quantile (ED l) p = -log (1 - p) / l
+quantile (ED l) p
+ | p == 1 = 1 / 0
+ | p >= 0 && p < 1 = -log (1 - p) / l
+ | otherwise =
+ error $ "Statistics.Distribution.Exponential.quantile: p must be in [0,1] range. Got: "++show p
{-# INLINE quantile #-}
-- | Create an exponential distribution.
diff --git a/Statistics/Distribution/FDistribution.hs b/Statistics/Distribution/FDistribution.hs
new file mode 100644
index 0000000..dde5e5b
--- /dev/null
+++ b/Statistics/Distribution/FDistribution.hs
@@ -0,0 +1,80 @@
+{-# LANGUAGE DeriveDataTypeable #-}
+-- |
+-- Module : Statistics.Distribution.FDistribution
+-- Copyright : (c) 2011 Aleksey Khudyakov
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Fisher F distribution
+module Statistics.Distribution.FDistribution (
+ FDistribution
+ , fDistribution
+ , fDistributionNDF1
+ , fDistributionNDF2
+ ) where
+
+import qualified Statistics.Distribution as D
+import Data.Typeable (Typeable)
+import Statistics.Math (logBeta, incompleteBeta, invIncompleteBeta)
+
+
+
+-- | Fisher's F distribution
+data FDistribution = F { fDistributionNDF1 :: {-# UNPACK #-} !Double
+ , fDistributionNDF2 :: {-# UNPACK #-} !Double
+ , pdfFactor :: {-# UNPACK #-} !Double
+ }
+ deriving (Eq,Show,Read,Typeable)
+
+
+fDistribution :: Int -> Int -> FDistribution
+fDistribution n m
+ | n > 0 && m > 0 =
+ let n' = fromIntegral n
+ m' = fromIntegral m
+ f' = 0.5 * (log m' * m' + log n' * n') - logBeta (0.5*n') (0.5*m')
+ in F n' m' f'
+ | otherwise =
+ error "Statistics.Distribution.FDistribution.fDistribution: non-positive number of degrees of freedom"
+
+instance D.Distribution FDistribution where
+ cumulative = cumulative
+
+instance D.ContDistr FDistribution where
+ density = density
+ quantile = quantile
+
+cumulative :: FDistribution -> Double -> Double
+cumulative (F n m _) x
+ | x > 0 = let y = n*x in incompleteBeta (0.5 * n) (0.5 * m) (y / (m + y))
+ | otherwise = 0
+
+density :: FDistribution -> Double -> Double
+density (F n m fac) x
+ | x > 0 = exp $ fac + log x * (0.5 * n - 1) - log(m + n*x) * 0.5 * (n + m)
+ | otherwise = 0
+
+quantile :: FDistribution -> Double -> Double
+quantile (F n m _) p
+ | p >= 0 && p <= 1 =
+ let x = invIncompleteBeta (0.5 * n) (0.5 * m) p
+ in m * x / (n * (1 - x))
+ | otherwise =
+ error $ "Statistics.Distribution.FDistribution.quantile: p must be in [0,1] range. Got: "++show p
+
+
+instance D.MaybeMean FDistribution where
+ maybeMean (F _ m _) | m > 2 = Just $ m / (m - 2)
+ | otherwise = Nothing
+
+instance D.MaybeVariance FDistribution where
+ maybeStdDev (F n m _)
+ | m > 4 = Just $ 2 * sqr m * (m + n - 2) / (n * sqr (m - 2) * (m - 4))
+ | otherwise = Nothing
+
+sqr :: Double -> Double
+sqr x = x * x
+{-# INLINE sqr #-}
diff --git a/Statistics/Distribution/Gamma.hs b/Statistics/Distribution/Gamma.hs
index 27c6a54..132ccb1 100644
--- a/Statistics/Distribution/Gamma.hs
+++ b/Statistics/Distribution/Gamma.hs
@@ -1,7 +1,7 @@
{-# LANGUAGE DeriveDataTypeable #-}
-- |
-- Module : Statistics.Distribution.Gamma
--- Copyright : (c) 2009 Bryan O'Sullivan
+-- Copyright : (c) 2009, 2011 Bryan O'Sullivan
-- License : BSD3
--
-- Maintainer : bos@serpentine.com
@@ -25,8 +25,9 @@ module Statistics.Distribution.Gamma
) where
import Data.Typeable (Typeable)
-import Statistics.Constants (m_huge)
-import Statistics.Math (incompleteGamma, logGamma)
+import Statistics.Constants (m_pos_inf, m_NaN)
+import Statistics.Distribution.Poisson.Internal as Poisson
+import Statistics.Math (incompleteGamma, invIncompleteGamma)
import qualified Statistics.Distribution as D
-- | The gamma distribution.
@@ -35,7 +36,8 @@ data GammaDistribution = GD {
, gdScale :: {-# UNPACK #-} !Double -- ^ Scale parameter, &#977;.
} deriving (Eq, Read, Show, Typeable)
--- | Create gamma distrivution. Both shape and scale parameters must be positive.
+-- | Create gamma distribution. Both shape and scale parameters must
+-- be positive.
gammaDistr :: Double -- ^ Shape parameter. /k/
-> Double -- ^ Scale parameter, &#977;.
-> GammaDistribution
@@ -54,17 +56,30 @@ instance D.ContDistr GammaDistribution where
quantile = quantile
instance D.Variance GammaDistribution where
- variance (GD a l) = a / (l * l)
+ variance (GD a l) = a * l * l
{-# INLINE variance #-}
instance D.Mean GammaDistribution where
- mean (GD a l) = a / l
+ mean (GD a l) = a * l
{-# INLINE mean #-}
+instance D.MaybeMean GammaDistribution where
+ maybeMean = Just . D.mean
+
+instance D.MaybeVariance GammaDistribution where
+ maybeStdDev = Just . D.stdDev
+ maybeVariance = Just . D.variance
+
+
+
density :: GammaDistribution -> Double -> Double
density (GD a l) x
- | x <= 0 = 0
- | otherwise = x ** (a-1) * exp (-x/l) / (exp (logGamma a) * l ** a)
+ | a < 0 || l <= 0 = m_NaN
+ | x <= 0 = 0
+ | a == 0 = if x == 0 then m_pos_inf else 0
+ | x == 0 = if a < 1 then m_pos_inf else if a > 1 then 0 else 1/l
+ | a < 1 = Poisson.probability (x/l) a * a / x
+ | otherwise = Poisson.probability (x/l) (a-1) / l
{-# INLINE density #-}
cumulative :: GammaDistribution -> Double -> Double
@@ -74,8 +89,10 @@ cumulative (GD k l) x
{-# INLINE cumulative #-}
quantile :: GammaDistribution -> Double -> Double
-quantile d p
- | p == 0 = -1/0
- | p == 1 = 1/0
- | otherwise = D.findRoot d p (gdShape d) 0 m_huge
+quantile (GD k l) p
+ | p == 0 = 0
+ | p == 1 = 1/0
+ | p > 0 && p < 1 = l * invIncompleteGamma k p
+ | otherwise =
+ error $ "Statistics.Distribution.Gamma.quantile: p must be in [0,1] range. Got: "++show p
{-# INLINE quantile #-}
diff --git a/Statistics/Distribution/Geometric.hs b/Statistics/Distribution/Geometric.hs
index 142d5ac..c6cac00 100644
--- a/Statistics/Distribution/Geometric.hs
+++ b/Statistics/Distribution/Geometric.hs
@@ -26,7 +26,6 @@ module Statistics.Distribution.Geometric
, gdSuccess
) where
-import Control.Exception (assert)
import Data.Typeable (Typeable)
import qualified Statistics.Distribution as D
@@ -40,15 +39,23 @@ instance D.Distribution GeometricDistribution where
instance D.DiscreteDistr GeometricDistribution where
probability = probability
+instance D.Mean GeometricDistribution where
+ mean (GD s) = 1 / s
+ {-# INLINE mean #-}
+
instance D.Variance GeometricDistribution where
variance (GD s) = (1 - s) / (s * s)
{-# INLINE variance #-}
-instance D.Mean GeometricDistribution where
- mean (GD s) = 1 / s
- {-# INLINE mean #-}
+instance D.MaybeMean GeometricDistribution where
+ maybeMean = Just . D.mean
+
+instance D.MaybeVariance GeometricDistribution where
+ maybeStdDev = Just . D.stdDev
+ maybeVariance = Just . D.variance
+
--- | Create geometric distribution
+-- | Create geometric distribution.
geometric :: Double -- ^ Success rate
-> GeometricDistribution
geometric x
diff --git a/Statistics/Distribution/Hypergeometric.hs b/Statistics/Distribution/Hypergeometric.hs
index 9e0cc31..3adc40e 100644
--- a/Statistics/Distribution/Hypergeometric.hs
+++ b/Statistics/Distribution/Hypergeometric.hs
@@ -38,16 +38,25 @@ data HypergeometricDistribution = HD {
} deriving (Eq, Read, Show, Typeable)
instance D.Distribution HypergeometricDistribution where
- cumulative d x = D.sumProbabilities d 0 (floor x)
+ cumulative = cumulative
instance D.DiscreteDistr HypergeometricDistribution where
probability = probability
+instance D.Mean HypergeometricDistribution where
+ mean = mean
+
instance D.Variance HypergeometricDistribution where
variance = variance
-instance D.Mean HypergeometricDistribution where
- mean = mean
+instance D.MaybeMean HypergeometricDistribution where
+ maybeMean = Just . D.mean
+
+instance D.MaybeVariance HypergeometricDistribution where
+ maybeStdDev = Just . D.stdDev
+ maybeVariance = Just . D.variance
+
+
variance :: HypergeometricDistribution -> Double
variance (HD m l k) = (k' * ml) * (1 - ml) * (l' - k') / (l' - 1)
@@ -81,3 +90,13 @@ probability (HD mi li ki) n
| otherwise =
choose mi n * choose (li - mi) (ki - n) / choose li ki
{-# INLINE probability #-}
+
+cumulative :: HypergeometricDistribution -> Double -> Double
+cumulative d@(HD mi li ki) x
+ | n < minN = 0
+ | n >= maxN = 1
+ | otherwise = D.sumProbabilities d minN n
+ where
+ n = floor x
+ minN = max 0 (mi+ki-li)
+ maxN = min mi ki \ No newline at end of file
diff --git a/Statistics/Distribution/Normal.hs b/Statistics/Distribution/Normal.hs
index 4fd56be..dd25943 100644
--- a/Statistics/Distribution/Normal.hs
+++ b/Statistics/Distribution/Normal.hs
@@ -29,65 +29,81 @@ import qualified Statistics.Sample as S
-- | The normal distribution.
data NormalDistribution = ND {
mean :: {-# UNPACK #-} !Double
- , variance :: {-# UNPACK #-} !Double
+ , stdDev :: {-# UNPACK #-} !Double
, ndPdfDenom :: {-# UNPACK #-} !Double
, ndCdfDenom :: {-# UNPACK #-} !Double
} deriving (Eq, Read, Show, Typeable)
instance D.Distribution NormalDistribution where
- cumulative = cumulative
+ cumulative = cumulative
+ complCumulative = complCumulative
instance D.ContDistr NormalDistribution where
density = density
quantile = quantile
-instance D.Variance NormalDistribution where
- variance = variance
+instance D.MaybeMean NormalDistribution where
+ maybeMean = Just . D.mean
instance D.Mean NormalDistribution where
mean = mean
+instance D.MaybeVariance NormalDistribution where
+ maybeStdDev = Just . D.stdDev
+ maybeVariance = Just . D.variance
+
+instance D.Variance NormalDistribution where
+ stdDev = stdDev
+
+
-- | Standard normal distribution with mean equal to 0 and variance equal to 1
standard :: NormalDistribution
standard = ND { mean = 0.0
- , variance = 1.0
+ , stdDev = 1.0
, ndPdfDenom = m_sqrt_2_pi
, ndCdfDenom = m_sqrt_2
}
--- | Create normal distribution from parameters
+-- | Create normal distribution from parameters.
+--
+-- IMPORTANT: prior to 0.10 release second parameter was variance not
+-- standard deviation.
normalDistr :: Double -- ^ Mean of distribution
- -> Double -- ^ Variance of distribution
+ -> Double -- ^ Standard deviation of distribution
-> NormalDistribution
-normalDistr m v
- | v <= 0 =
- error $ "Statistics.Distribution.Normal.normalDistr: variance must be positive. Got " ++ show v
- | otherwise = ND { mean = m
- , variance = v
- , ndPdfDenom = m_sqrt_2_pi * sv
- , ndCdfDenom = m_sqrt_2 * sv
+normalDistr m sd
+ | sd > 0 = ND { mean = m
+ , stdDev = sd
+ , ndPdfDenom = m_sqrt_2_pi * sd
+ , ndCdfDenom = m_sqrt_2 * sd
}
- where sv = sqrt v
+ | otherwise =
+ error $ "Statistics.Distribution.Normal.normalDistr: standard deviation must be positive. Got " ++ show sd
-- | Create distribution using parameters estimated from
-- sample. Variance is estimated using maximum likelihood method
-- (biased estimation).
normalFromSample :: S.Sample -> NormalDistribution
-normalFromSample a = normalDistr (S.mean a) (S.variance a)
+normalFromSample a = normalDistr (S.mean a) (S.stdDev a)
density :: NormalDistribution -> Double -> Double
-density d x = exp (-xm * xm / (2 * variance d)) / ndPdfDenom d
+density d x = exp (-xm * xm / (2 * sd * sd)) / ndPdfDenom d
where xm = x - mean d
+ sd = stdDev d
cumulative :: NormalDistribution -> Double -> Double
cumulative d x = erfc ((mean d - x) / ndCdfDenom d) / 2
+complCumulative :: NormalDistribution -> Double -> Double
+complCumulative d x = erfc ((x - mean d) / ndCdfDenom d) / 2
+
quantile :: NormalDistribution -> Double -> Double
quantile d p
- | p < 0 || p > 1 = inf/inf
| p == 0 = -inf
| p == 1 = inf
| p == 0.5 = mean d
- | otherwise = x * sqrt (variance d) + mean d
+ | p > 0 && p < 1 = x * stdDev d + mean d
+ | otherwise =
+ error $ "Statistics.Distribution.Normal.quantile: p must be in [0,1] range. Got: "++show p
where x = D.findRoot standard p 0 (-100) 100
inf = 1/0
diff --git a/Statistics/Distribution/Poisson.hs b/Statistics/Distribution/Poisson.hs
index d3636a2..28c8d26 100644
--- a/Statistics/Distribution/Poisson.hs
+++ b/Statistics/Distribution/Poisson.hs
@@ -1,7 +1,7 @@
{-# LANGUAGE DeriveDataTypeable #-}
-- |
-- Module : Statistics.Distribution.Poisson
--- Copyright : (c) 2009 Bryan O'Sullivan
+-- Copyright : (c) 2009, 2011 Bryan O'Sullivan
-- License : BSD3
--
-- Maintainer : bos@serpentine.com
@@ -20,22 +20,30 @@ module Statistics.Distribution.Poisson
, poisson
-- * Accessors
, poissonLambda
+ -- * References
+ -- $references
) where
import Data.Typeable (Typeable)
import qualified Statistics.Distribution as D
-import Statistics.Math (logGamma, factorial)
+import qualified Statistics.Distribution.Poisson.Internal as I
+import Statistics.Math (incompleteGamma)
+
+
newtype PoissonDistribution = PD {
poissonLambda :: Double
} deriving (Eq, Read, Show, Typeable)
instance D.Distribution PoissonDistribution where
- cumulative d x = D.sumProbabilities d 0 (floor x)
+ cumulative (PD lambda) x
+ | x < 0 = 0
+ | otherwise = 1 - incompleteGamma (fromIntegral (floor x + 1 :: Int)) lambda
{-# INLINE cumulative #-}
instance D.DiscreteDistr PoissonDistribution where
- probability = probability
+ probability (PD lambda) x = I.probability lambda (fromIntegral x)
+ {-# INLINE probability #-}
instance D.Variance PoissonDistribution where
variance = poissonLambda
@@ -45,19 +53,22 @@ instance D.Mean PoissonDistribution where
mean = poissonLambda
{-# INLINE mean #-}
--- | Create poisson distribution.
+instance D.MaybeMean PoissonDistribution where
+ maybeMean = Just . D.mean
+
+instance D.MaybeVariance PoissonDistribution where
+ maybeStdDev = Just . D.stdDev
+
+
+-- | Create Poisson distribution.
poisson :: Double -> PoissonDistribution
poisson l
- | l <= 0 =
- error $ "Statistics.Distribution.Poisson.poisson: lambda must be positive. Got " ++ show l
+ | l <= 0 = error $ "Statistics.Distribution.Poisson.poisson:\
+ \ lambda must be positive. Got " ++ show l
| otherwise = PD l
{-# INLINE poisson #-}
-probability :: PoissonDistribution -> Int -> Double
-probability (PD l) n
- | n < 0 = 0
- | l < 20 && n <= 100 = exp (-l) * l ** x / factorial n
- | otherwise = exp (x * log l - logGamma (x + 1) - l)
- where
- x = fromIntegral n
-{-# INLINE probability #-}
+-- $references
+--
+-- * Loader, C. (2000) Fast and Accurate Computation of Binomial
+-- Probabilities. <http://projects.scipy.org/scipy/raw-attachment/ticket/620/loader2000Fast.pdf>
diff --git a/Statistics/Distribution/Poisson/Internal.hs b/Statistics/Distribution/Poisson/Internal.hs
new file mode 100644
index 0000000..f4c9cda
--- /dev/null
+++ b/Statistics/Distribution/Poisson/Internal.hs
@@ -0,0 +1,32 @@
+-- |
+-- Module : Statistics.Distribution.Poisson.Internal
+-- Copyright : (c) 2011 Bryan O'Sullivan
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Internal code for the Poisson distribution.
+
+module Statistics.Distribution.Poisson.Internal
+ (
+ probability
+ ) where
+
+import Statistics.Constants (m_sqrt_2_pi, m_tiny)
+import Statistics.Math (bd0, logGamma, stirlingError)
+
+-- | An unchecked, non-integer-valued version of Loader's saddle point
+-- algorithm.
+probability :: Double -> Double -> Double
+probability 0 0 = 1
+probability 0 1 = 0
+probability lambda x
+ | isInfinite lambda = 0
+ | x < 0 = 0
+ | x <= lambda * m_tiny = exp (-lambda)
+ | lambda < x * m_tiny = exp (-lambda + x * log lambda - logGamma (x+1))
+ | otherwise = exp (-(stirlingError x) - bd0 x lambda) /
+ (m_sqrt_2_pi * sqrt x)
+{-# INLINE probability #-}
diff --git a/Statistics/Distribution/StudentT.hs b/Statistics/Distribution/StudentT.hs
new file mode 100644
index 0000000..3fd4271
--- /dev/null
+++ b/Statistics/Distribution/StudentT.hs
@@ -0,0 +1,71 @@
+{-# LANGUAGE DeriveDataTypeable #-}
+-- |
+-- Module : Statistics.Distribution.StudentT
+-- Copyright : (c) 2011 Aleksey Khudyakov
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Student-T distribution
+module Statistics.Distribution.StudentT (
+ StudentT
+ , studentT
+ , studentTndf
+ ) where
+
+import qualified Statistics.Distribution as D
+import Data.Typeable (Typeable)
+import Statistics.Math (logBeta, incompleteBeta, invIncompleteBeta)
+
+
+
+-- | Student-T distribution
+newtype StudentT = StudentT { studentTndf :: Double }
+ deriving (Eq,Show,Read,Typeable)
+
+
+-- | Create Student-T distribution. Number of degrees of freedom must be positive.
+studentT :: Double -> StudentT
+studentT ndf
+ | ndf > 0 = StudentT ndf
+ | otherwise =
+ error "Statistics.Distribution.StudentT.studentT: non-positive number of degrees of freedom"
+
+instance D.Distribution StudentT where
+ cumulative = cumulative
+
+instance D.ContDistr StudentT where
+ density = density
+ quantile = quantile
+
+cumulative :: StudentT -> Double -> Double
+cumulative (StudentT ndf) x
+ | x > 0 = 1 - 0.5 * ibeta
+ | otherwise = 0.5 * ibeta
+ where
+ ibeta = incompleteBeta (0.5 * ndf) 0.5 (ndf / (ndf + x*x))
+
+density :: StudentT -> Double -> Double
+density (StudentT ndf) x =
+ exp( log (ndf / (ndf + x*x)) * (0.5 * (1 + ndf)) - logBeta 0.5 (0.5 * ndf) ) / sqrt ndf
+
+quantile :: StudentT -> Double -> Double
+quantile (StudentT ndf) p
+ | p >= 0 && p <= 1 =
+ let x = invIncompleteBeta (0.5 * ndf) 0.5 (2 * min p (1 - p))
+ in case sqrt $ ndf * (1 - x) / x of
+ r | p < 0.5 -> -r
+ | otherwise -> r
+ | otherwise =
+ error $ "Statistics.Distribution.StudentT.quantile: p must be in [0,1] range. Got: "++show p
+
+
+instance D.MaybeMean StudentT where
+ maybeMean (StudentT ndf) | ndf > 1 = Just 0
+ | otherwise = Nothing
+
+instance D.MaybeVariance StudentT where
+ maybeStdDev (StudentT ndf) | ndf > 2 = Just $ ndf / (ndf - 2)
+ | otherwise = Nothing
diff --git a/Statistics/Distribution/Uniform.hs b/Statistics/Distribution/Uniform.hs
new file mode 100644
index 0000000..b43de0d
--- /dev/null
+++ b/Statistics/Distribution/Uniform.hs
@@ -0,0 +1,62 @@
+{-# LANGUAGE DeriveDataTypeable #-}
+-- |
+-- Module : Statistics.Distribution.Uniform
+-- Copyright : (c) 2011 Aleksey Khudyakov
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Variate distributed uniformly in the interval.
+module Statistics.Distribution.Uniform (
+ UniformDistribution
+ , uniformDistr
+ ) where
+
+import Data.Typeable (Typeable)
+import qualified Statistics.Distribution as D
+
+
+-- | Uniform distribution
+data UniformDistribution = UniformDistribution {-# UNPACK #-} !Double {-# UNPACK #-} !Double
+ deriving (Eq,Show,Read,Typeable)
+
+-- | Create uniform distribution.
+uniformDistr :: Double -> Double -> UniformDistribution
+uniformDistr a b
+ | b < a = uniformDistr b a
+ | a < b = UniformDistribution a b
+ | otherwise = error "Statistics.Distribution.Uniform.uniform: wrong parameters"
+-- NOTE: failure is in default branch to guard against NaNs.
+
+instance D.Distribution UniformDistribution where
+ cumulative (UniformDistribution a b) x
+ | x < a = 0
+ | x > b = 1
+ | otherwise = (x - a) / (b - a)
+
+instance D.ContDistr UniformDistribution where
+ density (UniformDistribution a b) x
+ | x < a = 0
+ | x > b = 0
+ | otherwise = 1 / (b - a)
+ quantile (UniformDistribution a b) p
+ | p >= 0 && p <= 1 = a + (b - a) * p
+ | otherwise =
+ error $ "Statistics.Distribution.Uniform.quantile: p must be in [0,1] range. Got: "++show p
+
+instance D.Mean UniformDistribution where
+ mean (UniformDistribution a b) = 0.5 * (a + b)
+
+instance D.Variance UniformDistribution where
+ -- NOTE: 1/sqrt 12 is not constant folded (#4101) so it's written as
+ -- numerical constant. (Also FIXME!)
+ stdDev (UniformDistribution a b) = 0.2886751345948129 * (b - a)
+ variance (UniformDistribution a b) = d * d / 12 where d = b - a
+
+instance D.MaybeMean UniformDistribution where
+ maybeMean = Just . D.mean
+
+instance D.MaybeVariance UniformDistribution where
+ maybeStdDev = Just . D.stdDev
diff --git a/Statistics/Function.hs b/Statistics/Function.hs
index 33999ab..f3c9c3a 100644
--- a/Statistics/Function.hs
+++ b/Statistics/Function.hs
@@ -12,38 +12,45 @@
module Statistics.Function
(
+ -- * Scanning
minMax
+ -- * Sorting
, sort
+ , sortBy
, partialSort
+ -- * Indexing
, indexed
, indices
+ -- * Bit twiddling
, nextHighestPowerOfTwo
- -- * Vector setup
- , create
+ -- * Comparison
+ , within
) where
#include "MachDeps.h"
-import Control.Exception (assert)
-import Control.Monad.Primitive (PrimMonad)
import Data.Bits ((.|.), shiftR)
-import Data.Vector.Generic (modify, unsafeFreeze)
import qualified Data.Vector.Algorithms.Intro as I
import qualified Data.Vector.Generic as G
-import qualified Data.Vector.Generic.Mutable as M
+import Statistics.Function.Comparison (within)
-- | Sort a vector.
sort :: (Ord e, G.Vector v e) => v e -> v e
-sort = modify I.sort
+sort = G.modify I.sort
{-# INLINE sort #-}
+-- | Sort a vector using a custom ordering.
+sortBy :: (G.Vector v e) => I.Comparison e -> v e -> v e
+sortBy f = G.modify $ I.sortBy f
+{-# INLINE sortBy #-}
+
-- | Partially sort a vector, such that the least /k/ elements will be
-- at the front.
partialSort :: (G.Vector v e, Ord e) =>
Int -- ^ The number /k/ of least elements.
-> v e
-> v e
-partialSort k = modify (\a -> I.partialSort a k)
+partialSort k = G.modify (`I.partialSort` k)
{-# INLINE partialSort #-}
-- | Return the indices of a vector.
@@ -66,18 +73,6 @@ minMax = fini . G.foldl' go (MM (1/0) (-1/0))
fini (MM lo hi) = (lo, hi)
{-# INLINE minMax #-}
--- | Create a vector, using the given action to populate each
--- element.
-create :: (PrimMonad m, G.Vector v e) => Int -> (Int -> m e) -> m (v e)
-create size itemAt = assert (size >= 0) $
- M.new size >>= loop 0
- where
- loop k arr | k >= size = unsafeFreeze arr
- | otherwise = do r <- itemAt k
- M.write arr k r
- loop (k+1) arr
-{-# INLINE create #-}
-
-- | Efficiently compute the next highest power of two for a
-- non-negative integer. If the given value is already a power of
-- two, it is returned unchanged. If negative, zero is returned.
diff --git a/Statistics/Function/Comparison.hs b/Statistics/Function/Comparison.hs
new file mode 100644
index 0000000..d29b22a
--- /dev/null
+++ b/Statistics/Function/Comparison.hs
@@ -0,0 +1,40 @@
+-- |
+-- Module : Statistics.Function.Comparison
+-- Copyright : (c) 2011 Bryan O'Sullivan
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Approximate floating point comparison, based on Bruce Dawson's
+-- \"Comparing floating point numbers\":
+-- <http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm>
+
+module Statistics.Function.Comparison
+ (
+ within
+ ) where
+
+import Control.Monad.ST (runST)
+import Data.Primitive.ByteArray (newByteArray, readByteArray, writeByteArray)
+import Data.Int (Int64)
+
+-- | Compare two 'Double' values for approximate equality, using
+-- Dawson's method.
+--
+-- The required accuracy is specified in ULPs (units of least
+-- precision). If the two numbers differ by the given number of ULPs
+-- or less, this function returns @True@.
+within :: Int -- ^ Number of ULPs of accuracy desired.
+ -> Double -> Double -> Bool
+within ulps a b = runST $ do
+ buf <- newByteArray 8
+ ai0 <- writeByteArray buf 0 a >> readByteArray buf 0
+ bi0 <- writeByteArray buf 0 b >> readByteArray buf 0
+ let big = 0x8000000000000000 :: Int64
+ ai | ai0 < 0 = big - ai0
+ | otherwise = ai0
+ bi | bi0 < 0 = big - bi0
+ | otherwise = bi0
+ return $ abs (ai - bi) <= fromIntegral ulps
diff --git a/Statistics/Math.hs b/Statistics/Math.hs
index 85a8a0d..b2a0208 100644
--- a/Statistics/Math.hs
+++ b/Statistics/Math.hs
@@ -1,8 +1,7 @@
-{-# LANGUAGE BangPatterns #-}
-{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE BangPatterns, FlexibleContexts #-}
-- |
-- Module : Statistics.Math
--- Copyright : (c) 2009 Bryan O'Sullivan
+-- Copyright : (c) 2009, 2011 Bryan O'Sullivan
-- License : BSD3
--
-- Maintainer : bos@serpentine.com
@@ -17,6 +16,9 @@ module Statistics.Math
choose
-- ** Beta function
, logBeta
+ , incompleteBeta
+ , incompleteBeta_
+ , invIncompleteBeta
-- ** Chebyshev polynomials
-- $chebyshev
, chebyshev
@@ -25,15 +27,21 @@ module Statistics.Math
, factorial
, logFactorial
-- ** Gamma function
- , incompleteGamma
, logGamma
, logGammaL
+ , incompleteGamma
+ , invIncompleteGamma
-- ** Logarithm
, log1p
+ , log2
+ -- ** Stirling's approximation
+ , stirlingError
+ , bd0
-- * References
-- $references
) where
+import Data.Bits ((.&.), (.|.), shiftR)
import Data.Int (Int64)
import Data.Word (Word64)
import Statistics.Constants (m_epsilon, m_sqrt_2_pi, m_ln_sqrt_2_pi, m_NaN,
@@ -43,6 +51,7 @@ import Statistics.Distribution.Normal (standard)
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Generic as G
+
-- $chebyshev
--
-- A Chebyshev polynomial of the first kind is defined by the
@@ -122,7 +131,7 @@ factorial n
| n < 0 = error "Statistics.Math.factorial: negative input"
| n <= 1 = 1
| n <= 14 = fini . U.foldl' goLong (F 1 1) $ ns
- | otherwise = U.foldl' goDouble 1 $ ns
+ | otherwise = U.foldl' goDouble 1 ns
where goDouble t k = t * fromIntegral k
goLong (F z x) _ = F (z * x') x'
where x' = x + 1
@@ -185,6 +194,76 @@ incompleteGamma p x
tolerance = 1e-14
overflow = 1e37
+
+
+-- Adapted from Numerical Recipes §6.2.1
+
+-- | Inverse incomplete gamma function. It's approximately inverse of
+-- 'incompleteGamma' for the same /s/. So following equality
+-- approximately holds:
+--
+-- > invIncompleteGamma s . incompleteGamma s = id
+--
+-- For @invIncompleteGamma s p@ /s/ must be positive and /p/ must be
+-- in [0,1] range.
+invIncompleteGamma :: Double -> Double -> Double
+invIncompleteGamma a p
+ | a <= 0 =
+ error $ "Statistics.Math.invIncompleteGamma: a must be positive. Got: " ++ show a
+ | p < 0 || p > 1 =
+ error $ "Statistics.Math.invIncompleteGamma: p must be in [0,1] range. Got: " ++ show p
+ | p == 0 = 0
+ | p == 1 = 1 / 0
+ | otherwise = loop 0 guess
+ where
+ -- Solve equation γ(a,x) = p using Halley method
+ loop :: Int -> Double -> Double
+ loop i x
+ | i >= 12 = x
+ | otherwise =
+ let
+ -- Value of γ(a,x) - p
+ f = incompleteGamma a x - p
+ -- dγ(a,x)/dx
+ f' | a > 1 = afac * exp( -(x - a1) + a1 * (log x - lna1))
+ | otherwise = exp( -x + a1 * log x - gln)
+ u = f / f'
+          -- Halley correction to the Newton-Raphson step
+ corr = u * (a1 / x - 1)
+ dx = u / (1 - 0.5 * min 1.0 corr)
+ -- New approximation to x
+ x' | x < dx = 0.5 * x -- Do not go below 0
+ | otherwise = x - dx
+ in if abs dx < eps * x'
+ then x'
+ else loop (i+1) x'
+  -- Calculate initial guess for the root
+ guess
+ --
+ | a > 1 =
+ let t = sqrt $ -2 * log(if p < 0.5 then p else 1 - p)
+ x1 = (2.30753 + t * 0.27061) / (1 + t * (0.99229 + t * 0.04481)) - t
+ x2 = if p < 0.5 then -x1 else x1
+ in max 1e-3 (a * (1 - 1/(9*a) - x2 / (3 * sqrt a)) ** 3)
+ -- For a <= 1 use following approximations:
+ -- γ(a,1) ≈ 0.253a + 0.12a²
+ --
+ -- γ(a,x) ≈ γ(a,1)·x^a x < 1
+ -- γ(a,x) ≈ γ(a,1) + (1 - γ(a,1))(1 - exp(1 - x)) x >= 1
+ | otherwise =
+ let t = 1 - a * (0.253 + a*0.12)
+ in if p < t
+ then (p / t) ** (1 / a)
+ else 1 - log( 1 - (p-t) / (1-t))
+ -- Constants
+ a1 = a - 1
+ lna1 = log a1
+ afac = exp( a1 * (lna1 - 1) - gln )
+ gln = logGamma a
+ eps = 1e-8
+
+
+
-- Adapted from http://people.sc.fsu.edu/~burkardt/f_src/asa245/asa245.html
-- | Compute the logarithm of the gamma function &#915;(/x/). Uses
@@ -307,6 +386,106 @@ logBeta a b
pq = p + q
c = logGammaCorrection q - logGammaCorrection pq
+-- | Regularized incomplete beta function. Uses algorithm AS63 by
+-- Majumder and Bhattacharjee.
+incompleteBeta :: Double -- ^ /p/ > 0
+ -> Double -- ^ /q/ > 0
+ -> Double -- ^ /x/, must lie in [0,1] range
+ -> Double
+incompleteBeta p q = incompleteBeta_ (logBeta p q) p q
+
+-- | Regularized incomplete beta function. Same as 'incompleteBeta'
+-- but also takes the precomputed logarithm of the beta function as an argument.
+incompleteBeta_ :: Double -- ^ logarithm of beta function
+ -> Double -- ^ /p/ > 0
+ -> Double -- ^ /q/ > 0
+ -> Double -- ^ /x/, must lie in [0,1] range
+ -> Double
+incompleteBeta_ beta p q x
+ | p <= 0 || q <= 0 = error "p <= 0 || q <= 0"
+ | x < 0 || x > 1 = error "x < 0 || x > 1"
+ | x == 0 || x == 1 = x
+ | p >= (p+q) * x = incompleteBetaWorker beta p q x
+ | otherwise = 1 - incompleteBetaWorker beta q p (1 - x)
+
+-- Worker for incomplete beta function. It is separate function to
+-- avoid confusion with parameter during parameter swapping
+incompleteBetaWorker :: Double -> Double -> Double -> Double -> Double
+incompleteBetaWorker beta p q x = loop (p+q) (truncate $ q + cx * (p+q) :: Int) 1 1 1
+ where
+ -- Constants
+ eps = 1e-15
+ cx = 1 - x
+ -- Loop
+ loop psq ns ai term betain
+ | done = betain' * exp( p * log x + (q - 1) * log cx - beta) / p
+ | otherwise = loop psq' (ns - 1) (ai + 1) term' betain'
+ where
+ -- New values
+ term' = term * fact / (p + ai)
+ betain' = betain + term'
+ fact | ns > 0 = (q - ai) * x/cx
+ | ns == 0 = (q - ai) * x
+ | otherwise = psq * x
+ -- Iterations are complete
+ done = db <= eps && db <= eps*betain' where db = abs term'
+ psq' = if ns < 0 then psq + 1 else psq
+
+-- | Compute inverse of regularized incomplete beta function. Uses
+-- initial approximation from AS109 and Halley method to solve equation.
+invIncompleteBeta :: Double -- ^ /p/
+ -> Double -- ^ /q/
+ -> Double -- ^ /a/
+ -> Double
+invIncompleteBeta p q a
+ | p <= 0 || q <= 0 = error "p <= 0 || q <= 0"
+ | a < 0 || a > 1 = error "bad a"
+ | a == 0 || a == 1 = a
+ | a > 0.5 = 1 - invIncompleteBetaWorker (logBeta p q) q p (1 - a)
+ | otherwise = invIncompleteBetaWorker (logBeta p q) p q a
+
+invIncompleteBetaWorker :: Double -> Double -> Double -> Double -> Double
+invIncompleteBetaWorker beta p q a = loop (0::Int) guess
+ where
+ p1 = p - 1
+ q1 = q - 1
+ -- Solve equation using Halley method
+ loop !i !x
+ | x == 0 || x == 1 = x
+ | i >= 10 = x
+ | abs dx <= 16 * m_epsilon * x = x
+ | otherwise = loop (i+1) x'
+ where
+ f = incompleteBeta_ beta p q x - a
+ f' = exp $ p1 * log x + q1 * log (1 - x) - beta
+ u = f / f'
+ dx = u / (1 - 0.5 * min 1 (u * (p1 / x - q1 / (1 - x))))
+ x' | z < 0 = x / 2
+ | z > 1 = (x + 1) / 2
+ | otherwise = z
+ where z = x - dx
+ -- Calculate initial guess
+ guess
+ | p > 1 && q > 1 =
+ let rr = (y*y - 3) / 6
+ ss = 1 / (2*p - 1)
+ tt = 1 / (2*q - 1)
+ hh = 2 / (ss + tt)
+ ww = y * sqrt(hh + rr) / hh - (tt - ss) * (rr + 5/6 - 2 / (3 * hh))
+ in p / (p + q * exp(2 * ww))
+ | t' <= 0 = 1 - exp( (log((1 - a) * q) + beta) / q )
+ | t'' <= 1 = exp( (log(a * p) + beta) / p )
+ | otherwise = 1 - 2 / (t'' + 1)
+ where
+ r = sqrt ( - log ( a * a ) )
+ y = r - ( 2.30753 + 0.27061 * r )
+ / ( 1.0 + ( 0.99229 + 0.04481 * r ) * r )
+ t = 1 / (9 * q)
+ t' = 2 * q * (1 - t + y * sqrt t) ** 3
+ t'' = (4*p + 2*q - 2) / t'
+
+
+
-- | Compute the natural logarithm of 1 + @x@. This is accurate even
-- for values of @x@ near zero, where use of @log(1+x)@ would lose
-- precision.
@@ -347,6 +526,77 @@ log1p x
-0.10324619158271569595141333961932e-15
]
+-- | Calculate the error term of the Stirling approximation. This is
+-- only defined for non-negative values.
+--
+-- > stirlingError n = log(n!) - log(sqrt(2*pi*n)*(n/e)^n)
+stirlingError :: Double -> Double
+stirlingError n
+ | n <= 15.0 = case properFraction (n+n) of
+ (i,0) -> sfe `U.unsafeIndex` i
+ _ -> logGamma (n+1.0) - (n+0.5) * log n + n -
+ m_ln_sqrt_2_pi
+ | n > 500 = (s0-s1/nn)/n
+ | n > 80 = (s0-(s1-s2/nn)/nn)/n
+ | n > 35 = (s0-(s1-(s2-s3/nn)/nn)/nn)/n
+ | otherwise = (s0-(s1-(s2-(s3-s4/nn)/nn)/nn)/nn)/n
+ where
+ nn = n*n
+ s0 = 0.083333333333333333333 -- 1/12
+ s1 = 0.00277777777777777777778 -- 1/360
+ s2 = 0.00079365079365079365079365 -- 1/1260
+ s3 = 0.000595238095238095238095238 -- 1/1680
+ s4 = 0.0008417508417508417508417508 -- 1/1188
+ sfe = U.fromList [ 0.0,
+ 0.1534264097200273452913848, 0.0810614667953272582196702,
+ 0.0548141210519176538961390, 0.0413406959554092940938221,
+ 0.03316287351993628748511048, 0.02767792568499833914878929,
+ 0.02374616365629749597132920, 0.02079067210376509311152277,
+ 0.01848845053267318523077934, 0.01664469118982119216319487,
+ 0.01513497322191737887351255, 0.01387612882307074799874573,
+ 0.01281046524292022692424986, 0.01189670994589177009505572,
+ 0.01110455975820691732662991, 0.010411265261972096497478567,
+ 0.009799416126158803298389475, 0.009255462182712732917728637,
+ 0.008768700134139385462952823, 0.008330563433362871256469318,
+ 0.007934114564314020547248100, 0.007573675487951840794972024,
+ 0.007244554301320383179543912, 0.006942840107209529865664152,
+ 0.006665247032707682442354394, 0.006408994188004207068439631,
+ 0.006171712263039457647532867, 0.005951370112758847735624416,
+ 0.005746216513010115682023589, 0.005554733551962801371038690 ]
+
+
+-- | Evaluate the deviance term @x log(x/np) + np - x@.
+bd0 :: Double -- ^ @x@
+ -> Double -- ^ @np@
+ -> Double
+bd0 x np
+ | isInfinite x || isInfinite np || np == 0 = m_NaN
+ | abs x_np >= 0.1*(x+np) = x * log (x/np) - x_np
+ | otherwise = loop 1 (ej0*vv) s0
+ where
+ x_np = x - np
+ v = x_np / (x+np)
+ s0 = x_np * v
+ ej0 = 2*x*v
+ vv = v*v
+ loop j ej s = case s + ej/(2*j+1) of
+ s' | s' == s -> s'
+ | otherwise -> loop (j+1) (ej*vv) s'
+
+-- | /O(log n)/ Compute the logarithm in base 2 of the given value.
+log2 :: Int -> Int
+log2 v0
+ | v0 <= 0 = error "Statistics.Math.log2: invalid input"
+ | otherwise = go 5 0 v0
+ where
+ go !i !r !v | i == -1 = r
+ | v .&. b i /= 0 = let si = U.unsafeIndex sv i
+ in go (i-1) (r .|. si) (v `shiftR` si)
+ | otherwise = go (i-1) r v
+ b = U.unsafeIndex bv
+ !bv = U.fromList [0x2, 0xc, 0xf0, 0xff00, 0xffff0000, 0xffffffff00000000]
+ !sv = U.fromList [1,2,4,8,16,32]
+
-- $references
--
-- * Broucke, R. (1973) Algorithm 446: Ten subroutines for the
@@ -361,6 +611,9 @@ log1p x
-- function. /SIAM Journal on Numerical Analysis B/
-- 1:86&#8211;96. <http://www.jstor.org/stable/2949767>
--
+-- * Loader, C. (2000) Fast and Accurate Computation of Binomial
+-- Probabilities. <http://projects.scipy.org/scipy/raw-attachment/ticket/620/loader2000Fast.pdf>
+--
-- * Macleod, A.J. (1989) Algorithm AS 245: A robust and reliable
-- algorithm for the logarithm of the gamma function.
-- /Journal of the Royal Statistical Society, Series C (Applied Statistics)/
@@ -369,3 +622,21 @@ log1p x
-- * Shea, B. (1988) Algorithm AS 239: Chi-squared and incomplete
-- gamma integral. /Applied Statistics/
-- 37(3):466&#8211;473. <http://www.jstor.org/stable/2347328>
+--
+-- * K. L. Majumder, G. P. Bhattacharjee (1973) Algorithm AS 63: The
+-- Incomplete Beta Integral. /Journal of the Royal Statistical
+-- Society. Series C (Applied Statistics)/ Vol. 22, No. 3 (1973),
+-- pp. 409-411. <http://www.jstor.org/pss/2346797>
+--
+-- * K. L. Majumder, G. P. Bhattacharjee (1973) Algorithm AS 64:
+-- Inverse of the Incomplete Beta Function Ratio. /Journal of the
+-- Royal Statistical Society. Series C (Applied Statistics)/
+-- Vol. 22, No. 3 (1973), pp. 411-414
+-- <http://www.jstor.org/pss/2346798>
+--
+-- * G. W. Cran, K. J. Martin and G. E. Thomas (1977) Remark AS R19
+-- and Algorithm AS 109: A Remark on Algorithms: AS 63: The
+-- Incomplete Beta Integral AS 64: Inverse of the Incomplete Beta
+-- Function Ratio. /Journal of the Royal Statistical Society. Series
+-- C (Applied Statistics)/ Vol. 26, No. 1 (1977), pp. 111-114
+-- <http://www.jstor.org/pss/2346887>
diff --git a/Statistics/Math/RootFinding.hs b/Statistics/Math/RootFinding.hs
new file mode 100644
index 0000000..143ca2b
--- /dev/null
+++ b/Statistics/Math/RootFinding.hs
@@ -0,0 +1,127 @@
+{-# LANGUAGE BangPatterns, DeriveDataTypeable #-}
+
+-- |
+-- Module : Statistics.Math.RootFinding
+-- Copyright : (c) 2011 Bryan O'Sullivan
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Haskell functions for finding the roots of mathematical functions.
+
+module Statistics.Math.RootFinding
+ (
+ Root(..)
+ , fromRoot
+ , ridders
+ -- * References
+ -- $references
+ ) where
+
+import Statistics.Function.Comparison
+
+import Control.Applicative
+import Control.Monad (MonadPlus(..), ap)
+import Data.Typeable (Typeable)
+
+
+-- | The result of searching for a root of a mathematical function.
+data Root a = NotBracketed
+ -- ^ The function does not have opposite signs when
+ -- evaluated at the lower and upper bounds of the search.
+ | SearchFailed
+ -- ^ The search failed to converge to within the given
+ -- error tolerance after the given number of iterations.
+ | Root a
+ -- ^ A root was successfully found.
+ deriving (Eq, Read, Show, Typeable)
+
+instance Functor Root where
+ fmap _ NotBracketed = NotBracketed
+ fmap _ SearchFailed = SearchFailed
+ fmap f (Root a) = Root (f a)
+
+instance Monad Root where
+ NotBracketed >>= _ = NotBracketed
+ SearchFailed >>= _ = SearchFailed
+ Root a >>= m = m a
+
+ return = Root
+
+instance MonadPlus Root where
+ mzero = SearchFailed
+
+ r@(Root _) `mplus` _ = r
+ _ `mplus` p = p
+
+instance Applicative Root where
+ pure = Root
+ (<*>) = ap
+
+instance Alternative Root where
+ empty = SearchFailed
+
+ r@(Root _) <|> _ = r
+ _ <|> p = p
+
+-- | Returns either the result of a search for a root, or the default
+-- value if the search failed.
+fromRoot :: a -- ^ Default value.
+ -> Root a -- ^ Result of search for a root.
+ -> a
+fromRoot _ (Root a) = a
+fromRoot a _ = a
+
+
+-- | Use the method of Ridders to compute a root of a function.
+--
+-- The function must have opposite signs when evaluated at the lower
+-- and upper bounds of the search (i.e. the root must be bracketed).
+ridders :: Double -- ^ Absolute error tolerance.
+ -> (Double,Double) -- ^ Lower and upper bounds for the search.
+ -> (Double -> Double) -- ^ Function to find the roots of.
+ -> Root Double
+ridders tol (lo,hi) f
+ | flo == 0 = Root lo
+ | fhi == 0 = Root hi
+ | flo*fhi > 0 = NotBracketed -- root is not bracketed
+ | otherwise = go lo flo hi fhi 0
+ where
+ go !a !fa !b !fb !i
+ -- Root is bracketed within 1 ulp. No improvement could be made
+ | within 1 a b = Root a
+      -- Root is found. Checking that f(m) == 0 is necessary to ensure
+      -- that a root is never passed to 'go'
+ | fm == 0 = Root m
+ | fn == 0 = Root n
+ | d < tol = Root n
+ -- Too many iterations performed. Fail
+ | i >= (100 :: Int) = SearchFailed
+      -- Ridders' approximation coincides with one of the old
+      -- bounds. Revert to bisection
+ | n == a || n == b = case () of
+ _| fm*fa < 0 -> go a fa m fm (i+1)
+ | otherwise -> go m fm b fb (i+1)
+ -- Proceed as usual
+ | fn*fm < 0 = go n fn m fm (i+1)
+ | fn*fa < 0 = go a fa n fn (i+1)
+ | otherwise = go n fn b fb (i+1)
+ where
+ d = abs (b - a)
+ dm = (b - a) * 0.5
+ !m = a + dm
+ !fm = f m
+ !dn = signum (fb - fa) * dm * fm / sqrt(fm*fm - fa*fb)
+ !n = m - signum dn * min (abs dn) (abs dm - 0.5 * tol)
+ !fn = f n
+ !flo = f lo
+ !fhi = f hi
+
+
+-- $references
+--
+-- * Ridders, C.F.J. (1979) A new algorithm for computing a single
+-- root of a real continuous function.
+-- /IEEE Transactions on Circuits and Systems/ 26:979&#8211;980.
diff --git a/Statistics/Quantile.hs b/Statistics/Quantile.hs
index 4ed1bec..2747d25 100644
--- a/Statistics/Quantile.hs
+++ b/Statistics/Quantile.hs
@@ -50,7 +50,9 @@ weightedAvg :: G.Vector v Double =>
-> Int -- ^ /q/, the number of quantiles.
-> v Double -- ^ /x/, the sample data.
-> Double
-weightedAvg k q x =
+weightedAvg k q x
+ | n == 1 = G.head x
+ | otherwise =
assert (q >= 2) .
assert (k >= 0) .
assert (k < q) .
@@ -58,11 +60,12 @@ weightedAvg k q x =
xj + g * (xj1 - xj)
where
j = floor idx
- idx = fromIntegral (G.length x - 1) * fromIntegral k / fromIntegral q
+ idx = fromIntegral (n - 1) * fromIntegral k / fromIntegral q
g = idx - fromIntegral j
xj = sx ! j
xj1 = sx ! (j+1)
sx = partialSort (j+2) x
+ n = G.length x
{-# INLINE weightedAvg #-}
-- | Parameters /a/ and /b/ to the 'continuousBy' function.
diff --git a/Statistics/Resampling.hs b/Statistics/Resampling.hs
index 961eec1..1ad9a02 100644
--- a/Statistics/Resampling.hs
+++ b/Statistics/Resampling.hs
@@ -25,7 +25,7 @@ import Data.Vector.Algorithms.Intro (sort)
import Data.Vector.Generic (unsafeFreeze)
import Data.Word (Word32)
import GHC.Conc (numCapabilities)
-import Statistics.Function (create, indices)
+import Statistics.Function (indices)
import Statistics.Types (Estimator, Sample)
import System.Random.MWC (Gen, initialize, uniform, uniformVector)
import qualified Data.Vector.Unboxed as U
@@ -61,14 +61,14 @@ resample gen ests numResamples samples = do
zipWith (+) (replicate numCapabilities q)
(replicate r 1 ++ repeat 0)
where (q,r) = numResamples `quotRem` numCapabilities
- results <- mapM (const (MU.new numResamples)) $ ests
+ results <- mapM (const (MU.new numResamples)) ests
done <- newChan
forM_ (zip ixs (tail ixs)) $ \ (start,!end) -> do
gen' <- initialize =<< (uniformVector gen 256 :: IO (U.Vector Word32))
forkIO $ do
let loop k ers | k >= end = writeChan done ()
| otherwise = do
- re <- create numSamples $ \_ -> do
+ re <- U.replicateM numSamples $ do
r <- uniform gen'
return (U.unsafeIndex samples (r `mod` numSamples))
forM_ ers $ \(est,arr) ->
diff --git a/Statistics/Resampling/Bootstrap.hs b/Statistics/Resampling/Bootstrap.hs
index b0b3c84..c84c497 100644
--- a/Statistics/Resampling/Bootstrap.hs
+++ b/Statistics/Resampling/Bootstrap.hs
@@ -20,11 +20,9 @@ module Statistics.Resampling.Bootstrap
-- $references
) where
-import Control.Applicative ((<$>), (<*>), empty)
import Control.DeepSeq (NFData)
import Control.Exception (assert)
import Control.Monad.Par (runPar, parMap)
-import Data.Aeson.Types
import Data.Data (Data)
import Data.Typeable (Typeable)
import Data.Vector.Unboxed ((!))
@@ -51,22 +49,6 @@ data Estimate = Estimate {
instance NFData Estimate
-instance ToJSON Estimate where
- toJSON Estimate{..} = object [
- "estPoint" .= estPoint
- , "estLowerBound" .= estLowerBound
- , "estUpperBound" .= estUpperBound
- , "estConfidenceLevel" .= estConfidenceLevel
- ]
-
-instance FromJSON Estimate where
- parseJSON (Object v) = Estimate <$>
- v .: "estPoint" <*>
- v .: "estLowerBound" <*>
- v .: "estUpperBound" <*>
- v .: "estConfidenceLevel"
- parseJSON _ = empty
-
-- | Multiply the point, lower bound, and upper bound in an 'Estimate'
-- by the given value.
scale :: Double -- ^ Value to multiply by.
diff --git a/Statistics/Sample.hs b/Statistics/Sample.hs
index b380174..671a137 100644
--- a/Statistics/Sample.hs
+++ b/Statistics/Sample.hs
@@ -60,14 +60,14 @@ import qualified Data.Vector.Generic as G
-- Operator ^ will be overriden
import Prelude hiding ((^))
--- | Range. The difference between the largest and smallest elements
--- of a sample.
+-- | /O(n)/ Range. The difference between the largest and smallest
+-- elements of a sample.
range :: (G.Vector v Double) => v Double -> Double
range s = hi - lo
where (lo , hi) = minMax s
{-# INLINE range #-}
--- | Arithmetic mean. This uses Welford's algorithm to provide
+-- | /O(n)/ Arithmetic mean. This uses Welford's algorithm to provide
-- numerical stability, using a single pass over the sample data.
mean :: (G.Vector v Double) => v Double -> Double
mean = fini . G.foldl' go (T 0 0)
@@ -78,8 +78,8 @@ mean = fini . G.foldl' go (T 0 0)
n' = n + 1
{-# INLINE mean #-}
--- | Arithmetic mean for weighted sample. It uses algorithm analogous
--- to one in 'mean'
+-- | /O(n)/ Arithmetic mean for weighted sample. It uses a single-pass
+-- algorithm analogous to the one used by 'mean'.
meanWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double
meanWeighted = fini . G.foldl' go (V 0 0)
where
@@ -90,8 +90,8 @@ meanWeighted = fini . G.foldl' go (V 0 0)
w' = w + xw
{-# INLINE meanWeighted #-}
--- | Harmonic mean. This algorithm performs a single pass over the
--- sample.
+-- | /O(n)/ Harmonic mean. This algorithm performs a single pass over
+-- the sample.
harmonicMean :: (G.Vector v Double) => v Double -> Double
harmonicMean = fini . G.foldl' go (T 0 0)
where
@@ -99,7 +99,7 @@ harmonicMean = fini . G.foldl' go (T 0 0)
go (T x y) n = T (x + (1/n)) (y+1)
{-# INLINE harmonicMean #-}
--- | Geometric mean of a sample containing no negative values.
+-- | /O(n)/ Geometric mean of a sample containing no negative values.
geometricMean :: (G.Vector v Double) => v Double -> Double
geometricMean = fini . G.foldl' go (T 1 0)
where
diff --git a/Statistics/Sample/Histogram.hs b/Statistics/Sample/Histogram.hs
new file mode 100644
index 0000000..13fdeff
--- /dev/null
+++ b/Statistics/Sample/Histogram.hs
@@ -0,0 +1,97 @@
+{-# LANGUAGE FlexibleContexts #-}
+
+-- |
+-- Module : Statistics.Sample.Histogram
+-- Copyright : (c) 2011 Bryan O'Sullivan
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Functions for computing histograms of sample data.
+
+module Statistics.Sample.Histogram
+ (
+ histogram
+ -- * Building blocks
+ , histogram_
+ , range
+ ) where
+
+import Statistics.Function (minMax)
+import qualified Data.Vector.Generic as G
+import qualified Data.Vector.Generic.Mutable as GM
+
+-- | /O(n)/ Compute a histogram over a data set.
+--
+-- The result consists of a pair of vectors:
+--
+-- * The lower bound of each interval.
+-- * The number of samples within the interval.
+--
+-- Interval (bin) sizes are uniform, and the upper and lower bounds
+-- are chosen automatically using the 'range' function. To specify
+-- these parameters directly, use the 'histogram_' function.
+histogram :: (G.Vector v0 Double, G.Vector v1 Double, Num b, G.Vector v1 b) =>
+ Int -- ^ Number of bins (must be positive).
+ -> v0 Double -- ^ Sample data (cannot be empty).
+ -> (v1 Double, v1 b)
+histogram numBins xs = (G.generate numBins step, histogram_ numBins lo hi xs)
+ where (lo,hi) = range numBins xs
+ step i = lo + d * fromIntegral i
+ d = (hi - lo) / fromIntegral numBins
+{-# INLINE histogram #-}
+
+-- | /O(n)/ Compute a histogram over a data set.
+--
+-- Interval (bin) sizes are uniform, based on the supplied upper
+-- and lower bounds.
+histogram_ :: (Num b, RealFrac a, G.Vector v0 a, G.Vector v1 b) =>
+ Int
+ -- ^ Number of bins. This value must be positive. A zero
+ -- or negative value will cause an error.
+ -> a
+ -- ^ Lower bound on interval range. Sample data less than
+ -- this will cause an error.
+ -> a
+ -- ^ Upper bound on interval range. This value must not be
+ -- less than the lower bound. Sample data that falls above
+ -- the upper bound will cause an error.
+ -> v0 a
+ -- ^ Sample data.
+ -> v1 b
+histogram_ numBins lo hi xs0 = G.create (GM.replicate numBins 0 >>= bin xs0)
+ where
+ bin xs bins = go 0
+ where
+ go i | i >= len = return bins
+ | otherwise = do
+ let x = xs `G.unsafeIndex` i
+ b = truncate $ (x - lo) / d
+ GM.write bins b . (+1) =<< GM.read bins b
+ go (i+1)
+ len = G.length xs
+ d = (hi - lo) / fromIntegral numBins
+{-# INLINE histogram_ #-}
+
+-- | /O(n)/ Compute decent defaults for the lower and upper bounds of
+-- a histogram, based on the desired number of bins and the range of
+-- the sample data.
+--
+-- The upper and lower bounds used are @(lo-d, hi+d)@, where
+--
+-- @d = (maximum sample - minimum sample) / ((bins - 1) * 2)@
+range :: (G.Vector v Double) =>
+ Int -- ^ Number of bins (must be positive).
+ -> v Double -- ^ Sample data (cannot be empty).
+ -> (Double, Double)
+range numBins xs
+ | numBins < 1 = error "Statistics.Histogram.range: invalid bin count"
+ | G.null xs = error "Statistics.Histogram.range: empty sample"
+ | otherwise = (lo-d, hi+d)
+ where
+ d | numBins == 1 = 0
+ | otherwise = (hi - lo) / ((fromIntegral numBins - 1) * 2)
+ (lo,hi) = minMax xs
+{-# INLINE range #-}
diff --git a/Statistics/Sample/KernelDensity.hs b/Statistics/Sample/KernelDensity.hs
new file mode 100644
index 0000000..57c01bc
--- /dev/null
+++ b/Statistics/Sample/KernelDensity.hs
@@ -0,0 +1,108 @@
+{-# LANGUAGE BangPatterns, FlexibleContexts, UnboxedTuples #-}
+-- |
+-- Module : Statistics.Sample.KernelDensity
+-- Copyright : (c) 2011 Bryan O'Sullivan
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Kernel density estimation. This module provides a fast, robust,
+-- non-parametric way to estimate the probability density function of
+-- a sample.
+--
+-- This estimator does not use the commonly employed \"Gaussian rule
+-- of thumb\". As a result, it outperforms many plug-in methods on
+-- multimodal samples with widely separated modes.
+
+module Statistics.Sample.KernelDensity
+ (
+ -- * Estimation functions
+ kde
+ , kde_
+ -- * References
+ -- $references
+ ) where
+
+import Data.Complex (Complex(..))
+import Prelude hiding (const,min,max)
+import Statistics.Constants (m_sqrt_2_pi)
+import Statistics.Function (minMax, nextHighestPowerOfTwo)
+import Statistics.Math.RootFinding (fromRoot, ridders)
+import Statistics.Sample.Histogram (histogram_)
+import Statistics.Transform (dct, idct)
+import qualified Data.Vector.Generic as G
+import qualified Data.Vector.Unboxed as U
+
+-- | Gaussian kernel density estimator for one-dimensional data, using
+-- the method of Botev et al.
+--
+-- The result is a pair of vectors, containing:
+--
+-- * The coordinates of each mesh point. The mesh interval is chosen
+-- to be 20% larger than the range of the sample. (To specify the
+-- mesh interval, use 'kde_'.)
+--
+-- * Density estimates at each mesh point.
+kde :: Int
+ -- ^ The number of mesh points to use in the uniform discretization
+ -- of the interval @(min,max)@. If this value is not a power of
+ -- two, then it is rounded up to the next power of two.
+ -> U.Vector Double -> (U.Vector Double, U.Vector Double)
+kde n0 xs = kde_ n0 (lo - range / 10) (hi + range / 10) xs
+ where
+ (lo,hi) = minMax xs
+ range = hi - lo
+
+-- | Gaussian kernel density estimator for one-dimensional data, using
+-- the method of Botev et al.
+--
+-- The result is a pair of vectors, containing:
+--
+-- * The coordinates of each mesh point.
+--
+-- * Density estimates at each mesh point.
+kde_ :: Int
+ -- ^ The number of mesh points to use in the uniform discretization
+ -- of the interval @(min,max)@. If this value is not a power of
+ -- two, then it is rounded up to the next power of two.
+ -> Double
+ -- ^ Lower bound (@min@) of the mesh range.
+ -> Double
+ -- ^ Upper bound (@max@) of the mesh range.
+ -> U.Vector Double -> (U.Vector Double, U.Vector Double)
+kde_ n0 min max xs
+ | n0 < 1 = error "Statistics.KernelDensity.kde: invalid number of points"
+ | otherwise = (mesh, density)
+ where
+ mesh = G.generate ni $ \z -> min + (d * fromIntegral z)
+ where d = r / (n-1)
+ density = G.map (/r) . idct $ G.zipWith f a (G.enumFromTo 0 (n-1))
+ where f b z = b * exp (sqr z * sqr pi * t_star * (-0.5)) :+ 0
+ !n = fromIntegral ni
+ !ni = nextHighestPowerOfTwo n0
+ !r = max - min
+ a = dct . G.map (/ G.sum h) $ h
+ where h = G.map (/ (len :+ 0)) $ histogram_ ni min max xs
+ !len = fromIntegral (G.length xs)
+ !t_star = fromRoot (0.28 * len ** (-0.4)) . ridders 1e-14 (0,0.1) $ \x ->
+ x - (len * (2 * sqrt pi) * go 6 (f 7 x)) ** (-0.4)
+ where
+ f q t = 2 * pi ** (q*2) * G.sum (G.zipWith g iv a2v)
+ where g i a2 = i ** q * a2 * exp ((-i) * sqr pi * t)
+ a2v = G.map (sqr . (*0.5)) $ G.tail a
+ iv = G.map sqr $ G.enumFromTo 1 (n-1)
+ go s !h | s == 1 = h
+ | otherwise = go (s-1) (f s time)
+ where time = (2 * const * k0 / len / h) ** (2 / (3 + 2 * s))
+ const = (1 + 0.5 ** (s+0.5)) / 3
+ k0 = U.product (G.enumFromThenTo 1 3 (2*s-1)) / m_sqrt_2_pi
+ _bandwidth = sqrt t_star * r
+ sqr x = x * x
+
+-- $references
+--
+-- Botev, Z.I., Grotowski, J.F., Kroese, D.P. (2010). Kernel density
+-- estimation via diffusion. /Annals of Statistics/
+-- 38(5):2916&#8211;2957. <http://arxiv.org/pdf/1011.2602>
diff --git a/Statistics/KernelDensity.hs b/Statistics/Sample/KernelDensity/Simple.hs
index 3b24f99..f0473d6 100644
--- a/Statistics/KernelDensity.hs
+++ b/Statistics/Sample/KernelDensity/Simple.hs
@@ -1,6 +1,6 @@
{-# LANGUAGE FlexibleContexts #-}
-- |
--- Module : Statistics.KernelDensity
+-- Module : Statistics.Sample.KernelDensity.Simple
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License : BSD3
--
@@ -10,8 +10,14 @@
--
-- Kernel density estimation code, providing non-parametric ways to
-- estimate the probability density function of a sample.
+--
+-- The techniques used by functions in this module are relatively
+-- fast, but they generally give inferior results to the KDE function
+-- in the main "Statistics.Sample.KernelDensity" module (due to the
+-- oversmoothing documented for 'bandwidth' below).
-module Statistics.KernelDensity
+module Statistics.Sample.KernelDensity.Simple
+ {-# DEPRECATED "Use Statistics.Sample.KernelDensity instead." #-}
(
-- * Simple entry points
epanechnikovPDF
@@ -36,6 +42,8 @@ module Statistics.KernelDensity
-- ** Low-level estimation
, estimatePDF
, simplePDF
+ -- * References
+ -- $references
) where
import Statistics.Constants (m_1_sqrt_2, m_2_sqrt_pi)
@@ -60,8 +68,13 @@ gaussianBW n = (4 / (n * 3)) ** 0.2
-- | The width of the convolution kernel used.
type Bandwidth = Double
--- | Compute the optimal bandwidth from the observed data for the given
--- kernel.
+-- | Compute the optimal bandwidth from the observed data for the
+-- given kernel.
+--
+-- This function uses an estimate based on the standard deviation of a
+-- sample (due to Deheuvels), which performs reasonably well for
+-- unimodal distributions but leads to oversmoothing for more complex
+-- ones.
bandwidth :: G.Vector v Double =>
(Double -> Bandwidth)
-> v Double
@@ -170,3 +183,9 @@ gaussianPDF = simplePDF gaussianBW gaussianKernel 3
errorShort :: String -> a
errorShort func = error ("Statistics.KernelDensity." ++ func ++
": at least two points required")
+
+-- $references
+--
+-- * Deheuvels, P. (1977) Estimation non paramétrique de la densité
+-- par histogrammes
+-- généralisés. <http://archive.numdam.org/article/RSA_1977__25_3_5_0.pdf>
diff --git a/Statistics/Test/Internal.hs b/Statistics/Test/Internal.hs
new file mode 100644
index 0000000..4383a50
--- /dev/null
+++ b/Statistics/Test/Internal.hs
@@ -0,0 +1,46 @@
+{-# LANGUAGE FlexibleContexts #-}
+module Statistics.Test.Internal (
+ rank
+ , splitByTags
+ ) where
+
+import qualified Data.Vector.Generic as G
+
+
+
+-- Private data type for unfolding
+data Rank v a = Rank {
+ rankCnt :: {-# UNPACK #-} !Int -- Number of ranks to return
+ , rankVal :: {-# UNPACK #-} !Double -- Rank to return
+ , rankNum :: {-# UNPACK #-} !Double -- Current rank
+ , rankVec :: v a -- Remaining vector
+ }
+
+-- | Calculate rank of sample. Sample should be already sorted.
+rank :: (G.Vector v a, G.Vector v Double)
+ => (a -> a -> Bool) -- ^ Equivalence relation
+ -> v a -- ^ Vector to rank
+ -> v Double
+rank eq vec = G.unfoldr go (Rank 0 (-1) 1 vec)
+ where
+ go (Rank 0 _ r v)
+ | G.null v = Nothing
+ | otherwise =
+ case G.length h of
+ 1 -> Just (r, Rank 0 0 (r+1) rest)
+ n -> go Rank { rankCnt = n
+ , rankVal = 0.5 * (r*2 + fromIntegral (n-1))
+ , rankNum = r + fromIntegral n
+ , rankVec = rest
+ }
+ where
+ (h,rest) = G.span (eq $ G.head v) v
+ go (Rank n val r v) = Just (val, Rank (n-1) val r v)
+{-# INLINE rank #-}
+
+-- | Split tagged vector
+splitByTags :: (G.Vector v a, G.Vector v (Bool,a)) => v (Bool,a) -> (v a, v a)
+splitByTags vs = (G.map snd a, G.map snd b)
+ where
+ (a,b) = G.unstablePartition fst vs
+{-# INLINE splitByTags #-}
diff --git a/Statistics/Test/MannWhitneyU.hs b/Statistics/Test/MannWhitneyU.hs
new file mode 100644
index 0000000..f4bad2d
--- /dev/null
+++ b/Statistics/Test/MannWhitneyU.hs
@@ -0,0 +1,238 @@
+-- |
+-- Module : Statistics.Test.MannWhitneyU
+-- Copyright : (c) 2010 Neil Brown
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Mann-Whitney U test (also known as the Mann-Whitney-Wilcoxon and
+-- Wilcoxon rank sum test) is a non-parametric test for assessing
+-- whether two samples of independent observations have different
+-- means.
+module Statistics.Test.MannWhitneyU (
+ -- * Mann-Whitney U test
+ mannWhitneyUtest
+ , mannWhitneyU
+ , mannWhitneyUCriticalValue
+ , mannWhitneyUSignificant
+ -- ** Wilcoxon rank sum test
+ , wilcoxonRankSums
+ -- * Data types
+ , TestType(..)
+ , TestResult(..)
+ -- * References
+ -- $references
+ ) where
+
+import Control.Applicative ((<$>))
+import Data.List (findIndex)
+import Data.Ord (comparing)
+import qualified Data.Vector.Unboxed as U
+
+import Statistics.Distribution (quantile)
+import Statistics.Distribution.Normal (standard)
+import Statistics.Math (choose)
+import Statistics.Types (Sample)
+import Statistics.Function (sortBy)
+import Statistics.Test.Types
+import Statistics.Test.Internal
+
+
+
+-- | The Wilcoxon Rank Sums Test.
+--
+-- This test calculates the sum of ranks for the given two samples. The samples
+-- are ordered, and assigned ranks (ties are given their average rank), then these
+-- ranks are summed for each sample.
+--
+-- The return value is (W&#8321;, W&#8322;) where W&#8321; is the sum of ranks of the first sample
+-- and W&#8322; is the sum of ranks of the second sample. This test is trivially transformed
+-- into the Mann-Whitney U test. You will probably want to use 'mannWhitneyU'
+-- and the related functions for testing significance, but this function is exposed
+-- for completeness.
+wilcoxonRankSums :: Sample -> Sample -> (Double, Double)
+wilcoxonRankSums xs1 xs2 = ( U.sum ranks1 , U.sum ranks2
+ )
+ where
+ -- Ranks for each sample
+ (ranks1,ranks2) = splitByTags $ U.zip tags (rank (==) joinSample)
+ -- Sorted and tagged sample
+ (tags,joinSample) = U.unzip
+ $ sortBy (comparing snd)
+ $ tagSample True xs1 U.++ tagSample False xs2
+ -- Add tag to a sample
+ tagSample t = U.map ((,) t)
+
+
+
+-- | The Mann-Whitney U Test.
+--
+-- This is sometimes known as the Mann-Whitney-Wilcoxon U test, and
+-- confusingly many sources state that the Mann-Whitney U test is the same as
+-- the Wilcoxon's rank sum test (which is provided as 'wilcoxonRankSums').
+-- The Mann-Whitney U is a simple transform of Wilcoxon's rank sum test.
+--
+-- Again confusingly, different sources state reversed definitions for U&#8321;
+-- and U&#8322;, so it is worth being explicit about what this function returns.
+-- Given two samples, the first, xs&#8321;, of size n&#8321; and the second, xs&#8322;,
+-- of size n&#8322;, this function returns (U&#8321;, U&#8322;)
+-- where U&#8321; = W&#8321; - (n&#8321;(n&#8321;+1))\/2
+-- and U&#8322; = W&#8322; - (n&#8322;(n&#8322;+1))\/2,
+-- where (W&#8321;, W&#8322;) is the return value of @wilcoxonRankSums xs1 xs2@.
+--
+-- Some sources instead state that U&#8321; and U&#8322; should be the other way round, often
+-- expressing this using U&#8321;' = n&#8321;n&#8322; - U&#8321; (since U&#8321; + U&#8322; = n&#8321;n&#8322;).
+--
+-- All of which you probably don't care about if you just feed this into 'mannWhitneyUSignificant'.
+mannWhitneyU :: Sample -> Sample -> (Double, Double)
+mannWhitneyU xs1 xs2
+ = (fst summedRanks - (n1*(n1 + 1))/2
+ ,snd summedRanks - (n2*(n2 + 1))/2)
+ where
+ n1 = fromIntegral $ U.length xs1
+ n2 = fromIntegral $ U.length xs2
+
+ summedRanks = wilcoxonRankSums xs1 xs2
+
+-- | Calculates the critical value of Mann-Whitney U for the given sample
+-- sizes and significance level.
+--
+-- This function returns the exact calculated value of U for all sample sizes;
+-- it does not use the normal approximation at all. Above sample size 20 it is
+-- generally recommended to use the normal approximation instead, but this function
+-- will calculate the higher critical values if you need them.
+--
+-- The algorithm to generate these values is a faster, memoised version of the
+-- simple unoptimised generating function given in section 2 of \"The Mann Whitney
+-- Wilcoxon Distribution Using Linked Lists\"
+mannWhitneyUCriticalValue :: (Int, Int) -- ^ The sample size
+ -> Double -- ^ The p-value (e.g. 0.05) for which you want the critical value.
+ -> Maybe Int -- ^ The critical value (of U).
+mannWhitneyUCriticalValue (m, n) p
+ | m < 1 || n < 1 = Nothing -- Sample must be nonempty
+ | p >= 1 = Nothing -- Nonsensical p-value
+  | p' <= 1          = Nothing    -- p-value is too small. Null hypothesis couldn't be disproved
+ | otherwise = findIndex (>= p')
+ $ take (m*n)
+ $ tail
+ $ alookup !! (m+n-2) !! (min m n - 1)
+ where
+ mnCn = (m+n) `choose` n
+ p' = mnCn * p
+
+
+{-
+-- Original function, without memoisation, from Cheung and Klotz:
+-- Double is needed to avoid integer overflows.
+a :: Int -> Int -> Int -> Double
+a u bigN m
+ | u < 0 = 0
+ | u >= m * n = bigN `choose` m
+ | m == 1 || n == 1 = fromIntegral (u + 1)
+ | otherwise = a u (bigN - 1) m
+ + a (u - n) (bigN - 1) (m-1)
+ where
+ n = bigN - m
+-}
+
+-- Memoised version of the original a function, above.
+--
+-- Doubles are stored to avoid integer overflow. 32-bit Ints begin to
+-- overflow for bigN as small as 33 (64-bit ones at 66), while Doubles
+-- do not overflow to infinity until bigN=1029
+--
+--
+-- outer list is indexed by big N - 2
+-- inner list by (m-1) (we know m < bigN)
+-- innermost list by u
+--
+-- So: (alookup !! (bigN - 2) !! (m - 1) !! u) == a u bigN m
+alookup :: [[[Double]]]
+alookup = gen 2 [1 : repeat 2]
+ where
+ gen bigN predBigNList
+ = let bigNlist = [ [ amemoed u m
+ | u <- [0 .. m*(bigN-m)]
+ ] ++ repeat (bigN `choose` m)
+ | m <- [1 .. (bigN-1)]] -- has bigN-1 elements
+ in bigNlist : gen (bigN+1) bigNlist
+ where
+ amemoed :: Int -> Int -> Double
+ amemoed u m
+ | m == 1 || n == 1 = fromIntegral (u + 1)
+ | otherwise = mList !! u
+ + if u < n then 0 else predmList !! (u-n)
+ where
+ n = bigN - m
+ (predmList : mList : _) = drop (m-2) predBigNList
+ -- Lists for m-1 and m respectively. i-th list correspond to m=i+1
+ --
+ -- We know that predBigNList has bigN - 2 elements
+ -- (and we know that n > 1 therefore bigN > m + 1)
+ -- So bigN - 2 >= m, i.e. predBigNList must have at least m elements
+ -- elements, so dropping (m-2) must leave at least 2
+
+
+-- | Calculates whether the Mann Whitney U test is significant.
+--
+-- If both sample sizes are less than or equal to 20, the exact U critical value
+-- (as calculated by 'mannWhitneyUCriticalValue') is used. If either sample is
+-- larger than 20, the normal approximation is used instead.
+--
+-- If you use a one-tailed test, the test indicates whether the first sample is
+-- significantly larger than the second. If you want the opposite, simply reverse
+-- the order in both the sample size and the (U&#8321;, U&#8322;) pairs.
+mannWhitneyUSignificant ::
+ TestType -- ^ Perform one-tailed test (see description above).
+ -> (Int, Int) -- ^ The samples' size from which the (U&#8321;,U&#8322;) values were derived.
+ -> Double -- ^ The p-value at which to test (e.g. 0.05)
+ -> (Double, Double) -- ^ The (U&#8321;, U&#8322;) values from 'mannWhitneyU'.
+ -> Maybe TestResult -- ^ Return 'Nothing' if the sample was too
+ -- small to make a decision.
+mannWhitneyUSignificant test (in1, in2) p (u1, u2)
+ --Use normal approximation
+ | in1 > 20 || in2 > 20 =
+ let mean = n1 * n2 / 2
+ sigma = sqrt $ n1*n2*(n1 + n2 + 1) / 12
+ z = (mean - u1) / sigma
+ in Just $ case test of
+ OneTailed -> significant $ z < quantile standard p
+ TwoTailed -> significant $ abs z > abs (quantile standard (p/2))
+ -- Use exact critical value
+ | otherwise = do crit <- fromIntegral <$> mannWhitneyUCriticalValue (in1, in2) p
+ return $ case test of
+ OneTailed -> significant $ u2 <= crit
+ TwoTailed -> significant $ min u1 u2 <= crit
+ where
+ n1 = fromIntegral in1
+ n2 = fromIntegral in2
+
+
+-- | Perform Mann-Whitney U Test for two samples and required
+-- significance. For additional information check documentation of
+-- 'mannWhitneyU' and 'mannWhitneyUSignificant'. This is just a helper
+-- function.
+--
+-- One-tailed test checks whether first sample is significantly larger
+-- than second. Two-tailed whether they are significantly different.
+mannWhitneyUtest :: TestType -- ^ Perform one-tailed test (see description above).
+ -> Double -- ^ The p-value at which to test (e.g. 0.05)
+ -> Sample -- ^ First sample
+ -> Sample -- ^ Second sample
+ -> Maybe TestResult
+ -- ^ Return 'Nothing' if the sample was too small to
+ -- make a decision.
+mannWhitneyUtest ontTail p smp1 smp2 =
+ mannWhitneyUSignificant ontTail (n1,n2) p $ mannWhitneyU smp1 smp2
+ where
+ n1 = U.length smp1
+ n2 = U.length smp2
+
+-- $references
+--
+-- * Cheung, Y.K.; Klotz, J.H. (1997) The Mann Whitney Wilcoxon
+-- distribution using linked lists. /Statistica Sinica/
+-- 7:805&#8211;813.
+-- <http://www3.stat.sinica.edu.tw/statistica/oldpdf/A7n316.pdf>.
diff --git a/Statistics/Test/NonParametric.hs b/Statistics/Test/NonParametric.hs
index 2cb108d..dc7d34f 100644
--- a/Statistics/Test/NonParametric.hs
+++ b/Statistics/Test/NonParametric.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE FlexibleContexts #-}
-- |
-- Module : Statistics.Test.NonParametric
-- Copyright : (c) 2010 Neil Brown
@@ -9,317 +10,15 @@
--
-- Functions for performing non-parametric tests (i.e. tests without an assumption
-- of underlying distribution).
-module Statistics.Test.NonParametric
- (-- * Mann-Whitney U test (non-parametric equivalent to the independent t-test)
- mannWhitneyU, mannWhitneyUCriticalValue, mannWhitneyUSignificant,
- -- * Wilcoxon signed-rank matched-pair test (non-parametric equivalent to the paired t-test)
- wilcoxonMatchedPairSignedRank, wilcoxonMatchedPairSignificant, wilcoxonMatchedPairSignificance, wilcoxonMatchedPairCriticalValue,
- -- * Wilcoxon rank sum test
- wilcoxonRankSums) where
-
-import Control.Applicative ((<$>))
-import Control.Arrow ((***))
-import Data.Function (on)
-import Data.List (findIndex, groupBy, partition, sortBy)
-import Data.Ord (comparing)
-import qualified Data.Vector.Unboxed as U (length, toList, zipWith)
-
-import Statistics.Distribution (quantile)
-import Statistics.Distribution.Normal (standard)
-import Statistics.Math (choose)
-import Statistics.Types (Sample)
-
--- | The Wilcoxon Rank Sums Test.
---
--- This test calculates the sum of ranks for the given two samples. The samples
--- are ordered, and assigned ranks (ties are given their average rank), then these
--- ranks are summed for each sample.
---
--- The return value is (W_1, W_2) where W_1 is the sum of ranks of the first sample
--- and W_2 is the sum of ranks of the second sample. This test is trivially transformed
--- into the Mann-Whitney U test. You will probably want to use 'mannWhitneyU'
--- and the related functions for testing significance, but this function is exposed
--- for completeness.
-wilcoxonRankSums :: Sample -> Sample -> (Double, Double)
-wilcoxonRankSums xs1 xs2
- = ((sum . map fst) *** (sum . map fst)) . -- sum the ranks per group
- partition snd . -- split them back into left and right
- concatMap mergeRanks . -- merge the ranks of duplicates
- groupBy ((==) `on` (snd . snd)) . -- group duplicate values
- zip [1..] . -- give them ranks (duplicates receive different ranks here)
- sortBy (comparing snd) $ -- sort by their values
- zip (repeat True) (U.toList xs1) ++ zip (repeat False) (U.toList xs2)
- -- Tag each sample with an identifier before we merge them
- where
- mergeRanks :: [(AbsoluteRank, (Bool, Double))] -> [(AbsoluteRank, Bool)]
- mergeRanks xs = zip (repeat rank) (map (fst . snd) xs)
- where
- -- Ranks are merged by assigning them all the average of their ranks:
- rank = sum (map fst xs) / fromIntegral (length xs)
-
--- | The Mann-Whitney U Test.
---
--- This is sometimes known as the Mann-Whitney-Wilcoxon U test, and
--- confusingly many sources state that the Mann-Whitney U test is the same as
--- the Wilcoxon's rank sum test (which is provided as 'wilcoxonRankSums').
--- The Mann-Whitney U is a simple transform of Wilcoxon's rank sum test.
---
--- Again confusingly, different sources state reversed definitions for U_1 and U_2,
--- so it is worth being explicit about what this function returns. Given two samples,
--- the first, xs_1, of size n_1 and the second, xs_2, of size n_2, this function
--- returns (U_1, U_2) where U_1 = W_1 - (n_1*(n_1+1))\/2 and U_2 = W_2 - (n_2*(n_2+1))\/2,
--- where (W_1, W_2) is the return value of @wilcoxonRankSums xs1 xs2@.
---
--- Some sources instead state that U_1 and U_2 should be the other way round, often
--- expressing this using U_1' = n_1*n_2 - U_1 (since U_1 + U_2 = n_1*n*2).
---
--- All of which you probably don't care about if you just feed this into 'mannWhitneyUSignificant'.
-mannWhitneyU :: Sample -> Sample -> (Double, Double)
-mannWhitneyU xs1 xs2
- = (fst summedRanks - (n1*(n1 + 1))/2
- ,snd summedRanks - (n2*(n2 + 1))/2)
- where
- n1 = fromIntegral $ U.length xs1
- n2 = fromIntegral $ U.length xs2
-
- summedRanks = wilcoxonRankSums xs1 xs2
-
--- | Calculates the critical value of Mann-Whitney U for the given sample
--- sizes and significance level.
---
--- This function returns the exact calculated value of U for all sample sizes;
--- it does not use the normal approximation at all. Above sample size 20 it is
--- generally recommended to use the normal approximation instead, but this function
--- will calculate the higher critical values if you need them.
---
--- The algorithm to generate these values is a faster, memoised version of the
--- simple unoptimised generating function given in section 2 of \"The Mann Whitney
--- Wilcoxon Distribution Using Linked Lists\", Cheung and Klotz, Statistica Sinica
--- 7 (1997), <http://www3.stat.sinica.edu.tw/statistica/oldpdf/A7n316.pdf>.
-mannWhitneyUCriticalValue :: (Int, Int) -- ^ The sample size
- -> Double -- ^ The p-value (e.g. 0.05) for which you want the critical value.
- -> Maybe Int -- ^ The critical value (of U).
-mannWhitneyUCriticalValue (m, n) p
- | p' <= 1 = Nothing
- | m < 1 || n < 1 = Nothing
- | otherwise = findIndex (>= p') $ let
- firstHalf = map fromIntegral $ take (((m*n)+1)`div`2) $ tail $ alookup !! (m+n-2) !! (min m n - 1)
- {- Original: [fromIntegral $ a k (m+n) (min m n) | k <- [1..m*n]] -}
- secondHalf
- | even (m*n) = reverse firstHalf
- | otherwise = tail $ reverse firstHalf
- in firstHalf ++ map (mnCn -) secondHalf
- where
- mnCn = (m+n) `choose` n
- p' = mnCn * p
-{- Original function, without memoisation, from Cheung and Klotz:
-a :: Int -> Int -> Int -> Int
-a u bigN m
- | u < 0 = 0
- | u >= (m * smalln) = floor $ fromIntegral bigN `choose` fromIntegral m
- | m == 1 || smalln == 1 = u + 1
- | otherwise = a u (bigN - 1) m
- + a (u - smalln) (bigN - 1) (m-1)
- where smalln = bigN - m
--}
-
--- Memoised version of the original a function, above.
---
--- outer list is indexed by big N - 2
--- inner list by m (we know m < bigN)
--- innermost list by u
---
--- So: (alookup ! (bigN - 2) ! m ! u) == a u bigN m
-alookup :: [[[Int]]]
-alookup = gen 2 [1 : repeat 2]
- where
- gen bigN predBigNList
- = let bigNlist = [ let limit = round $ fromIntegral bigN `choose` fromIntegral m
- in [amemoed u m | u <- [0..m*(bigN-m)]] ++ repeat limit
- | m <- [1..(bigN-1)]] -- has bigN-1 elements
- in bigNlist : gen (bigN+1) bigNlist
- where
- amemoed :: Int -> Int -> Int
- amemoed u m
- | m == 1 || smalln == 1 = u + 1
- | otherwise = let (predmList : mList : _) = drop (m-2) predBigNList -- m-2 because starts at 1
- -- We know that predBigNList has bigN - 2 elements
- -- (and we know that smalln > 1 therefore bigN > m + 1)
- -- So bigN - 2 >= m, i.e. predBigNList must have at least m elements
- -- elements, so dropping (m-2) must leave at least 2
- in (mList !! u) + (if u < smalln then 0 else predmList !! (u - smalln))
- where smalln = bigN - m
-
--- | Calculates whether the Mann Whitney U test is significant.
---
--- If both sample sizes are less than or equal to 20, the exact U critical value
--- (as calculated by 'mannWhitneyUCriticalValue') is used. If either sample is
--- larger than 20, the normal approximation is used instead.
---
--- If you use a one-tailed test, the test indicates whether the first sample is
--- significantly larger than the second. If you want the opposite, simply reverse
--- the order in both the sample size and the (U_1, U_2) pairs.
-mannWhitneyUSignificant :: Bool -- ^ Perform one-tailed test (see description above).
- -> (Int, Int) -- ^ The sample size from which the (U_1,U_2) values were derived.
- -> Double -- ^ The p-value at which to test (e.g. 0.05)
- -> (Double, Double) -- ^ The (U_1, U_2) values from 'mannWhitneyU'.
- -> Maybe Bool -- ^ Just True if the test is significant, Just
- -- False if it is not, and Nothing if the sample
- -- was too small to make a decision.
-mannWhitneyUSignificant oneTail (in1, in2) p (u1, u2)
- | in1 > 20 || in2 > 20 --Use normal approximation
--- = (n1*(n1+1))/2 - u1 - (n1*(n1+n2))/2
--- = (n1*(n1+1))/2 - (-2*u1 + n1*(n1+n2))/2
--- = (n1*(n1+1) - 2*u1 + n1*(n1+n2))/2
--- = (n1*(2*n1 + n2 + 1) - 2*u1)/2
- = let num = (n1*(2*n1 + n2 + 1)) / 2 - u1
- denom = sqrt $ n1*n2*(n1 + n2 + 1) / 12
- z = num / denom
- zcrit = quantile standard (1 - if oneTail then p else p/2)
- in Just $ (if oneTail then z else abs z) > zcrit
- | otherwise = do crit <- fromIntegral <$> mannWhitneyUCriticalValue (in1, in2) p
- return $ if oneTail
- then u2 <= crit
- else min u1 u2 <= crit
- where
- n1 = fromIntegral in1
- n2 = fromIntegral in2
-
--- | The Wilcoxon matched-pairs signed-rank test.
---
--- The value returned is the pair (T+, T-). T+ is the sum of positive ranks (the
--- ranks of the differences where the first parameter is higher) whereas T- is
--- the sum of negative ranks (the ranks of the differences where the second parameter is higher).
--- These values mean little by themselves, and should be combined with the 'wilcoxonSignificant'
--- function in this module to get a meaningful result.
---
--- The samples are zipped together: if one is longer than the other, both are truncated
--- to the the length of the shorter sample.
---
--- Note that: wilcoxonMatchedPairSignedRank == (\(x, y) -> (y, x)) . flip wilcoxonMatchedPairSignedRank
-wilcoxonMatchedPairSignedRank :: Sample -> Sample -> (Double, Double)
-wilcoxonMatchedPairSignedRank a b
- -- Best to read this function bottom to top:
- = (sum *** sum) . -- Sum the positive and negative ranks separately.
- partition (> 0) . -- Split the ranks into positive and negative. None of the
- -- ranks can be zero.
- concatMap mergeRanks . -- Then merge the ranks for any duplicates by taking
- -- the average of the ranks, and also make the rank
- -- into a signed rank
- groupBy ((==) `on` abs . snd) . -- Now group any duplicates together
- -- Note: duplicate means same absolute difference
- zip [1..] . -- Add a rank (note: at this stage, duplicates will get different ranks)
- dropWhile (== 0) . -- Remove any differences that are zero (i.e. ties in the
- -- original data). We know they must be at the head of
- -- the list because we just sorted it, so dropWhile not filter
- sortBy (comparing abs) . -- Sort the differences by absolute difference
- U.toList $ -- Convert to a list (could be done later in the pipeline?)
- U.zipWith (-) a b -- Work out differences
- where
- mergeRanks :: [(AbsoluteRank, Double)] -> [SignedRank]
- mergeRanks xs = map ((* rank) . signum . snd) xs
- -- Note that signum above will always be 1 or -1; any zero differences will
- -- have been removed before this function is called.
- where
- -- Ranks are merged by assigning them all the average of their ranks:
- rank = sum (map fst xs) / fromIntegral (length xs)
-
-type AbsoluteRank = Double
-type SignedRank = Double
-
--- | The coefficients for x^0, x^1, x^2, etc, in the expression
--- \prod_{r=1}^s (1 + x^r). See the Mitic paper for details.
---
--- We can define:
--- f(1) = 1 + x
--- f(r) = (1 + x^r)*f(r-1)
--- = f(r-1) + x^r * f(r-1)
--- The effect of multiplying the equation by x^r is to shift
--- all the coefficients by r down the list.
---
--- This list will be processed lazily from the head.
-coefficients :: Int -> [Int]
-coefficients 1 = [1, 1] -- 1 + x
-coefficients r = let coeffs = coefficients (r-1)
- (firstR, rest) = splitAt r coeffs
- in firstR ++ add rest coeffs
- where
- add (x:xs) (y:ys) = x + y : add xs ys
- add xs [] = xs
- add [] ys = ys
-
--- This list will be processed lazily from the head.
-summedCoefficients :: Int -> [Double]
-summedCoefficients = map fromIntegral . scanl1 (+) . coefficients
-
--- | Tests whether a given result from a Wilcoxon signed-rank matched-pairs test
--- is significant at the given level.
---
--- This function can perform a one-tailed or two-tailed test. If the first
--- parameter to this function is False, the test is performed two-tailed to
--- check if the two samples differ significantly. If the first parameter is
--- True, the check is performed one-tailed to decide whether the first sample
--- (i.e. the first sample you passed to 'wilcoxonMatchedPairSignedRank') is
--- greater than the second sample (i.e. the second sample you passed to
--- 'wilcoxonMatchedPairSignedRank'). If you wish to perform a one-tailed test
--- in the opposite direction, you can either pass the parameters in a different
--- order to 'wilcoxonMatchedPairSignedRank', or simply swap the values in the resulting
--- pair before passing them to this function.
-wilcoxonMatchedPairSignificant :: Bool -- ^ Perform one-tailed test (see description above).
- -> Int -- ^ The sample size from which the (T+,T-) values were derived.
- -> Double -- ^ The p-value at which to test (e.g. 0.05)
- -> (Double, Double) -- ^ The (T+, T-) values from 'wilcoxonMatchedPairSignedRank'.
- -> Maybe Bool -- ^ Just True if the test is significant, Just
- -- False if it is not, and Nothing if the sample
- -- was too small to make a decision.
-wilcoxonMatchedPairSignificant oneTail sampleSize p (tPlus, tMinus)
- -- According to my nearest book (Understanding Research Methods and Statistics
- -- by Gary W. Heiman, p590), to check that the first sample is bigger you must
- -- use the absolute value of T- for a one-tailed check:
- | oneTail = ((abs tMinus <=) . fromIntegral) <$> wilcoxonMatchedPairCriticalValue sampleSize p
- -- Otherwise you must use the value of T+ and T- with the smallest absolute value:
- | otherwise = ((t <=) . fromIntegral) <$> wilcoxonMatchedPairCriticalValue sampleSize (p/2)
- where
- t = min (abs tPlus) (abs tMinus)
-
--- | Obtains the critical value of T to compare against, given a sample size
--- and a p-value (significance level). Your T value must be less than or
--- equal to the return of this function in order for the test to work out
--- significant. If there is a Nothing return, the sample size is too small to
--- make a decision.
---
--- 'wilcoxonSignificant' tests the return value of 'wilcoxonMatchedPairSignedRank'
--- for you, so you should use 'wilcoxonSignificant' for determining test results.
--- However, this function is useful, for example, for generating lookup tables
--- for Wilcoxon signed rank critical values.
---
--- The return values of this function are generated using the method detailed in
--- the paper \"Critical Values for the Wilcoxon Signed Rank Statistic\", Peter
--- Mitic, The Mathematica Journal, volume 6, issue 3, 1996, which can be found
--- here: <http://www.mathematica-journal.com/issue/v6i3/article/mitic/contents/63mitic.pdf>.
--- According to that paper, the results may differ from other published lookup tables, but
--- (Mitic claims) the values obtained by this function will be the correct ones.
-wilcoxonMatchedPairCriticalValue :: Int -- ^ The sample size
- -> Double -- ^ The p-value (e.g. 0.05) for which you want the critical value.
- -> Maybe Int -- ^ The critical value (of T), or Nothing if
- -- the sample is too small to make a decision.
-wilcoxonMatchedPairCriticalValue sampleSize p
- = case critical of
- Just n | n < 0 -> Nothing
- | otherwise -> Just n
- Nothing -> Just maxBound -- shouldn't happen: beyond end of list
- where
- m = (2 ** fromIntegral sampleSize) * p
- critical = subtract 1 <$> findIndex (> m) (summedCoefficients sampleSize)
-
--- | Works out the significance level (p-value) of a T value, given a sample
--- size and a T value from the Wilcoxon signed-rank matched-pairs test.
---
--- See the notes on 'wilcoxonCriticalValue' for how this is calculated.
-wilcoxonMatchedPairSignificance :: Int -- ^ The sample size
- -> Double -- ^ The value of T for which you want the significance.
- -> Double -- ^^ The significance (p-value).
-wilcoxonMatchedPairSignificance sampleSize rank
- = (summedCoefficients sampleSize !! floor rank) / 2 ** fromIntegral sampleSize
+-- HADDOCK NOTE
+-- &#8321; is 1 subscript
+-- &#8322; is 2 subscript
+module Statistics.Test.NonParametric
+{-# DEPRECATED "Use S.Test.MannWhitneyU and S.Test.WilcoxonT instead" #-}
+ ( module Statistics.Test.MannWhitneyU
+ , module Statistics.Test.WilcoxonT
+ ) where
+import Statistics.Test.MannWhitneyU
+import Statistics.Test.WilcoxonT
diff --git a/Statistics/Test/Types.hs b/Statistics/Test/Types.hs
new file mode 100644
index 0000000..d0f0f33
--- /dev/null
+++ b/Statistics/Test/Types.hs
@@ -0,0 +1,27 @@
+{-# LANGUAGE DeriveDataTypeable #-}
+module Statistics.Test.Types (
+ TestType(..)
+ , TestResult(..)
+ , significant
+ ) where
+
+import Data.Typeable (Typeable)
+
+
+-- | Test type. Exact meaning depends on a specific test. But
+-- generally it's tested whether some statistic is too big (small)
+-- for 'OneTailed' or whether it is too big or too small for 'TwoTailed'.
+data TestType = OneTailed
+ | TwoTailed
+ deriving (Eq,Ord,Show,Typeable)
+
+-- | Result of hypothesis testing
+data TestResult = Significant -- ^ Null hypothesis should be rejected
+ | NotSignificant -- ^ Data is compatible with hypothesis
+ deriving (Eq,Ord,Show,Typeable)
+
+-- | Significant if parameter is 'True', not significant otherwise
+significant :: Bool -> TestResult
+significant True = Significant
+significant False = NotSignificant
+{-# INLINE significant #-}
diff --git a/Statistics/Test/WilcoxonT.hs b/Statistics/Test/WilcoxonT.hs
new file mode 100644
index 0000000..41183a8
--- /dev/null
+++ b/Statistics/Test/WilcoxonT.hs
@@ -0,0 +1,182 @@
+-- |
+-- Module : Statistics.Test.WilcoxonT
+-- Copyright : (c) 2010 Neil Brown
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- The Wilcoxon matched-pairs signed-rank test is a non-parametric test
+-- which can be used to test whether two related samples have different
+-- means.
+module Statistics.Test.WilcoxonT (
+ -- * Wilcoxon signed-rank matched-pair test
+ wilcoxonMatchedPairTest
+ , wilcoxonMatchedPairSignedRank
+ , wilcoxonMatchedPairSignificant
+ , wilcoxonMatchedPairSignificance
+ , wilcoxonMatchedPairCriticalValue
+ -- * Data types
+ , TestType(..)
+ , TestResult(..)
+ ) where
+
+import Control.Applicative ((<$>))
+import Data.Function (on)
+import Data.List (findIndex)
+import Data.Ord (comparing)
+import qualified Data.Vector.Unboxed as U
+
+import Statistics.Types (Sample)
+import Statistics.Function (sortBy)
+import Statistics.Test.Types
+import Statistics.Test.Internal
+
+
+-- | The Wilcoxon matched-pairs signed-rank test.
+--
+-- The value returned is the pair (T+, T-). T+ is the sum of positive ranks (the
+-- ranks of the differences where the first parameter is higher) whereas T- is
+-- the sum of negative ranks (the ranks of the differences where the second parameter is higher).
+-- These values mean little by themselves, and should be combined with the 'wilcoxonSignificant'
+-- function in this module to get a meaningful result.
+--
+-- The samples are zipped together: if one is longer than the other, both are truncated
+-- to the length of the shorter sample.
+--
+-- Note that: wilcoxonMatchedPairSignedRank a b == (\(x, y) -> (y, x)) (wilcoxonMatchedPairSignedRank b a)
+wilcoxonMatchedPairSignedRank :: Sample -> Sample -> (Double, Double)
+wilcoxonMatchedPairSignedRank a b = ( U.sum ranks1
+ , negate $ U.sum ranks2
+ )
+ where
+ (ranks1, ranks2) = splitByTags
+ $ U.zip tags (rank ((==) `on` abs) diffs)
+ (tags,diffs) = U.unzip
+                 $ U.map (\x -> (x>0 , x)) -- Attach tags to the differences
+ $ U.filter (/= 0.0) -- Remove equal elements
+ $ sortBy (comparing abs) -- Sort the differences by absolute difference
+ $ U.zipWith (-) a b -- Work out differences
+
+
+-- | The coefficients for x^0, x^1, x^2, etc, in the expression
+-- \prod_{r=1}^s (1 + x^r). See the Mitic paper for details.
+--
+-- We can define:
+-- f(1) = 1 + x
+-- f(r) = (1 + x^r)*f(r-1)
+-- = f(r-1) + x^r * f(r-1)
+-- The effect of multiplying the equation by x^r is to shift
+-- all the coefficients by r down the list.
+--
+-- This list will be processed lazily from the head.
+coefficients :: Int -> [Int]
+coefficients 1 = [1, 1] -- 1 + x
+coefficients r = let coeffs = coefficients (r-1)
+ (firstR, rest) = splitAt r coeffs
+ in firstR ++ add rest coeffs
+ where
+ add (x:xs) (y:ys) = x + y : add xs ys
+ add xs [] = xs
+ add [] ys = ys
+
+-- This list will be processed lazily from the head.
+summedCoefficients :: Int -> [Double]
+summedCoefficients = map fromIntegral . scanl1 (+) . coefficients
+
+-- | Tests whether a given result from a Wilcoxon signed-rank matched-pairs test
+-- is significant at the given level.
+--
+-- This function can perform a one-tailed or two-tailed test. If the first
+-- parameter to this function is 'TwoTailed', the test is performed two-tailed to
+-- check if the two samples differ significantly. If the first parameter is
+-- 'OneTailed', the check is performed one-tailed to decide whether the first sample
+-- (i.e. the first sample you passed to 'wilcoxonMatchedPairSignedRank') is
+-- greater than the second sample (i.e. the second sample you passed to
+-- 'wilcoxonMatchedPairSignedRank'). If you wish to perform a one-tailed test
+-- in the opposite direction, you can either pass the parameters in a different
+-- order to 'wilcoxonMatchedPairSignedRank', or simply swap the values in the resulting
+-- pair before passing them to this function.
+wilcoxonMatchedPairSignificant ::
+     TestType            -- ^ One- or two-tailed test (see description above).
+ -> Int -- ^ The sample size from which the (T+,T-) values were derived.
+ -> Double -- ^ The p-value at which to test (e.g. 0.05)
+ -> (Double, Double) -- ^ The (T+, T-) values from 'wilcoxonMatchedPairSignedRank'.
+ -> Maybe TestResult -- ^ Return 'Nothing' if the sample was too
+ -- small to make a decision.
+wilcoxonMatchedPairSignificant test sampleSize p (tPlus, tMinus) =
+ case test of
+ -- According to my nearest book (Understanding Research Methods and Statistics
+ -- by Gary W. Heiman, p590), to check that the first sample is bigger you must
+ -- use the absolute value of T- for a one-tailed check:
+ OneTailed -> (significant . (abs tMinus <=) . fromIntegral) <$> wilcoxonMatchedPairCriticalValue sampleSize p
+ -- Otherwise you must use the value of T+ and T- with the smallest absolute value:
+ TwoTailed -> (significant . (t <=) . fromIntegral) <$> wilcoxonMatchedPairCriticalValue sampleSize (p/2)
+ where
+ t = min (abs tPlus) (abs tMinus)
+
+-- | Obtains the critical value of T to compare against, given a sample size
+-- and a p-value (significance level). Your T value must be less than or
+-- equal to the return of this function in order for the test to work out
+-- significant. If there is a Nothing return, the sample size is too small to
+-- make a decision.
+--
+-- 'wilcoxonMatchedPairSignificant' tests the return value of 'wilcoxonMatchedPairSignedRank'
+-- for you, so you should use it for determining test results.
+-- However, this function is useful, for example, for generating lookup tables
+-- for Wilcoxon signed rank critical values.
+--
+-- The return values of this function are generated using the method detailed in
+-- the paper \"Critical Values for the Wilcoxon Signed Rank Statistic\", Peter
+-- Mitic, The Mathematica Journal, volume 6, issue 3, 1996, which can be found
+-- here: <http://www.mathematica-journal.com/issue/v6i3/article/mitic/contents/63mitic.pdf>.
+-- According to that paper, the results may differ from other published lookup tables, but
+-- (Mitic claims) the values obtained by this function will be the correct ones.
+wilcoxonMatchedPairCriticalValue ::
+ Int -- ^ The sample size
+ -> Double -- ^ The p-value (e.g. 0.05) for which you want the critical value.
+ -> Maybe Int -- ^ The critical value (of T), or Nothing if
+ -- the sample is too small to make a decision.
+wilcoxonMatchedPairCriticalValue sampleSize p
+ = case critical of
+ Just n | n < 0 -> Nothing
+ | otherwise -> Just n
+ Nothing -> Just maxBound -- shouldn't happen: beyond end of list
+ where
+ m = (2 ** fromIntegral sampleSize) * p
+ critical = subtract 1 <$> findIndex (> m) (summedCoefficients sampleSize)
+
+-- | Works out the significance level (p-value) of a T value, given a sample
+-- size and a T value from the Wilcoxon signed-rank matched-pairs test.
+--
+-- See the notes on 'wilcoxonMatchedPairCriticalValue' for how this is calculated.
+wilcoxonMatchedPairSignificance :: Int -- ^ The sample size
+ -> Double -- ^ The value of T for which you want the significance.
+ -> Double -- ^ The significance (p-value).
+wilcoxonMatchedPairSignificance sampleSize rnk
+ = (summedCoefficients sampleSize !! floor rnk) / 2 ** fromIntegral sampleSize
+
+-- | The Wilcoxon matched-pairs signed-rank test. The samples are
+-- zipped together: if one is longer than the other, both are
+-- truncated to the length of the shorter sample.
+--
+-- For one-tailed test it tests whether first sample is significantly
+-- greater than the second. For two-tailed it checks whether they
+-- significantly differ
+--
+-- Check 'wilcoxonMatchedPairSignedRank' and
+-- 'wilcoxonMatchedPairSignificant' for additional information.
+wilcoxonMatchedPairTest :: TestType    -- ^ One- or two-tailed test.
+ -> Double -- ^ The p-value at which to test (e.g. 0.05)
+ -> Sample -- ^ First sample
+ -> Sample -- ^ Second sample
+ -> Maybe TestResult
+ -- ^ Return 'Nothing' if the sample was too
+ -- small to make a decision.
+wilcoxonMatchedPairTest test p smp1 smp2 =
+ wilcoxonMatchedPairSignificant test (min n1 n2) p
+ $ wilcoxonMatchedPairSignedRank smp1 smp2
+ where
+ n1 = U.length smp1
+ n2 = U.length smp2
diff --git a/Statistics/Transform.hs b/Statistics/Transform.hs
new file mode 100644
index 0000000..c9b778d
--- /dev/null
+++ b/Statistics/Transform.hs
@@ -0,0 +1,108 @@
+{-# LANGUAGE BangPatterns, FlexibleContexts #-}
+-- |
+-- Module : Statistics.Transform
+-- Copyright : (c) 2011 Bryan O'Sullivan
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Fourier-related transformations of mathematical functions.
+--
+-- These functions are written for simplicity and correctness, not
+-- speed. If you need a fast FFT implementation for your application,
+-- you should strongly consider using a library of FFTW bindings
+-- instead.
+
+module Statistics.Transform
+ (
+ CD
+ , dct
+ , idct
+ , fft
+ , ifft
+ ) where
+
+import Control.Monad (when)
+import Control.Monad.ST (ST)
+import Data.Bits (shiftL, shiftR)
+import Data.Complex (Complex(..), conjugate, realPart)
+import Statistics.Math (log2)
+import qualified Data.Vector.Generic as G
+import qualified Data.Vector.Generic.Mutable as M
+import qualified Data.Vector.Unboxed as U
+
+type CD = Complex Double
+
+-- | Discrete cosine transform.
+dct :: U.Vector CD -> U.Vector Double
+dct xs = G.map realPart $ G.zipWith (*) weights (fft interleaved)
+ where
+ interleaved = G.backpermute xs $ G.enumFromThenTo 0 2 (len-2) G.++
+ G.enumFromThenTo (len-1) (len-3) 1
+ weights = G.cons 1 . G.generate (len-1) $ \x ->
+ 2 * exp ((0:+(-1))*fi (x+1)*pi/(2*n))
+ where n = fi len
+ len = G.length xs
+
+-- | Inverse discrete cosine transform.
+idct :: U.Vector CD -> U.Vector Double
+idct xs = G.generate len interleave
+ where
+ interleave z | even z = vals `G.unsafeIndex` halve z
+ | otherwise = vals `G.unsafeIndex` (len - halve z - 1)
+ vals = G.map realPart . ifft $ G.zipWith (*) weights xs
+ weights = G.generate len $ \x -> n * exp ((0:+1)*fi x*pi/(2*n))
+ where n = fi len
+ len = G.length xs
+
+-- | Inverse fast Fourier transform.
+ifft :: U.Vector CD -> U.Vector CD
+ifft xs = G.map ((/fi (G.length xs)) . conjugate) . fft . G.map conjugate $ xs
+
+-- | Radix-2 decimation-in-time fast Fourier transform.
+fft :: U.Vector CD -> U.Vector CD
+fft v = G.create $ do
+ mv <- G.thaw v
+ mfft mv
+ return mv
+
+mfft :: (M.MVector v CD) => v s CD -> ST s ()
+mfft vec
+ | 1 `shiftL` m /= len = error "Statistics.Transform.fft: bad vector size"
+ | otherwise = bitReverse 0 0
+ where
+ bitReverse i j | i == len-1 = stage 0 1
+ | otherwise = do
+ when (i < j) $ M.swap vec i j
+ let inner k l | k <= l = inner (k `shiftR` 1) (l-k)
+ | otherwise = bitReverse (i+1) (l+k)
+ inner (len `shiftR` 1) j
+ stage l !l1 | l == m = return ()
+ | otherwise = do
+ let !l2 = l1 `shiftL` 1
+ !e = -6.283185307179586/fromIntegral l2
+ flight j !a | j == l1 = stage (l+1) l2
+ | otherwise = do
+ let butterfly i | i >= len = flight (j+1) (a+e)
+ | otherwise = do
+ let i1 = i + l1
+ xi1 :+ yi1 <- M.read vec i1
+ let !c = cos a
+ !s = sin a
+ d = (c*xi1 - s*yi1) :+ (s*xi1 + c*yi1)
+ ci <- M.read vec i
+ M.write vec i1 (ci - d)
+ M.write vec i (ci + d)
+ butterfly (i+l2)
+ butterfly j
+ flight 0 0
+ len = M.length vec
+ m = log2 len
+
+fi :: Int -> CD
+fi = fromIntegral
+
+halve :: Int -> Int
+halve = (`shiftR` 1)
diff --git a/examples/kde/KDE.hs b/examples/kde/KDE.hs
new file mode 100644
index 0000000..a9640b9
--- /dev/null
+++ b/examples/kde/KDE.hs
@@ -0,0 +1,23 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+import Control.Applicative ((<$>))
+import Statistics.Sample.KernelDensity (kde)
+import Text.Hastache (MuType(..), defaultConfig, hastacheFile)
+import Text.Hastache.Context (mkStrContext)
+import qualified Data.Attoparsec as B
+import qualified Data.Attoparsec.Char8 as A
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as L
+import qualified Data.Vector.Unboxed as U
+
+csv = do
+ B.takeTill A.isEndOfLine
+ (A.double `A.sepBy` A.char ',') `A.sepBy` A.endOfLine
+
+main = do
+ waits <- (either error (U.fromList . map last . filter (not.null)) .
+ A.parseOnly csv) <$> B.readFile "data/faithful.csv"
+ let xs = map (\(a,b) -> [a,b]) . U.toList . uncurry U.zip . kde 64 $ waits
+ context "data" = MuVariable . show $ xs
+ s <- hastacheFile defaultConfig "kde.tpl" (mkStrContext context)
+ L.writeFile "kde.html" s
diff --git a/examples/kde/data/faithful.csv b/examples/kde/data/faithful.csv
new file mode 100644
index 0000000..4a4569c
--- /dev/null
+++ b/examples/kde/data/faithful.csv
@@ -0,0 +1,273 @@
+eruption,wait
+3.6,79
+1.8,54
+3.333,74
+2.283,62
+4.533,85
+2.883,55
+4.7,88
+3.6,85
+1.95,51
+4.35,85
+1.833,54
+3.917,84
+4.2,78
+1.75,47
+4.7,83
+2.167,52
+1.75,62
+4.8,84
+1.6,52
+4.25,79
+1.8,51
+1.75,47
+3.45,78
+3.067,69
+4.533,74
+3.6,83
+1.967,55
+4.083,76
+3.85,78
+4.433,79
+4.3,73
+4.467,77
+3.367,66
+4.033,80
+3.833,74
+2.017,52
+1.867,48
+4.833,80
+1.833,59
+4.783,90
+4.35,80
+1.883,58
+4.567,84
+1.75,58
+4.533,73
+3.317,83
+3.833,64
+2.1,53
+4.633,82
+2,59
+4.8,75
+4.716,90
+1.833,54
+4.833,80
+1.733,54
+4.883,83
+3.717,71
+1.667,64
+4.567,77
+4.317,81
+2.233,59
+4.5,84
+1.75,48
+4.8,82
+1.817,60
+4.4,92
+4.167,78
+4.7,78
+2.067,65
+4.7,73
+4.033,82
+1.967,56
+4.5,79
+4,71
+1.983,62
+5.067,76
+2.017,60
+4.567,78
+3.883,76
+3.6,83
+4.133,75
+4.333,82
+4.1,70
+2.633,65
+4.067,73
+4.933,88
+3.95,76
+4.517,80
+2.167,48
+4,86
+2.2,60
+4.333,90
+1.867,50
+4.817,78
+1.833,63
+4.3,72
+4.667,84
+3.75,75
+1.867,51
+4.9,82
+2.483,62
+4.367,88
+2.1,49
+4.5,83
+4.05,81
+1.867,47
+4.7,84
+1.783,52
+4.85,86
+3.683,81
+4.733,75
+2.3,59
+4.9,89
+4.417,79
+1.7,59
+4.633,81
+2.317,50
+4.6,85
+1.817,59
+4.417,87
+2.617,53
+4.067,69
+4.25,77
+1.967,56
+4.6,88
+3.767,81
+1.917,45
+4.5,82
+2.267,55
+4.65,90
+1.867,45
+4.167,83
+2.8,56
+4.333,89
+1.833,46
+4.383,82
+1.883,51
+4.933,86
+2.033,53
+3.733,79
+4.233,81
+2.233,60
+4.533,82
+4.817,77
+4.333,76
+1.983,59
+4.633,80
+2.017,49
+5.1,96
+1.8,53
+5.033,77
+4,77
+2.4,65
+4.6,81
+3.567,71
+4,70
+4.5,81
+4.083,93
+1.8,53
+3.967,89
+2.2,45
+4.15,86
+2,58
+3.833,78
+3.5,66
+4.583,76
+2.367,63
+5,88
+1.933,52
+4.617,93
+1.917,49
+2.083,57
+4.583,77
+3.333,68
+4.167,81
+4.333,81
+4.5,73
+2.417,50
+4,85
+4.167,74
+1.883,55
+4.583,77
+4.25,83
+3.767,83
+2.033,51
+4.433,78
+4.083,84
+1.833,46
+4.417,83
+2.183,55
+4.8,81
+1.833,57
+4.8,76
+4.1,84
+3.966,77
+4.233,81
+3.5,87
+4.366,77
+2.25,51
+4.667,78
+2.1,60
+4.35,82
+4.133,91
+1.867,53
+4.6,78
+1.783,46
+4.367,77
+3.85,84
+1.933,49
+4.5,83
+2.383,71
+4.7,80
+1.867,49
+3.833,75
+3.417,64
+4.233,76
+2.4,53
+4.8,94
+2,55
+4.15,76
+1.867,50
+4.267,82
+1.75,54
+4.483,75
+4,78
+4.117,79
+4.083,78
+4.267,78
+3.917,70
+4.55,79
+4.083,70
+2.417,54
+4.183,86
+2.217,50
+4.45,90
+1.883,54
+1.85,54
+4.283,77
+3.95,79
+2.333,64
+4.15,75
+2.35,47
+4.933,86
+2.9,63
+4.583,85
+3.833,82
+2.083,57
+4.367,82
+2.133,67
+4.35,74
+2.2,54
+4.45,83
+3.567,73
+4.5,73
+4.15,88
+3.817,80
+3.917,71
+4.45,83
+2,56
+4.283,79
+4.767,78
+4.533,84
+1.85,58
+4.25,83
+1.983,43
+2.25,60
+4.75,75
+4.117,81
+2.15,46
+4.417,90
+1.817,46
+4.467,74
diff --git a/examples/kde/kde.html b/examples/kde/kde.html
new file mode 100644
index 0000000..5d2545c
--- /dev/null
+++ b/examples/kde/kde.html
@@ -0,0 +1,28 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+ <title>Kernel density</title>
+ <!--[if lte IE 8]><script language="javascript" type="text/javascript" src="http://people.iola.dk/olau/flot/excanvas.min.js"></script><![endif]-->
+ <script language="javascript" type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/jquery/1.6.4/jquery.min.js"></script>
+ <script language="javascript" type="text/javascript" src="http://people.iola.dk/olau/flot/jquery.flot.js"></script>
+ </head>
+ <body>
+ <h1>Kernel density</h1>
+
+ <div id="placeholder" style="width:600px;height:450px;"></div>
+
+ <p>This is a 64-point kernel density estimate
+ of <a href="http://stat.ethz.ch/R-manual/R-patched/library/datasets/html/faithful.html">wait
+ times between eruptions</a> of
+ the <a href="http://en.wikipedia.org/wiki/Old_Faithful">Old
+ Faithful</a> geyser.</p>
+
+<script type="text/javascript">
+$(function () {
+ $.plot($("#placeholder"), [ [[37.7,2.5161110551039025e-4],[38.709523809523816,4.447091645179541e-4],[39.71904761904762,8.89495267293151e-4],[40.72857142857143,1.6826638124416372e-3],[41.73809523809524,2.915853030152525e-3],[42.747619047619054,4.617384776241099e-3],[43.75714285714286,6.707125941058233e-3],[44.766666666666666,9.002680047224753e-3],[45.77619047619048,1.1289358222230473e-2],[46.78571428571429,1.3413998627118355e-2],[47.7952380952381,1.5334009498412205e-2],[48.804761904761904,1.7084636391985843e-2],[49.81428571428572,1.869160073198233e-2],[50.82380952380953,2.0093659237928833e-2],[51.833333333333336,2.1129704241951732e-2],[52.84285714285714,2.160813072660192e-2],[53.852380952380955,2.142690467760544e-2],[54.86190476190477,2.0663894588783302e-2],[55.871428571428574,1.9554774751720513e-2],[56.88095238095238,1.835784852185525e-2],[57.89047619047619,1.721364996782301e-2],[58.900000000000006,1.611898372722214e-2],[59.90952380952381,1.5018622544779535e-2],[60.91904761904762,1.3900964326230551e-2],[61.92857142857143,1.2803755503590803e-2],[62.938095238095244,1.175952549012556e-2],[63.94761904761905,1.0778427101353434e-2],[64.95714285714286,9.90254113687662e-3],[65.96666666666667,9.263754969613376e-3],[66.97619047619048,9.065069215913515e-3],[67.9857142857143,9.489824501493842e-3],[68.9952380952381,1.062157012231642e-2],[70.0047619047619,1.2443698406039176e-2],[71.01428571428572,1.4902887084493477e-2],[72.02380952380952,1.7957646715371086e-2],[73.03333333333333,2.155509535870428e-2],[74.04285714285714,2.5555036677672206e-2],[75.05238095238096,2.967437285217729e-2],[76.06190476190477,3.3517062326339185e-2],[77.07142857142857,3.6695760198314636e-2],[78.08095238095238,3.897328209325028e-2],[79.0904761904762,4.0310862807977195e-2],[80.1,4.076878209020111e-2],[81.10952380952381,4.034443197900639e-2],[82.11904761904762,3.8916020257382e-2],[83.12857142857143,3.6371579849283686e-2],[84.13809523809525,3.2813879362105385e-2],[85.14761904761905,2.8641170617233373e-2],[86
.15714285714286,2.440986212690428e-2],[87.16666666666667,2.0578794105541566e-2],[88.17619047619047,1.7329418869432917e-2],[89.18571428571428,1.4578610745209346e-2],[90.1952380952381,1.2139322012628417e-2],[91.20476190476191,9.885013669357134e-3],[92.21428571428572,7.807129857922685e-3],[93.22380952380952,5.966284588636623e-3],[94.23333333333333,4.415584046924452e-3],[95.24285714285715,3.1632654187895254e-3],[96.25238095238095,2.1821132245726424e-3],[97.26190476190476,1.43459816068524e-3],[98.27142857142857,8.875453007766301e-4],[99.28095238095239,5.128125355532956e-4],[100.2904761904762,2.8384986932914304e-4],[101.3,1.768029983316066e-4]] ]);
+});
+</script>
+
+ </body>
+</html>
diff --git a/examples/kde/kde.tpl b/examples/kde/kde.tpl
new file mode 100644
index 0000000..1bf9c32
--- /dev/null
+++ b/examples/kde/kde.tpl
@@ -0,0 +1,28 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+ <title>Kernel density</title>
+ <!--[if lte IE 8]><script language="javascript" type="text/javascript" src="http://people.iola.dk/olau/flot/excanvas.min.js"></script><![endif]-->
+ <script language="javascript" type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/jquery/1.6.4/jquery.min.js"></script>
+ <script language="javascript" type="text/javascript" src="http://people.iola.dk/olau/flot/jquery.flot.js"></script>
+ </head>
+ <body>
+ <h1>Kernel density</h1>
+
+ <div id="placeholder" style="width:600px;height:450px;"></div>
+
+ <p>This is a 64-point kernel density estimate
+ of <a href="http://stat.ethz.ch/R-manual/R-patched/library/datasets/html/faithful.html">wait
+ times between eruptions</a> of
+ the <a href="http://en.wikipedia.org/wiki/Old_Faithful">Old
+ Faithful</a> geyser.</p>
+
+<script type="text/javascript">
+$(function () {
+ $.plot($("#placeholder"), [ {{data}} ]);
+});
+</script>
+
+ </body>
+</html>
diff --git a/statistics.cabal b/statistics.cabal
index c8a1657..a9bda03 100644
--- a/statistics.cabal
+++ b/statistics.cabal
@@ -1,36 +1,116 @@
name: statistics
-version: 0.9.0.0
+version: 0.10.0.0
synopsis: A library of statistical types, data, and functions
description:
This library provides a number of common functions and types useful
- in statistics. Our focus is on high performance, numerical
- robustness, and use of good algorithms. Where possible, we provide
+ in statistics. We focus on high performance, numerical robustness,
+ and use of good algorithms. Where possible, we provide
references to the statistical literature.
.
The library's facilities can be divided into four broad categories:
.
- Working with widely used discrete and continuous probability
- distributions. (There are dozens of exotic distributions in use; we
- focus on the most common.)
+ * Working with widely used discrete and continuous probability
+ distributions. (There are dozens of exotic distributions in use;
+ we focus on the most common.)
.
- Computing with sample data: quantile estimation, kernel density
- estimation, bootstrap methods, signigicance testing, and autocorrelation
- analysis.
+ * Computing with sample data: quantile estimation, kernel density
+ estimation, histograms, bootstrap methods, significance testing,
+ and autocorrelation analysis.
.
- Random variate generation under several different distributions.
+ * Random variate generation under several different distributions.
.
- Common statistical tests for significant differences between samples.
+ * Common statistical tests for significant differences between
+ samples.
+ .
+ Changes in 0.10.0.0:
+ .
+ * The type classes @Mean@ and @Variance@ are split in two. This is
+ required for distributions which do not have finite variance or
+ mean.
+ .
+ * The @S.Sample.KernelDensity@ module has been renamed, and
+ completely rewritten to be much more robust. The older module
+ oversmoothed multi-modal data. (The older module is still
+ available under the name @S.Sample.KernelDensity.Simple@).
+ .
+ * Histogram computation is added, in @S.Sample.Histogram@.
+ .
+ * Forward and inverse discrete Fourier and cosine transforms are
+ added, in @S.Transform@.
+ .
+ * Root finding is added, in @S.Math.RootFinding@.
+ .
+ * The @complCumulative@ function is added to the @Distribution@
+    class in order to accurately assess probabilities P(X>x) which are
+ used in one-tailed tests.
+ .
+ * A @stdDev@ function is added to the @Variance@ class for
+ distributions.
+ .
+ * The constructor @S.Distribution.normalDistr@ now takes standard
+ deviation instead of variance as its parameter.
+ .
+ * A bug in @S.Quantile.weightedAvg@ is fixed. It produced a wrong
+ answer if a sample contained only one element.
+ .
+ * Bugs in quantile estimations for chi-square and gamma distribution
+ are fixed.
+ .
+  * Integer overflow in @mannWhitneyUCriticalValue@ is fixed. It
+ produced incorrect critical values for moderately large
+ samples. Something around 20 for 32-bit machines and 40 for 64-bit
+ ones.
+ .
+ * A bug in @mannWhitneyUSignificant@ is fixed. If either sample was
+ larger than 20, it produced a completely incorrect answer.
+ .
+ * One- and two-tailed tests in @S.Tests.NonParametric@ are selected
+ with sum types instead of @Bool@.
+ .
+ * Test results returned as enumeration instead of @Bool@.
+ .
+ * Performance improvements for Mann-Whitney U and Wilcoxon tests.
+ .
+  * Module @S.Tests.NonParametric@ is split into @S.Tests.MannWhitneyU@
+ and @S.Tests.WilcoxonT@
+ .
+ * @sortBy@ is added to @S.Function@.
+ .
+ * Mean and variance for gamma distribution are fixed.
+ .
+  * Much faster cumulative probability functions for Poisson and
+ hypergeometric distributions.
+ .
+ * Better density functions for gamma and Poisson distributions.
+ .
+ * Student-T, Fisher-Snedecor F-distributions and Cauchy-Lorentz
+    distribution are added.
+ .
+ * The function @S.Function.create@ is removed. Use @generateM@ from
+ the @vector@ package instead.
+ .
+  * Function to perform approximate comparison of doubles is added to
+ @S.Function.Comparison@
+ .
+ * Regularized incomplete beta function and its inverse are added to
+ @S.Function@.
+
license: BSD3
license-file: LICENSE
-homepage: http://bitbucket.org/bos/statistics
-bug-reports: http://bitbucket.org/bos/statistics/issues
+homepage: https://github.com/bos/statistics
+bug-reports: https://github.com/bos/statistics/issues
author: Bryan O'Sullivan <bos@serpentine.com>
maintainer: Bryan O'Sullivan <bos@serpentine.com>
copyright: 2009, 2010, 2011 Bryan O'Sullivan
category: Math, Statistics
build-type: Simple
-cabal-version: >= 1.6
-extra-source-files: README.markdown
+cabal-version: >= 1.8
+extra-source-files:
+ README.markdown
+ examples/kde/KDE.hs
+ examples/kde/data/faithful.csv
+ examples/kde/kde.html
+ examples/kde/kde.tpl
library
exposed-modules:
@@ -38,35 +118,47 @@ library
Statistics.Constants
Statistics.Distribution
Statistics.Distribution.Binomial
+ Statistics.Distribution.CauchyLorentz
Statistics.Distribution.ChiSquared
+ Statistics.Distribution.Exponential
+ Statistics.Distribution.FDistribution
Statistics.Distribution.Gamma
Statistics.Distribution.Geometric
- Statistics.Distribution.Exponential
Statistics.Distribution.Hypergeometric
Statistics.Distribution.Normal
Statistics.Distribution.Poisson
+ Statistics.Distribution.StudentT
+ Statistics.Distribution.Uniform
Statistics.Function
- Statistics.KernelDensity
Statistics.Math
+ Statistics.Math.RootFinding
Statistics.Quantile
Statistics.Resampling
Statistics.Resampling.Bootstrap
Statistics.Sample
+ Statistics.Sample.Histogram
+ Statistics.Sample.KernelDensity
+ Statistics.Sample.KernelDensity.Simple
Statistics.Sample.Powers
Statistics.Test.NonParametric
+ Statistics.Test.Types
+ Statistics.Test.MannWhitneyU
+ Statistics.Test.WilcoxonT
+ Statistics.Transform
Statistics.Types
other-modules:
+ Statistics.Distribution.Poisson.Internal
+ Statistics.Function.Comparison
Statistics.Internal
+ Statistics.Test.Internal
build-depends:
- aeson,
base < 5,
deepseq >= 1.1.0.2,
erf,
monad-par >= 0.1.0.1,
mwc-random >= 0.8.0.5,
primitive >= 0.3,
- time,
- vector >= 0.7.0.0,
+ vector >= 0.7.1,
vector-algorithms >= 0.4
if impl(ghc >= 6.10)
build-depends:
@@ -75,10 +167,36 @@ library
-- gather extensive profiling data for now
ghc-prof-options: -auto-all
- ghc-options: -Wall -funbox-strict-fields
+ ghc-options: -O2 -Wall -funbox-strict-fields
if impl(ghc >= 6.8)
ghc-options: -fwarn-tabs
+test-suite tests
+ type: exitcode-stdio-1.0
+ hs-source-dirs: tests
+ main-is: tests.hs
+
+ ghc-options:
+ -Wall -threaded -rtsopts
+
+ build-depends:
+ base,
+ ieee754 >= 0.7.3,
+ HUnit,
+ QuickCheck >= 2,
+ test-framework,
+ test-framework-quickcheck2,
+ test-framework-hunit,
+ statistics,
+ primitive,
+ vector,
+ vector-algorithms,
+ erf
+
+source-repository head
+ type: git
+ location: https://github.com/bos/statistics
+
source-repository head
type: mercurial
- location: http://bitbucket.org/bos/statistics
+ location: https://bitbucket.org/bos/statistics
diff --git a/tests/tests.hs b/tests/tests.hs
new file mode 100644
index 0000000..3e90132
--- /dev/null
+++ b/tests/tests.hs
@@ -0,0 +1,13 @@
+import Test.Framework (defaultMain)
+
+import Tests.Distribution
+import Tests.Math
+import Tests.NonparametricTest
+import qualified Tests.Transform
+
+main :: IO ()
+main = defaultMain [ distributionTests
+ , mathTests
+ , nonparametricTests
+ , Tests.Transform.tests
+ ]