summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBryanOSullivan <>2013-04-13 22:30:19 (GMT)
committerhdiff <hdiff@hdiff.luite.com>2013-04-13 22:30:19 (GMT)
commit3b56f42d5ee0eb2fb7939066c7e1d237677ccafe (patch)
treecfd2cd37c164aafa54f94793452db26da6b85f09
parent4e9f2a1864dbfe9d9e12d1a67efb1e707a60a51d (diff)
version 0.10.3.00.10.3.0
-rw-r--r--Statistics/Distribution.hs10
-rw-r--r--Statistics/Distribution/Binomial.hs16
-rw-r--r--Statistics/Distribution/FDistribution.hs5
-rw-r--r--Statistics/Distribution/Geometric.hs7
-rw-r--r--Statistics/Distribution/Hypergeometric.hs16
-rw-r--r--Statistics/Distribution/Normal.hs20
-rw-r--r--Statistics/Distribution/Poisson.hs6
-rw-r--r--Statistics/Distribution/StudentT.hs28
-rw-r--r--Statistics/Distribution/Transform.hs73
-rw-r--r--Statistics/Function.hs33
-rw-r--r--Statistics/Quantile.hs39
-rw-r--r--Statistics/Resampling/Bootstrap.hs9
-rw-r--r--Statistics/Sample.hs5
-rw-r--r--Statistics/Sample/Histogram.hs1
-rw-r--r--Statistics/Sample/KernelDensity.hs5
-rw-r--r--Statistics/Test/KolmogorovSmirnov.hs3
-rw-r--r--Statistics/Transform.hs37
-rw-r--r--statistics.cabal12
-rw-r--r--tests/Tests/Distribution.hs81
-rw-r--r--tests/Tests/Function.hs11
-rw-r--r--tests/Tests/Transform.hs4
21 files changed, 315 insertions, 106 deletions
diff --git a/Statistics/Distribution.hs b/Statistics/Distribution.hs
index cc79aee..6809738 100644
--- a/Statistics/Distribution.hs
+++ b/Statistics/Distribution.hs
@@ -43,17 +43,21 @@ import System.Random.MWC
class Distribution d where
-- | Cumulative distribution function. The probability that a
-- random variable /X/ is less or equal than /x/,
- -- i.e. P(/X/&#8804;/x/).
+ -- i.e. P(/X/&#8804;/x/). Cumulative should be defined for
+ -- infinities as well:
+ --
+ -- > cumulative d +∞ = 1
+ -- > cumulative d -∞ = 0
cumulative :: d -> Double -> Double
-- | One's complement of cumulative distribution:
--
-- > complCumulative d x = 1 - cumulative d x
--
- -- It's useful when one is interested in P(/X/&#8805;/x/) and
+ -- It's useful when one is interested in P(/X/>/x/) and
-- expression on the right side begins to lose precision. This
-- function has a default implementation but implementors are
- -- encouraged to provide more precise implementation
+ -- encouraged to provide more precise implementation.
complCumulative :: d -> Double -> Double
complCumulative d x = 1 - cumulative d x
diff --git a/Statistics/Distribution/Binomial.hs b/Statistics/Distribution/Binomial.hs
index 135a041..dfb566f 100644
--- a/Statistics/Distribution/Binomial.hs
+++ b/Statistics/Distribution/Binomial.hs
@@ -67,13 +67,15 @@ probability (BD n p) k
-- Summation from different sides required to reduce roundoff errors
cumulative :: BinomialDistribution -> Double -> Double
cumulative d@(BD n _) x
- | k < 0 = 0
- | k >= n = 1
- | k < m = D.sumProbabilities d 0 k
- | otherwise = 1 - D.sumProbabilities d (k+1) n
- where
- m = floor (mean d)
- k = floor x
+ | isNaN x = error "Statistics.Distribution.Binomial.cumulative: NaN input"
+ | isInfinite x = if x > 0 then 1 else 0
+ | k < 0 = 0
+ | k >= n = 1
+ | k < m = D.sumProbabilities d 0 k
+ | otherwise = 1 - D.sumProbabilities d (k+1) n
+ where
+ m = floor (mean d)
+ k = floor x
{-# INLINE cumulative #-}
mean :: BinomialDistribution -> Double
diff --git a/Statistics/Distribution/FDistribution.hs b/Statistics/Distribution/FDistribution.hs
index 3857fb6..6506f8f 100644
--- a/Statistics/Distribution/FDistribution.hs
+++ b/Statistics/Distribution/FDistribution.hs
@@ -49,8 +49,9 @@ instance D.ContDistr FDistribution where
cumulative :: FDistribution -> Double -> Double
cumulative (F n m _) x
- | x > 0 = let y = n*x in incompleteBeta (0.5 * n) (0.5 * m) (y / (m + y))
- | otherwise = 0
+ | x <= 0 = 0
+ | isInfinite x = 1 -- Only matches +∞
+ | x > 0 = let y = n*x in incompleteBeta (0.5 * n) (0.5 * m) (y / (m + y))
density :: FDistribution -> Double -> Double
density (F n m fac) x
diff --git a/Statistics/Distribution/Geometric.hs b/Statistics/Distribution/Geometric.hs
index 9fc7531..6677c2b 100644
--- a/Statistics/Distribution/Geometric.hs
+++ b/Statistics/Distribution/Geometric.hs
@@ -70,6 +70,9 @@ probability (GD s) n | n < 1 = 0
{-# INLINE probability #-}
cumulative :: GeometricDistribution -> Double -> Double
-cumulative (GD s) x | x < 1 = 0
- | otherwise = 1 - (1-s) ^ (floor x :: Int)
+cumulative (GD s) x
+ | x < 1 = 0
+ | isInfinite x = 1
+ | isNaN x = error "Statistics.Distribution.Geometric.cumulative: NaN input"
+ | otherwise = 1 - (1-s) ^ (floor x :: Int)
{-# INLINE cumulative #-}
diff --git a/Statistics/Distribution/Hypergeometric.hs b/Statistics/Distribution/Hypergeometric.hs
index eb70003..abab8a2 100644
--- a/Statistics/Distribution/Hypergeometric.hs
+++ b/Statistics/Distribution/Hypergeometric.hs
@@ -93,10 +93,12 @@ probability (HD mi li ki) n
cumulative :: HypergeometricDistribution -> Double -> Double
cumulative d@(HD mi li ki) x
- | n < minN = 0
- | n >= maxN = 1
- | otherwise = D.sumProbabilities d minN n
- where
- n = floor x
- minN = max 0 (mi+ki-li)
- maxN = min mi ki \ No newline at end of file
+ | isNaN x = error "Statistics.Distribution.Hypergeometric.cumulative: NaN argument"
+ | isInfinite x = if x > 0 then 1 else 0
+ | n < minN = 0
+ | n >= maxN = 1
+ | otherwise = D.sumProbabilities d minN n
+ where
+ n = floor x
+ minN = max 0 (mi+ki-li)
+ maxN = min mi ki
diff --git a/Statistics/Distribution/Normal.hs b/Statistics/Distribution/Normal.hs
index b29b477..17b05fc 100644
--- a/Statistics/Distribution/Normal.hs
+++ b/Statistics/Distribution/Normal.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
-- |
-- Module : Statistics.Distribution.Normal
@@ -20,13 +21,15 @@ module Statistics.Distribution.Normal
, standard
) where
-import Data.Number.Erf (erfc)
-import Data.Typeable (Typeable)
+import Data.Typeable (Typeable)
import Numeric.MathFunctions.Constants (m_sqrt_2, m_sqrt_2_pi)
+import Numeric.SpecFunctions (erfc, invErfc)
import qualified Statistics.Distribution as D
-import qualified Statistics.Sample as S
+import qualified Statistics.Sample as S
import qualified System.Random.MWC.Distributions as MWC
+
+
-- | The normal distribution.
data NormalDistribution = ND {
mean :: {-# UNPACK #-} !Double
@@ -81,14 +84,17 @@ normalDistr m sd
, ndPdfDenom = m_sqrt_2_pi * sd
, ndCdfDenom = m_sqrt_2 * sd
}
- | otherwise =
+ | otherwise =
error $ "Statistics.Distribution.Normal.normalDistr: standard deviation must be positive. Got " ++ show sd
-- | Create distribution using parameters estimated from
-- sample. Variance is estimated using maximum likelihood method
-- (biased estimation).
normalFromSample :: S.Sample -> NormalDistribution
-normalFromSample a = normalDistr (S.mean a) (S.stdDev a)
+normalFromSample xs
+ = normalDistr m (sqrt v)
+ where
+ (m,v) = S.meanVariance xs
density :: NormalDistribution -> Double -> Double
density d x = exp (-xm * xm / (2 * sd * sd)) / ndPdfDenom d
@@ -106,8 +112,8 @@ quantile d p
| p == 0 = -inf
| p == 1 = inf
| p == 0.5 = mean d
- | p > 0 && p < 1 = x * stdDev d + mean d
+ | p > 0 && p < 1 = x * ndCdfDenom d + mean d
| otherwise =
error $ "Statistics.Distribution.Normal.quantile: p must be in [0,1] range. Got: "++show p
- where x = D.findRoot standard p 0 (-100) 100
+ where x = invErfc $ 2 * (1 - p)
inf = 1/0
diff --git a/Statistics/Distribution/Poisson.hs b/Statistics/Distribution/Poisson.hs
index 107f8d3..f508391 100644
--- a/Statistics/Distribution/Poisson.hs
+++ b/Statistics/Distribution/Poisson.hs
@@ -37,8 +37,10 @@ newtype PoissonDistribution = PD {
instance D.Distribution PoissonDistribution where
cumulative (PD lambda) x
- | x < 0 = 0
- | otherwise = 1 - incompleteGamma (fromIntegral (floor x + 1 :: Int)) lambda
+ | x < 0 = 0
+ | isInfinite x = 1
+ | isNaN x = error "Statistics.Distribution.Poisson.cumulative: NaN input"
+ | otherwise = 1 - incompleteGamma (fromIntegral (floor x + 1 :: Int)) lambda
{-# INLINE cumulative #-}
instance D.DiscreteDistr PoissonDistribution where
diff --git a/Statistics/Distribution/StudentT.hs b/Statistics/Distribution/StudentT.hs
index 330630e..c778f0d 100644
--- a/Statistics/Distribution/StudentT.hs
+++ b/Statistics/Distribution/StudentT.hs
@@ -13,25 +13,24 @@ module Statistics.Distribution.StudentT (
StudentT
, studentT
, studentTndf
+ , studentTUnstandardized
) where
+
import qualified Statistics.Distribution as D
+import Statistics.Distribution.Transform (LinearTransform (..))
import Data.Typeable (Typeable)
import Numeric.SpecFunctions (logBeta, incompleteBeta, invIncompleteBeta)
-
-
-- | Student-T distribution
newtype StudentT = StudentT { studentTndf :: Double }
deriving (Eq,Show,Read,Typeable)
-
-- | Create Student-T distribution. Number of parameters must be positive.
studentT :: Double -> StudentT
studentT ndf
| ndf > 0 = StudentT ndf
- | otherwise =
- error "Statistics.Distribution.StudentT.studentT: non-positive number of degrees of freedom"
+ | otherwise = modErr "studentT" "non-positive number of degrees of freedom"
instance D.Distribution StudentT where
cumulative = cumulative
@@ -58,8 +57,7 @@ quantile (StudentT ndf) p
in case sqrt $ ndf * (1 - x) / x of
r | p < 0.5 -> -r
| otherwise -> r
- | otherwise =
- error $ "Statistics.Distribution.Uniform.quantile: p must be in [0,1] range. Got: "++show p
+ | otherwise = modErr "quantile" $ "p must be in [0,1] range. Got: "++show p
instance D.MaybeMean StudentT where
@@ -67,8 +65,20 @@ instance D.MaybeMean StudentT where
| otherwise = Nothing
instance D.MaybeVariance StudentT where
- maybeStdDev (StudentT ndf) | ndf > 2 = Just $ ndf / (ndf - 2)
- | otherwise = Nothing
+ maybeVariance (StudentT ndf) | ndf > 2 = Just $! ndf / (ndf - 2)
+ | otherwise = Nothing
instance D.ContGen StudentT where
genContVar = D.genContinous
+
+-- | Create an unstandardized Student-t distribution.
+studentTUnstandardized :: Double -- ^ Number of degrees of freedom
+ -> Double -- ^ Central value (0 for standard Student T distribution)
+ -> Double -- ^ Scale parameter
+ -> LinearTransform StudentT
+studentTUnstandardized ndf mu sigma
+ | sigma > 0 = LinearTransform mu sigma $ studentT ndf
+ | otherwise = modErr "studentTUnstandardized" $ "sigma must be > 0. Got: " ++ show sigma
+
+modErr :: String -> String -> a
+modErr fun msg = error $ "Statistics.Distribution.StudentT." ++ fun ++ ": " ++ msg
diff --git a/Statistics/Distribution/Transform.hs b/Statistics/Distribution/Transform.hs
new file mode 100644
index 0000000..dd98753
--- /dev/null
+++ b/Statistics/Distribution/Transform.hs
@@ -0,0 +1,73 @@
+{-# LANGUAGE FlexibleInstances, UndecidableInstances, FlexibleContexts, DeriveDataTypeable #-}
+-- |
+-- Module : Statistics.Distribution.Transform
+-- Copyright : (c) 2013 John McDonnell;
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Transformations over distributions
+module Statistics.Distribution.Transform (
+ LinearTransform (..)
+ , linTransFixedPoint
+ , scaleAround
+ ) where
+
+import Data.Typeable (Typeable)
+import Data.Functor ((<$>))
+import qualified Statistics.Distribution as D
+
+-- | Linear transformation applied to distribution.
+--
+-- > LinearTransform μ σ _
+-- > x' = μ + σ·x
+data LinearTransform d = LinearTransform
+ { linTransLocation :: {-# UNPACK #-} !Double
+ -- ^ Location parameter.
+ , linTransScale :: {-# UNPACK #-} !Double
+ -- ^ Scale parameter.
+ , linTransDistr :: d
+ -- ^ Distribution being transformed.
+ } deriving (Eq,Show,Read,Typeable)
+
+-- | Apply linear transformation to distribution.
+scaleAround :: Double -- ^ Fixed point
+ -> Double -- ^ Scale parameter
+ -> d -- ^ Distribution
+ -> LinearTransform d
+scaleAround x0 sc = LinearTransform (x0 * (1 - sc)) sc
+
+-- | Get fixed point of linear transformation
+linTransFixedPoint :: LinearTransform d -> Double
+linTransFixedPoint (LinearTransform loc sc _) = loc / (1 - sc)
+
+instance Functor LinearTransform where
+ fmap f (LinearTransform loc sc dist) = LinearTransform loc sc (f dist)
+
+instance D.Distribution d => D.Distribution (LinearTransform d) where
+ cumulative (LinearTransform loc sc dist) x = D.cumulative dist $ (x-loc) / sc
+
+instance D.ContDistr d => D.ContDistr (LinearTransform d) where
+ density (LinearTransform loc sc dist) x = D.density dist ((x-loc) / sc) / sc
+ quantile (LinearTransform loc sc dist) p = loc + sc * D.quantile dist p
+
+instance D.MaybeMean d => D.MaybeMean (LinearTransform d) where
+ maybeMean (LinearTransform loc _ dist) = (+loc) <$> D.maybeMean dist
+
+instance (D.Mean d) => D.Mean (LinearTransform d) where
+ mean (LinearTransform loc _ dist) = loc + D.mean dist
+
+instance D.MaybeVariance d => D.MaybeVariance (LinearTransform d) where
+ maybeVariance (LinearTransform _ sc dist) = (*(sc*sc)) <$> D.maybeVariance dist
+ maybeStdDev (LinearTransform _ sc dist) = (*sc) <$> D.maybeStdDev dist
+
+instance (D.Variance d) => D.Variance (LinearTransform d) where
+ variance (LinearTransform _ sc dist) = sc * sc * D.variance dist
+ stdDev (LinearTransform _ sc dist) = sc * D.stdDev dist
+
+instance D.ContGen d => D.ContGen (LinearTransform d) where
+ genContVar (LinearTransform loc sc d) g = do
+ x <- D.genContVar d g
+ return $! loc + sc * x
diff --git a/Statistics/Function.hs b/Statistics/Function.hs
index f3c9c3a..d5b7046 100644
--- a/Statistics/Function.hs
+++ b/Statistics/Function.hs
@@ -77,14 +77,25 @@ minMax = fini . G.foldl' go (MM (1/0) (-1/0))
-- non-negative integer. If the given value is already a power of
-- two, it is returned unchanged. If negative, zero is returned.
nextHighestPowerOfTwo :: Int -> Int
-nextHighestPowerOfTwo n = o + 1
- where m = n - 1
- o = m
- .|. (m `shiftR` 1)
- .|. (m `shiftR` 2)
- .|. (m `shiftR` 4)
- .|. (m `shiftR` 8)
- .|. (m `shiftR` 16)
-#if WORD_SIZE_IN_BITS == 64
- .|. (m `shiftR` 32)
-#endif
+nextHighestPowerOfTwo n
+#if WORD_SIZE_IN_BITS == 64
+ = 1 + _i32
+#else
+ = 1 + i16
+#endif
+ where
+ i0 = n - 1
+ i1 = i0 .|. i0 `shiftR` 1
+ i2 = i1 .|. i1 `shiftR` 2
+ i4 = i2 .|. i2 `shiftR` 4
+ i8 = i4 .|. i4 `shiftR` 8
+ i16 = i8 .|. i8 `shiftR` 16
+ _i32 = i16 .|. i16 `shiftR` 32
+-- It could be implemented as
+--
+-- > nextHighestPowerOfTwo n = 1 + foldl' go (n-1) [1, 2, 4, 8, 16, 32]
+-- where go m i = m .|. m `shiftR` i
+--
+-- But GHC does not inline foldl (probably because it's recursive) and
+-- as a result the function walks a list of boxed ints. The hand-rolled
+-- version uses unboxed arithmetic.
diff --git a/Statistics/Quantile.hs b/Statistics/Quantile.hs
index f641985..b739da5 100644
--- a/Statistics/Quantile.hs
+++ b/Statistics/Quantile.hs
@@ -37,10 +37,9 @@ module Statistics.Quantile
-- $references
) where
-import Control.Exception (assert)
-import Data.Vector.Generic ((!))
+import Data.Vector.Generic ((!))
import Numeric.MathFunctions.Constants (m_epsilon)
-import Statistics.Function (partialSort)
+import Statistics.Function (partialSort)
import qualified Data.Vector.Generic as G
-- | O(/n/ log /n/). Estimate the /k/th /q/-quantile of a sample,
@@ -51,13 +50,11 @@ weightedAvg :: G.Vector v Double =>
-> v Double -- ^ /x/, the sample data.
-> Double
weightedAvg k q x
- | n == 1 = G.head x
- | otherwise =
- assert (q >= 2) .
- assert (k >= 0) .
- assert (k < q) .
- assert (G.all (not . isNaN) x) $
- xj + g * (xj1 - xj)
+ | G.any isNaN x = modErr "weightedAvg" "Sample contains NaNs"
+ | n == 1 = G.head x
+ | q < 2 = modErr "weightedAvg" "At least 2 quantiles is needed"
+ | k < 0 || k >= q = modErr "weightedAvg" "Wrong quantile number"
+ | otherwise = xj + g * (xj1 - xj)
where
j = floor idx
idx = fromIntegral (n - 1) * fromIntegral k / fromIntegral q
@@ -81,12 +78,11 @@ continuousBy :: G.Vector v Double =>
-> Int -- ^ /q/, the number of quantiles.
-> v Double -- ^ /x/, the sample data.
-> Double
-continuousBy (ContParam a b) k q x =
- assert (q >= 2) .
- assert (k >= 0) .
- assert (k <= q) .
- assert (G.all (not . isNaN) x) $
- (1-h) * item (j-1) + h * item j
+continuousBy (ContParam a b) k q x
+ | q < 2 = modErr "continuousBy" "At least 2 quantiles is needed"
+ | k < 0 || k > q = modErr "continuousBy" "Wrong quantile number"
+ | G.any isNaN x = modErr "continuousBy" "Sample contains NaNs"
+ | otherwise = (1-h) * item (j-1) + h * item j
where
j = floor (t + eps)
t = a + p * (fromIntegral n + 1 - a - b)
@@ -115,10 +111,10 @@ midspread :: G.Vector v Double =>
-> Int -- ^ /q/, the number of quantiles.
-> v Double -- ^ /x/, the sample data.
-> Double
-midspread (ContParam a b) k x =
- assert (G.all (not . isNaN) x) .
- assert (k > 0) $
- quantile (1-frac) - quantile frac
+midspread (ContParam a b) k x
+ | G.any isNaN x = modErr "midspread" "Sample contains NaNs"
+ | k <= 0 = modErr "midspread" "Nonpositive number of quantiles"
+ | otherwise = quantile (1-frac) - quantile frac
where
quantile i = (1-h i) * item (j i-1) + h i * item (j i)
j i = floor (t i + eps) :: Int
@@ -180,6 +176,9 @@ normalUnbiased = ContParam ta ta
where ta = 3/8
{-# INLINE normalUnbiased #-}
+modErr :: String -> String -> a
+modErr f err = error $ "Statistics.Quantile." ++ f ++ ": " ++ err
+
-- $references
--
-- * Weisstein, E.W. Quantile. /MathWorld/.
diff --git a/Statistics/Resampling/Bootstrap.hs b/Statistics/Resampling/Bootstrap.hs
index c84c497..6c63e88 100644
--- a/Statistics/Resampling/Bootstrap.hs
+++ b/Statistics/Resampling/Bootstrap.hs
@@ -22,7 +22,7 @@ module Statistics.Resampling.Bootstrap
import Control.DeepSeq (NFData)
import Control.Exception (assert)
-import Control.Monad.Par (runPar, parMap)
+import Control.Monad.Par (parMap,runPar)
import Data.Data (Data)
import Data.Typeable (Typeable)
import Data.Vector.Unboxed ((!))
@@ -79,9 +79,10 @@ bootstrapBCA :: Double -- ^ Confidence level
-> [Estimator] -- ^ Estimators
-> [Resample] -- ^ Resampled data
-> [Estimate]
-bootstrapBCA confidenceLevel sample estimators resamples =
- assert (confidenceLevel > 0 && confidenceLevel < 1)
- runPar $ parMap (uncurry e) (zip estimators resamples)
+bootstrapBCA confidenceLevel sample estimators resamples
+ | confidenceLevel > 0 && confidenceLevel < 1
+ = runPar $ parMap (uncurry e) (zip estimators resamples)
+ | otherwise = error "Statistics.Resampling.Bootstrap.bootstrapBCA: confidence level outside (0,1) range"
where
e est (Resample resample)
| U.length sample == 1 = estimate pt pt pt confidenceLevel
diff --git a/Statistics/Sample.hs b/Statistics/Sample.hs
index 671a137..ab98006 100644
--- a/Statistics/Sample.hs
+++ b/Statistics/Sample.hs
@@ -101,10 +101,7 @@ harmonicMean = fini . G.foldl' go (T 0 0)
-- | /O(n)/ Geometric mean of a sample containing no negative values.
geometricMean :: (G.Vector v Double) => v Double -> Double
-geometricMean = fini . G.foldl' go (T 1 0)
- where
- fini (T p n) = p ** (1 / fromIntegral n)
- go (T p n) a = T (p * a) (n + 1)
+geometricMean = exp . mean . G.map log
{-# INLINE geometricMean #-}
-- | Compute the /k/th central moment of a sample. The central moment
diff --git a/Statistics/Sample/Histogram.hs b/Statistics/Sample/Histogram.hs
index 1628418..e779a5c 100644
--- a/Statistics/Sample/Histogram.hs
+++ b/Statistics/Sample/Histogram.hs
@@ -29,6 +29,7 @@ import qualified Data.Vector.Generic.Mutable as GM
-- The result consists of a pair of vectors:
--
-- * The lower bound of each interval.
+--
-- * The number of samples within the interval.
--
-- Interval (bin) sizes are uniform, and the upper and lower bounds
diff --git a/Statistics/Sample/KernelDensity.hs b/Statistics/Sample/KernelDensity.hs
index 92bef49..def640a 100644
--- a/Statistics/Sample/KernelDensity.hs
+++ b/Statistics/Sample/KernelDensity.hs
@@ -34,6 +34,8 @@ import Statistics.Transform (dct, idct)
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U
+
+
-- | Gaussian kernel density estimator for one-dimensional data, using
-- the method of Botev et al.
--
@@ -53,6 +55,7 @@ kde n0 xs = kde_ n0 (lo - range / 10) (hi + range / 10) xs
where
(lo,hi) = minMax xs
range | U.length xs <= 1 = 1 -- Unreasonable guess
+ | lo == hi = 1 -- All elements are equal
| otherwise = hi - lo
-- | Gaussian kernel density estimator for one-dimensional data, using
@@ -74,7 +77,7 @@ kde_ :: Int
-> U.Vector Double -> (U.Vector Double, U.Vector Double)
kde_ n0 min max xs
| U.null xs = error "Statistics.KernelDensity.kde: empty sample"
- | n0 < 1 = error "Statistics.KernelDensity.kde: invalid number of points"
+ | n0 <= 1 = error "Statistics.KernelDensity.kde: invalid number of points"
| otherwise = (mesh, density)
where
mesh = G.generate ni $ \z -> min + (d * fromIntegral z)
diff --git a/Statistics/Test/KolmogorovSmirnov.hs b/Statistics/Test/KolmogorovSmirnov.hs
index 3da54f9..b0eb464 100644
--- a/Statistics/Test/KolmogorovSmirnov.hs
+++ b/Statistics/Test/KolmogorovSmirnov.hs
@@ -9,7 +9,8 @@
--
-- Kolmogorov-Smirnov tests are non-parametric tests for assessing
-- whether given sample could be described by distribution or whether
--- two samples have the same distribution.
+-- two samples have the same distribution. It's only applicable to
+-- continuous distributions.
module Statistics.Test.KolmogorovSmirnov (
-- * Kolmogorov-Smirnov test
kolmogorovSmirnovTest
diff --git a/Statistics/Transform.hs b/Statistics/Transform.hs
index 998b05c..087c2fd 100644
--- a/Statistics/Transform.hs
+++ b/Statistics/Transform.hs
@@ -43,16 +43,21 @@ type CD = Complex Double
-- | Discrete cosine transform (DCT-II).
dct :: U.Vector Double -> U.Vector Double
+{-# INLINE dct #-}
dct = dctWorker . G.map (:+0)
-- | Discrete cosine transform (DCT-II). Only real part of vector is
-- transformed, imaginary part is ignored.
dct_ :: U.Vector CD -> U.Vector Double
+{-# INLINE dct_ #-}
dct_ = dctWorker . G.map (\(i :+ _) -> i :+ 0)
dctWorker :: U.Vector CD -> U.Vector Double
dctWorker xs
- = G.map realPart $ G.zipWith (*) weights (fft interleaved)
+ -- length 1 is special cased because shuffle algorithms fail for it.
+ | G.length xs == 1 = G.map ((2*) . realPart) xs
+ | vectorOK xs = G.map realPart $ G.zipWith (*) weights (fft interleaved)
+ | otherwise = error "Statistics.Transform.dct: bad vector length"
where
interleaved = G.backpermute xs $ G.enumFromThenTo 0 2 (len-2) G.++
G.enumFromThenTo (len-1) (len-3) 1
@@ -66,17 +71,21 @@ dctWorker xs
-- | Inverse discrete cosine transform (DCT-III). It's inverse of
-- 'dct' only up to scale parameter:
--
--- > (idct . dct) x = (* lenngth x)
+-- > (idct . dct) x = (* length x)
idct :: U.Vector Double -> U.Vector Double
+{-# INLINE idct #-}
idct = idctWorker . G.map (:+0)
-- | Inverse discrete cosine transform (DCT-III). Only real part of vector is
-- transformed, imaginary part is ignored.
idct_ :: U.Vector CD -> U.Vector Double
+{-# INLINE idct_ #-}
idct_ = idctWorker . G.map (\(i :+ _) -> i :+ 0)
idctWorker :: U.Vector CD -> U.Vector Double
-idctWorker xs = G.generate len interleave
+idctWorker xs
+ | vectorOK xs = G.generate len interleave
+ | otherwise = error "Statistics.Transform.dct: bad vector length"
where
interleave z | even z = vals `G.unsafeIndex` halve z
| otherwise = vals `G.unsafeIndex` (len - halve z - 1)
@@ -90,19 +99,20 @@ idctWorker xs = G.generate len interleave
-- | Inverse fast Fourier transform.
ifft :: U.Vector CD -> U.Vector CD
-ifft xs = G.map ((/fi (G.length xs)) . conjugate) . fft . G.map conjugate $ xs
+ifft xs
+ | vectorOK xs = G.map ((/fi (G.length xs)) . conjugate) . fft . G.map conjugate $ xs
+ | otherwise = error "Statistics.Transform.ifft: bad vector length"
-- | Radix-2 decimation-in-time fast Fourier transform.
fft :: U.Vector CD -> U.Vector CD
-fft v = G.create $ do
- mv <- G.thaw v
- mfft mv
- return mv
+fft v | vectorOK v = G.create $ do mv <- G.thaw v
+ mfft mv
+ return mv
+ | otherwise = error "Statistics.Transform.fft: bad vector length"
+-- Vector length must be a power of two. It's not checked.
mfft :: (M.MVector v CD) => v s CD -> ST s ()
-mfft vec
- | 1 `shiftL` m /= len = error "Statistics.Transform.fft: bad vector size"
- | otherwise = bitReverse 0 0
+mfft vec = bitReverse 0 0
where
bitReverse i j | i == len-1 = stage 0 1
| otherwise = do
@@ -137,3 +147,8 @@ fi = fromIntegral
halve :: Int -> Int
halve = (`shiftR` 1)
+
+
+vectorOK :: U.Unbox a => U.Vector a -> Bool
+{-# INLINE vectorOK #-}
+vectorOK v = (1 `shiftL` log2 n) == n where n = G.length v
diff --git a/statistics.cabal b/statistics.cabal
index 711b0df..b6536b9 100644
--- a/statistics.cabal
+++ b/statistics.cabal
@@ -1,5 +1,5 @@
name: statistics
-version: 0.10.2.0
+version: 0.10.3.0
synopsis: A library of statistical types, data, and functions
description:
This library provides a number of common functions and types useful
@@ -22,6 +22,10 @@ description:
* Common statistical tests for significant differences between
samples.
.
+ Changes in 0.10.3.0
+ .
+ * Bug fixes
+ .
Changes in 0.10.2.0
.
* Bugs in DCT and IDCT are fixed.
@@ -167,6 +171,7 @@ library
Statistics.Distribution.Normal
Statistics.Distribution.Poisson
Statistics.Distribution.StudentT
+ Statistics.Distribution.Transform
Statistics.Distribution.Uniform
Statistics.Function
Statistics.Math
@@ -196,9 +201,9 @@ library
base < 5,
deepseq >= 1.1.0.2,
erf,
- monad-par >= 0.1.0.1,
+ monad-par >= 0.3.4,
mwc-random >= 0.11.0.0,
- math-functions >= 0.1.1,
+ math-functions >= 0.1.2,
primitive >= 0.3,
vector >= 0.7.1,
vector-algorithms >= 0.4
@@ -238,6 +243,7 @@ test-suite tests
test-framework-quickcheck2,
test-framework-hunit,
math-functions,
+ mwc-random,
statistics,
primitive,
vector,
diff --git a/tests/Tests/Distribution.hs b/tests/Tests/Distribution.hs
index c404d05..ad2a1f3 100644
--- a/tests/Tests/Distribution.hs
+++ b/tests/Tests/Distribution.hs
@@ -35,6 +35,7 @@ import Statistics.Distribution.Hypergeometric
import Statistics.Distribution.Normal
import Statistics.Distribution.Poisson
import Statistics.Distribution.StudentT
+import Statistics.Distribution.Transform
import Statistics.Distribution.Uniform
import Prelude hiding (catch)
@@ -53,6 +54,7 @@ distributionTests = testGroup "Tests for all distributions"
, contDistrTests (T :: T NormalDistribution )
, contDistrTests (T :: T UniformDistribution )
, contDistrTests (T :: T StudentT )
+ , contDistrTests (T :: T (LinearTransform StudentT) )
, contDistrTests (T :: T FDistribution )
, discreteDistrTests (T :: T BinomialDistribution )
@@ -82,14 +84,17 @@ discreteDistrTests t = testGroup ("Tests for: " ++ typeName t) $
cdfTests t ++
[ testProperty "Prob. sanity" $ probSanityCheck t
, testProperty "CDF is sum of prob." $ discreteCDFcorrect t
+ , testProperty "Discrete CDF is OK" $ cdfDiscreteIsCorrect t
]
-- Tests for distributions which have CDF
cdfTests :: (Param d, Distribution d, QC.Arbitrary d, Show d) => T d -> [Test]
cdfTests t =
[ testProperty "C.D.F. sanity" $ cdfSanityCheck t
- , testProperty "CDF limit at +∞" $ cdfLimitAtPosInfinity t
- , testProperty "CDF limit at -∞" $ cdfLimitAtNegInfinity t
+ , testProperty "CDF limit at +inf" $ cdfLimitAtPosInfinity t
+ , testProperty "CDF limit at -inf" $ cdfLimitAtNegInfinity t
+ , testProperty "CDF at +inf = 1" $ cdfAtPosInfinity t
+ , testProperty "CDF at -inf = 0" $ cdfAtNegInfinity t
, testProperty "CDF is nondecreasing" $ cdfIsNondecreasing t
, testProperty "1-CDF is correct" $ cdfComplementIsCorrect t
]
@@ -104,13 +109,23 @@ cdfSanityCheck _ d x = c >= 0 && c <= 1
cdfIsNondecreasing :: (Distribution d) => T d -> d -> Double -> Double -> Bool
cdfIsNondecreasing _ d = monotonicallyIncreasesIEEE $ cumulative d
+-- cumulative d +∞ = 1
+cdfAtPosInfinity :: (Param d, Distribution d) => T d -> d -> Bool
+cdfAtPosInfinity _ d
+ = cumulative d (1/0) == 1
+
+-- cumulative d -∞ = 0
+cdfAtNegInfinity :: (Param d, Distribution d) => T d -> d -> Bool
+cdfAtNegInfinity _ d
+ = cumulative d (-1/0) == 0
+
-- CDF limit at +∞ is 1
cdfLimitAtPosInfinity :: (Param d, Distribution d) => T d -> d -> Property
cdfLimitAtPosInfinity _ d =
okForInfLimit d ==> printTestCase ("Last elements: " ++ show (drop 990 probs))
$ Just 1.0 == (find (>=1) probs)
where
- probs = take 1000 $ map (cumulative d) $ iterate (*1.4) 1
+ probs = take 1000 $ map (cumulative d) $ iterate (*1.4) 1000
-- CDF limit at -∞ is 0
cdfLimitAtNegInfinity :: (Param d, Distribution d) => T d -> d -> Property
@@ -126,6 +141,29 @@ cdfLimitAtNegInfinity _ d =
cdfComplementIsCorrect :: (Distribution d) => T d -> d -> Double -> Bool
cdfComplementIsCorrect _ d x = (eq 1e-14) 1 (cumulative d x + complCumulative d x)
+-- CDF for discrete distribution uses <= for comparison
+cdfDiscreteIsCorrect :: (DiscreteDistr d) => T d -> d -> Property
+cdfDiscreteIsCorrect _ d
+ = printTestCase (unlines $ map show badN)
+ $ null badN
+ where
+ -- We are checking that:
+ --
+ -- > CDF(i) - CDF(i-e) = P(i)
+ --
+ -- Approximate equality is tricky here. Scale is set by maximum
+ -- value of CDF and probability. Case when all probabilities are
+ -- zero should be treated specially.
+ badN = [ (i,p,p1,dp, (p1-p-dp) / max p1 dp)
+ | i <- [0 .. 100]
+ , let p = cumulative d $ fromIntegral i - 1e-6
+ p1 = cumulative d $ fromIntegral i
+ dp = probability d i
+ relerr = ((p1 - p) - dp) / max p1 dp
+ , not (p == 0 && p1 == 0 && dp == 0)
+ && relerr > 1e-14
+ ]
+
-- PDF is positive
pdfSanityCheck :: (ContDistr d) => T d -> d -> Double -> Bool
@@ -162,9 +200,9 @@ probSanityCheck _ d x = p >= 0 && p <= 1
-- Check that discrete CDF is correct
discreteCDFcorrect :: (DiscreteDistr d) => T d -> d -> Int -> Int -> Property
discreteCDFcorrect _ d a b
- = printTestCase (printf "CDF = %g" p1)
- $ printTestCase (printf "Sum = %g" p2)
- $ printTestCase (printf "Δ = %g" (abs (p1 - p2)))
+ = printTestCase (printf "CDF = %g" p1)
+ $ printTestCase (printf "Sum = %g" p2)
+ $ printTestCase (printf "Delta = %g" (abs (p1 - p2)))
$ abs (p1 - p2) < 3e-10
-- Avoid too large differences. Otherwise there is too much to sum
--
@@ -213,6 +251,11 @@ instance QC.Arbitrary CauchyDistribution where
<*> ((abs <$> arbitrary) `suchThat` (> 0))
instance QC.Arbitrary StudentT where
arbitrary = studentT <$> ((abs <$> arbitrary) `suchThat` (>0))
+instance QC.Arbitrary (LinearTransform StudentT) where
+ arbitrary = studentTUnstandardized
+ <$> ((abs <$> arbitrary) `suchThat` (>0))
+ <*> ((abs <$> arbitrary))
+ <*> ((abs <$> arbitrary) `suchThat` (>0))
instance QC.Arbitrary FDistribution where
arbitrary = fDistribution
<$> ((abs <$> arbitrary) `suchThat` (>0))
@@ -237,6 +280,10 @@ instance Param StudentT where
invQuantilePrec _ = 1e-13
okForInfLimit d = studentTndf d > 0.75
+instance Param (LinearTransform StudentT) where
+ invQuantilePrec _ = 1e-13
+ okForInfLimit d = (studentTndf . linTransDistr) d > 0.75
+
instance Param FDistribution where
invQuantilePrec _ = 1e-12
@@ -257,6 +304,13 @@ unitTests = testGroup "Unit tests"
, testStudentCDF 0.3 3.34 0.757146 -- CDF
, testStudentCDF 1 0.42 0.626569
, testStudentCDF 4.4 0.33 0.621739
+ -- Student-T General
+ , testStudentUnstandardizedPDF 0.3 1.2 4 0.45 0.0533456 -- PDF
+ , testStudentUnstandardizedPDF 4.3 (-2.4) 3.22 (-0.6) 0.0971141
+ , testStudentUnstandardizedPDF 3.8 0.22 7.62 0.14 0.0490523
+ , testStudentUnstandardizedCDF 0.3 1.2 4 0.45 0.458035 -- CDF
+ , testStudentUnstandardizedCDF 4.3 (-2.4) 3.22 (-0.6) 0.698001
+ , testStudentUnstandardizedCDF 3.8 0.22 7.62 0.14 0.496076
-- F-distribution
, testFdistrPDF 1 3 3 (1/(6 * pi)) -- PDF
, testFdistrPDF 2 2 1.2 0.206612
@@ -268,17 +322,24 @@ unitTests = testGroup "Unit tests"
where
-- Student-T
testStudentPDF ndf x exact
- = testAssertion (printf "density (studentT %f) %f ≈ %f" ndf x exact)
+ = testAssertion (printf "density (studentT %f) %f ~ %f" ndf x exact)
$ eq 1e-5 exact (density (studentT ndf) x)
testStudentCDF ndf x exact
- = testAssertion (printf "cumulative (studentT %f) %f ≈ %f" ndf x exact)
+ = testAssertion (printf "cumulative (studentT %f) %f ~ %f" ndf x exact)
$ eq 1e-5 exact (cumulative (studentT ndf) x)
+ -- Student-T General
+ testStudentUnstandardizedPDF ndf mu sigma x exact
+ = testAssertion (printf "density (studentTUnstandardized %f %f %f) %f ~ %f" ndf mu sigma x exact)
+ $ eq 1e-5 exact (density (studentTUnstandardized ndf mu sigma) x)
+ testStudentUnstandardizedCDF ndf mu sigma x exact
+ = testAssertion (printf "cumulative (studentTUnstandardized %f %f %f) %f ~ %f" ndf mu sigma x exact)
+ $ eq 1e-5 exact (cumulative (studentTUnstandardized ndf mu sigma) x)
-- F-distribution
testFdistrPDF n m x exact
- = testAssertion (printf "density (fDistribution %i %i) %f ≈ %f [got %f]" n m x exact d)
+ = testAssertion (printf "density (fDistribution %i %i) %f ~ %f [got %f]" n m x exact d)
$ eq 1e-5 exact d
where d = density (fDistribution n m) x
testFdistrCDF n m x exact
- = testAssertion (printf "cumulative (fDistribution %i %i) %f ≈ %f [got %f]" n m x exact d)
+ = testAssertion (printf "cumulative (fDistribution %i %i) %f ~ %f [got %f]" n m x exact d)
$ eq 1e-5 exact d
where d = cumulative (fDistribution n m) x
diff --git a/tests/Tests/Function.hs b/tests/Tests/Function.hs
index 318c667..dac5818 100644
--- a/tests/Tests/Function.hs
+++ b/tests/Tests/Function.hs
@@ -7,13 +7,15 @@ import Test.QuickCheck
import Test.Framework
import Test.Framework.Providers.QuickCheck2
+import Tests.Helpers
import Statistics.Function
tests :: Test
tests = testGroup "S.Function"
- [ testProperty "Sort is sort" p_sort
+ [ testProperty "Sort is sort" p_sort
+ , testAssertion "nextHighestPowerOfTwo is OK" p_nextHighestPowerOfTwo
]
@@ -23,4 +25,9 @@ p_sort xs =
where
v = sort $ U.fromList xs
- \ No newline at end of file
+p_nextHighestPowerOfTwo :: Bool
+p_nextHighestPowerOfTwo
+ = all (\(good, is) -> all ((==good) . nextHighestPowerOfTwo) is) lists
+ where
+ pows = [1 .. 17]
+ lists = [ (2^m, [2^n+1 .. 2^m]) | (n,m) <- pows `zip` tail pows ]
diff --git a/tests/Tests/Transform.hs b/tests/Tests/Transform.hs
index 192eef9..8d933a3 100644
--- a/tests/Tests/Transform.hs
+++ b/tests/Tests/Transform.hs
@@ -37,6 +37,8 @@ tests = testGroup "fft" [
, testProperty "dct . idct = id [up to scale]"
(t_fftInverse (\v -> U.map (/ (2 * fromIntegral (U.length v))) $ idct $ dct v))
-- Exact small size DCT
+ -- 1
+ , testDCT [1] $ [2]
-- 2
, testDCT [1,0] $ map (*2) [1, cos (pi/4) ]
, testDCT [0,1] $ map (*2) [1, cos (3*pi/4) ]
@@ -46,6 +48,8 @@ tests = testGroup "fft" [
, testDCT [0,0,1,0] $ map (*2) [1, cos(5*pi/8), cos(10*pi/8), cos(15*pi/8)]
, testDCT [0,0,0,1] $ map (*2) [1, cos(7*pi/8), cos(14*pi/8), cos(21*pi/8)]
-- Exact small size IDCT
+ -- 1
+ , testIDCT [1] [1]
-- 2
, testIDCT [1,0] [1, 1 ]
, testIDCT [0,1] $ map (*2) [cos(pi/4), cos(3*pi/4)]