summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexeyKhudyakov <>2016-06-02 15:33:00 (GMT)
committerhdiff <hdiff@hdiff.luite.com>2016-06-02 15:33:00 (GMT)
commit209598757c0acd2982e94d80cedc6d89e7a9da76 (patch)
treef74dcc32c89ca871a4bf94b5d841bd82321bc626
parentcff5027d7f80160278c4136332d5be7472ed00b7 (diff)
version 0.13.3.00.13.3.0
-rw-r--r--Statistics/Correlation.hs70
-rw-r--r--Statistics/Correlation/Kendall.hs8
-rw-r--r--Statistics/Distribution.hs4
-rw-r--r--Statistics/Distribution/Exponential.hs2
-rw-r--r--Statistics/Distribution/Laplace.hs125
-rw-r--r--Statistics/Matrix.hs123
-rw-r--r--Statistics/Matrix/Mutable.hs12
-rw-r--r--Statistics/Sample.hs50
-rw-r--r--Statistics/Sample/KernelDensity.hs30
-rw-r--r--Statistics/Test/Internal.hs46
-rw-r--r--Statistics/Transform.hs52
-rw-r--r--benchmark/bench.hs5
-rw-r--r--examples/kde/KDE.hs7
-rw-r--r--statistics.cabal6
-rw-r--r--tests/Tests/Correlation.hs144
-rw-r--r--tests/Tests/Distribution.hs4
-rw-r--r--tests/Tests/Transform.hs4
17 files changed, 637 insertions, 55 deletions
diff --git a/Statistics/Correlation.hs b/Statistics/Correlation.hs
new file mode 100644
index 0000000..218274d
--- /dev/null
+++ b/Statistics/Correlation.hs
@@ -0,0 +1,70 @@
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE BangPatterns #-}
+-- |
+-- Module : Statistics.Correlation.Pearson
+--
+module Statistics.Correlation
+ ( -- * Pearson correlation
+ pearson
+ , pearsonMatByRow
+ -- * Spearman correlation
+ , spearman
+ , spearmanMatByRow
+ ) where
+
+import qualified Data.Vector.Generic as G
+import qualified Data.Vector.Unboxed as U
+import Statistics.Matrix
+import Statistics.Sample
+import Statistics.Test.Internal (rankUnsorted)
+
+
+----------------------------------------------------------------
+-- Pearson
+----------------------------------------------------------------
+
+-- | Pearson correlation for sample of pairs.
+pearson :: (G.Vector v (Double, Double), G.Vector v Double)
+ => v (Double, Double) -> Double
+pearson = correlation
+{-# INLINE pearson #-}
+
+-- | Compute pairwise pearson correlation between rows of a matrix
+pearsonMatByRow :: Matrix -> Matrix
+pearsonMatByRow m
+ = generateSym (rows m)
+ (\i j -> pearson $ row m i `U.zip` row m j)
+{-# INLINE pearsonMatByRow #-}
+
+
+
+----------------------------------------------------------------
+-- Spearman
+----------------------------------------------------------------
+
+-- | compute spearman correlation between two samples
+spearman :: ( Ord a
+ , Ord b
+ , G.Vector v a
+ , G.Vector v b
+ , G.Vector v (a, b)
+ , G.Vector v Int
+ , G.Vector v Double
+ , G.Vector v (Double, Double)
+ , G.Vector v (Int, a)
+ , G.Vector v (Int, b)
+ )
+ => v (a, b)
+ -> Double
+spearman xy
+ = pearson
+ $ G.zip (rankUnsorted x) (rankUnsorted y)
+ where
+ (x, y) = G.unzip xy
+{-# INLINE spearman #-}
+
+-- | compute pairwise spearman correlation between rows of a matrix
+spearmanMatByRow :: Matrix -> Matrix
+spearmanMatByRow
+ = pearsonMatByRow . fromRows . fmap rankUnsorted . toRows
+{-# INLINE spearmanMatByRow #-}
diff --git a/Statistics/Correlation/Kendall.hs b/Statistics/Correlation/Kendall.hs
index e2fed78..4cd7de3 100644
--- a/Statistics/Correlation/Kendall.hs
+++ b/Statistics/Correlation/Kendall.hs
@@ -8,11 +8,11 @@
-- This module implementes Kendall's tau form b which allows ties in the data.
-- This is the same formula used by other statistical packages, e.g., R, matlab.
--
--- $$\tau = \frac{n_c - n_d}{\sqrt{(n_0 - n_1)(n_0 - n_2)}}$$
+-- > \tau = \frac{n_c - n_d}{\sqrt{(n_0 - n_1)(n_0 - n_2)}}
--
--- where $n_0 = n(n-1)/2$, $n_1 = number of pairs tied for the first quantify$,
--- $n_2 = number of pairs tied for the second quantify$,
--- $n_c = number of concordant pairs$, $n_d = number of discordant pairs$.
+-- where n_0 = n(n-1)\/2, n_1 = number of pairs tied for the first quantity,
+-- n_2 = number of pairs tied for the second quantity,
+-- n_c = number of concordant pairs, n_d = number of discordant pairs.
module Statistics.Correlation.Kendall
( kendall
diff --git a/Statistics/Distribution.hs b/Statistics/Distribution.hs
index 405355d..bde1dc1 100644
--- a/Statistics/Distribution.hs
+++ b/Statistics/Distribution.hs
@@ -8,7 +8,7 @@
-- Stability : experimental
-- Portability : portable
--
--- Types classes for probability distrubutions
+-- Type classes for probability distributions
module Statistics.Distribution
(
@@ -57,7 +57,7 @@ class Distribution d where
--
-- > complCumulative d x = 1 - cumulative d x
--
- -- It's useful when one is interested in P(/X/</x/) and
+ -- It's useful when one is interested in P(/X/>/x/) and
-- expression on the right side begin to lose precision. This
-- function have default implementation but implementors are
-- encouraged to provide more precise implementation.
diff --git a/Statistics/Distribution/Exponential.hs b/Statistics/Distribution/Exponential.hs
index fa07516..e367547 100644
--- a/Statistics/Distribution/Exponential.hs
+++ b/Statistics/Distribution/Exponential.hs
@@ -98,7 +98,7 @@ quantile (ED l) p
error $ "Statistics.Distribution.Exponential.quantile: p must be in [0,1] range. Got: "++show p
-- | Create an exponential distribution.
-exponential :: Double -- ^ &#955; (scale) parameter.
+exponential :: Double -- ^ Rate parameter.
-> ExponentialDistribution
exponential l
| l <= 0 =
diff --git a/Statistics/Distribution/Laplace.hs b/Statistics/Distribution/Laplace.hs
new file mode 100644
index 0000000..b9d10af
--- /dev/null
+++ b/Statistics/Distribution/Laplace.hs
@@ -0,0 +1,125 @@
+{-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-}
+-- |
+-- Module : Statistics.Distribution.Laplace
+-- Copyright : (c) 2015 Mihai Maruseac
+-- License : BSD3
+--
+-- Maintainer : mihai.maruseac@maruseac.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- The Laplace distribution. This is the continuous probability
+-- distribution defined as the difference of two iid exponential random
+-- variables or a Brownian motion evaluated at exponentially distributed times.
+-- It is used in differential privacy (Laplace Method), speech
+-- recognition and least absolute deviations method (Laplace's first
+-- law of errors, giving a robust regression method)
+--
+
+module Statistics.Distribution.Laplace
+ (
+ LaplaceDistribution
+ -- * Constructors
+ , laplace
+ , laplaceFromSample
+ -- * Accessors
+ , ldLocation
+ , ldScale
+ ) where
+
+import Data.Aeson (FromJSON, ToJSON)
+import Data.Binary (Binary(..))
+import Data.Data (Data, Typeable)
+import GHC.Generics (Generic)
+import qualified Data.Vector.Generic as G
+import qualified Statistics.Distribution as D
+import qualified Statistics.Quantile as Q
+import qualified Statistics.Sample as S
+import Statistics.Types (Sample)
+import Control.Applicative ((<$>), (<*>))
+
+
+data LaplaceDistribution = LD {
+ ldLocation :: {-# UNPACK #-} !Double
+ -- ^ Location.
+ , ldScale :: {-# UNPACK #-} !Double
+ -- ^ Scale.
+ } deriving (Eq, Read, Show, Typeable, Data, Generic)
+
+instance FromJSON LaplaceDistribution
+instance ToJSON LaplaceDistribution
+
+instance Binary LaplaceDistribution where
+ put (LD l s) = put l >> put s
+ get = LD <$> get <*> get
+
+instance D.Distribution LaplaceDistribution where
+ cumulative = cumulative
+ complCumulative = complCumulative
+
+instance D.ContDistr LaplaceDistribution where
+ density (LD l s) x = exp (- abs (x - l) / s) / (2 * s)
+ logDensity (LD l s) x = - abs (x - l) / s - log 2 - log s
+ quantile = quantile
+
+instance D.Mean LaplaceDistribution where
+ mean (LD l _) = l
+
+instance D.Variance LaplaceDistribution where
+ variance (LD _ s) = 2 * s * s
+
+instance D.MaybeMean LaplaceDistribution where
+ maybeMean = Just . D.mean
+
+instance D.MaybeVariance LaplaceDistribution where
+ maybeStdDev = Just . D.stdDev
+ maybeVariance = Just . D.variance
+
+instance D.Entropy LaplaceDistribution where
+ entropy (LD _ s) = 1 + log (2 * s)
+
+instance D.MaybeEntropy LaplaceDistribution where
+ maybeEntropy = Just . D.entropy
+
+instance D.ContGen LaplaceDistribution where
+ genContVar = D.genContinous
+
+cumulative :: LaplaceDistribution -> Double -> Double
+cumulative (LD l s) x
+ | x <= l = 0.5 * exp ( (x - l) / s)
+ | otherwise = 1 - 0.5 * exp ( - (x - l) / s )
+
+complCumulative :: LaplaceDistribution -> Double -> Double
+complCumulative (LD l s) x
+ | x <= l = 1 - 0.5 * exp ( (x - l) / s)
+ | otherwise = 0.5 * exp ( - (x - l) / s )
+
+quantile :: LaplaceDistribution -> Double -> Double
+quantile (LD l s) p
+ | p == 0 = -inf
+ | p == 1 = inf
+ | p == 0.5 = l
+ | p > 0 && p < 0.5 = l + s * log (2 * p)
+ | p > 0.5 && p < 1 = l - s * log (2 - 2 * p)
+ | otherwise =
+ error $ "Statistics.Distribution.Laplace.quantile: p must be in [0,1] range. Got: "++show p
+ where
+ inf = 1 / 0
+
+-- | Create a Laplace distribution.
+laplace :: Double -- ^ Location
+ -> Double -- ^ Scale
+ -> LaplaceDistribution
+laplace l s
+ | s <= 0 =
+ error $ "Statistics.Distribution.Laplace.laplace: scale parameter must be positive. Got " ++ show s
+ | otherwise = LD l s
+
+-- | Create Laplace distribution from sample. No tests are made to
+-- check whether it truly is Laplace. Location of distribution
+-- estimated as median of sample.
+laplaceFromSample :: Sample -> LaplaceDistribution
+laplaceFromSample xs = LD s l
+ where
+ s = Q.continuousBy Q.medianUnbiased 1 2 xs
+ l = S.mean $ G.map (\x -> abs $ x - s) xs
diff --git a/Statistics/Matrix.hs b/Statistics/Matrix.hs
index d2f2c1c..2b01607 100644
--- a/Statistics/Matrix.hs
+++ b/Statistics/Matrix.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE PatternGuards #-}
-- |
-- Module : Statistics.Matrix
-- Copyright : 2011 Aleksey Khudyakov, 2014 Bryan O'Sullivan
@@ -9,13 +10,25 @@
-- we implement the necessary minimum here.
module Statistics.Matrix
- (
+ ( -- * Data types
Matrix(..)
, Vector
- , fromList
+ -- * Conversion from/to lists/vectors
, fromVector
+ , fromList
+ , fromRowLists
+ , fromRows
+ , fromColumns
, toVector
, toList
+ , toRows
+ , toColumns
+ , toRowLists
+ -- * Other
+ , generate
+ , generateSym
+ , ident
+ , diag
, dimension
, center
, multiply
@@ -34,10 +47,21 @@ module Statistics.Matrix
) where
import Prelude hiding (exponent, map, sum)
+import Control.Applicative ((<$>))
+import Control.Monad.ST
+import qualified Data.Vector.Unboxed as U
+import Data.Vector.Unboxed ((!))
+import qualified Data.Vector.Unboxed.Mutable as UM
+
import Statistics.Function (for, square)
import Statistics.Matrix.Types
+import Statistics.Matrix.Mutable (unsafeNew,unsafeWrite,unsafeFreeze)
import Statistics.Sample.Internal (sum)
-import qualified Data.Vector.Unboxed as U
+
+
+----------------------------------------------------------------
+-- Conversion to/from vectors/lists
+----------------------------------------------------------------
-- | Convert from a row-major list.
fromList :: Int -- ^ Number of rows.
@@ -46,6 +70,10 @@ fromList :: Int -- ^ Number of rows.
-> Matrix
fromList r c = fromVector r c . U.fromList
+-- | create a matrix from a list of lists, as rows
+fromRowLists :: [[Double]] -> Matrix
+fromRowLists = fromRows . fmap U.fromList
+
-- | Convert from a row-major vector.
fromVector :: Int -- ^ Number of rows.
-> Int -- ^ Number of columns.
@@ -56,6 +84,22 @@ fromVector r c v
| otherwise = Matrix r c 0 v
where len = U.length v
+-- | create a matrix from a list of vectors, as rows
+fromRows :: [Vector] -> Matrix
+fromRows xs
+ | [] <- xs = error "Statistics.Matrix.fromRows: empty list of rows!"
+ | any (/=nCol) ns = error "Statistics.Matrix.fromRows: row sizes do not match"
+ | nCol == 0 = error "Statistics.Matrix.fromRows: zero columns in matrix"
+ | otherwise = fromVector nRow nCol (U.concat xs)
+ where
+ nCol:ns = U.length <$> xs
+ nRow = length xs
+
+
+-- | create a matrix from a list of vectors, as columns
+fromColumns :: [Vector] -> Matrix
+fromColumns = transpose . fromRows
+
-- | Convert to a row-major flat vector.
toVector :: Matrix -> U.Vector Double
toVector (Matrix _ _ _ v) = v
@@ -64,6 +108,78 @@ toVector (Matrix _ _ _ v) = v
toList :: Matrix -> [Double]
toList = U.toList . toVector
+-- | Convert to a list of lists, as rows
+toRowLists :: Matrix -> [[Double]]
+toRowLists (Matrix _ nCol _ v)
+ = chunks $ U.toList v
+ where
+ chunks [] = []
+ chunks xs = case splitAt nCol xs of
+ (rowE,rest) -> rowE : chunks rest
+
+
+-- | Convert to a list of vectors, as rows
+toRows :: Matrix -> [Vector]
+toRows (Matrix _ nCol _ v) = chunks v
+ where
+ chunks xs
+ | U.null xs = []
+ | otherwise = case U.splitAt nCol xs of
+ (rowE,rest) -> rowE : chunks rest
+
+-- | Convert to a list of vectors, as columns
+toColumns :: Matrix -> [Vector]
+toColumns = toRows . transpose
+
+
+
+----------------------------------------------------------------
+-- Other
+----------------------------------------------------------------
+
+-- | Generate matrix using function
+generate :: Int -- ^ Number of rows
+ -> Int -- ^ Number of columns
+ -> (Int -> Int -> Double)
+ -- ^ Function which takes /row/ and /column/ as argument.
+ -> Matrix
+generate nRow nCol f
+ = Matrix nRow nCol 0 $ U.generate (nRow*nCol) $ \i ->
+ let (r,c) = i `quotRem` nCol in f r c
+
+-- | Generate symmetric square matrix using function
+generateSym
+ :: Int -- ^ Number of rows and columns
+ -> (Int -> Int -> Double)
+ -- ^ Function which takes /row/ and /column/ as argument. It must
+ -- be symmetric in arguments: @f i j == f j i@
+ -> Matrix
+generateSym n f = runST $ do
+ m <- unsafeNew n n
+ for 0 n $ \r -> do
+ unsafeWrite m r r (f r r)
+ for (r+1) n $ \c -> do
+ let x = f r c
+ unsafeWrite m r c x
+ unsafeWrite m c r x
+ unsafeFreeze m
+
+
+-- | Create the square identity matrix with given dimensions.
+ident :: Int -> Matrix
+ident n = diag $ U.replicate n 1.0
+
+-- | Create a square matrix with given diagonal, other entries default to 0
+diag :: Vector -> Matrix
+diag v
+ = Matrix n n 0 $ U.create $ do
+ arr <- UM.replicate (n*n) 0
+ for 0 n $ \i ->
+ UM.unsafeWrite arr (i*n + i) (v ! i)
+ return arr
+ where
+ n = U.length v
+
-- | Return the dimensions of this matrix, as a (row,column) pair.
dimension :: Matrix -> (Int, Int)
dimension (Matrix r c _ _) = (r, c)
@@ -125,6 +241,7 @@ unsafeIndex :: Matrix
-> Double
unsafeIndex = unsafeBounds U.unsafeIndex
+-- | Apply function to every element of matrix
map :: (Double -> Double) -> Matrix -> Matrix
map f (Matrix r c e v) = Matrix r c e (U.map f v)
diff --git a/Statistics/Matrix/Mutable.hs b/Statistics/Matrix/Mutable.hs
index 8e4e68a..24f58be 100644
--- a/Statistics/Matrix/Mutable.hs
+++ b/Statistics/Matrix/Mutable.hs
@@ -12,6 +12,7 @@ module Statistics.Matrix.Mutable
, replicate
, thaw
, bounds
+ , unsafeNew
, unsafeFreeze
, unsafeRead
, unsafeWrite
@@ -37,6 +38,17 @@ thaw (Matrix r c e v) = MMatrix r c e <$> U.thaw v
unsafeFreeze :: MMatrix s -> ST s Matrix
unsafeFreeze (MMatrix r c e mv) = Matrix r c e <$> U.unsafeFreeze mv
+-- | Allocate new matrix. Matrix content is not initialized hence unsafe.
+unsafeNew :: Int -- ^ Number of rows
+ -> Int -- ^ Number of columns
+ -> ST s (MMatrix s)
+unsafeNew r c
+ | r < 0 = error "Statistics.Matrix.Mutable.unsafeNew: negative number of rows"
+ | c < 0 = error "Statistics.Matrix.Mutable.unsafeNew: negative number of columns"
+ | otherwise = do
+ vec <- M.new (r*c)
+ return $ MMatrix r c 0 vec
+
unsafeRead :: MMatrix s -> Int -> Int -> ST s Double
unsafeRead mat r c = unsafeBounds mat r c M.unsafeRead
{-# INLINE unsafeRead #-}
diff --git a/Statistics/Sample.hs b/Statistics/Sample.hs
index 94775db..f4ffb09 100644
--- a/Statistics/Sample.hs
+++ b/Statistics/Sample.hs
@@ -50,6 +50,10 @@ module Statistics.Sample
, fastVarianceUnbiased
, fastStdDev
+ -- * Joint distributions
+ , covariance
+ , correlation
+ , pair
-- * References
-- $references
) where
@@ -340,6 +344,52 @@ fastStdDev :: (G.Vector v Double) => v Double -> Double
fastStdDev = sqrt . fastVariance
{-# INLINE fastStdDev #-}
+-- | Covariance of sample of pairs. For empty sample it's set to
+-- zero
+covariance :: (G.Vector v (Double,Double), G.Vector v Double)
+ => v (Double,Double)
+ -> Double
+covariance xy
+ | n == 0 = 0
+ | otherwise = mean $ G.zipWith (*)
+ (G.map (\x -> x - muX) xs)
+ (G.map (\y -> y - muY) ys)
+ where
+ n = G.length xy
+ (xs,ys) = G.unzip xy
+ muX = mean xs
+ muY = mean ys
+{-# SPECIALIZE covariance :: U.Vector (Double,Double) -> Double #-}
+{-# SPECIALIZE covariance :: V.Vector (Double,Double) -> Double #-}
+
+-- | Correlation coefficient for sample of pairs. Also known as
+-- Pearson's correlation. For empty sample it's set to zero.
+correlation :: (G.Vector v (Double,Double), G.Vector v Double)
+ => v (Double,Double)
+ -> Double
+correlation xy
+ | n == 0 = 0
+ | otherwise = cov / sqrt (varX * varY)
+ where
+ n = G.length xy
+ (xs,ys) = G.unzip xy
+ (muX,varX) = meanVariance xs
+ (muY,varY) = meanVariance ys
+ cov = mean $ G.zipWith (*)
+ (G.map (\x -> x - muX) xs)
+ (G.map (\y -> y - muY) ys)
+{-# SPECIALIZE correlation :: U.Vector (Double,Double) -> Double #-}
+{-# SPECIALIZE correlation :: V.Vector (Double,Double) -> Double #-}
+
+
+-- | Pair two samples. It's like 'G.zip' but requires that both
+-- samples have equal size.
+pair :: (G.Vector v a, G.Vector v b, G.Vector v (a,b)) => v a -> v b -> v (a,b)
+pair va vb
+ | G.length va == G.length vb = G.zip va vb
+ | otherwise = error "Statistics.Sample.pair: vector must have same length"
+{-# INLINE pair #-}
+
------------------------------------------------------------------------
-- Helper code. Monomorphic unpacked accumulators.
diff --git a/Statistics/Sample/KernelDensity.hs b/Statistics/Sample/KernelDensity.hs
index c78d307..bfee05e 100644
--- a/Statistics/Sample/KernelDensity.hs
+++ b/Statistics/Sample/KernelDensity.hs
@@ -31,9 +31,10 @@ import Statistics.Function (minMax, nextHighestPowerOfTwo)
import Statistics.Math.RootFinding (fromRoot, ridders)
import Statistics.Sample.Histogram (histogram_)
import Statistics.Sample.Internal (sum)
-import Statistics.Transform (dct, idct)
-import qualified Data.Vector.Generic as G
-import qualified Data.Vector.Unboxed as U
+import Statistics.Transform (CD, dct, idct)
+import qualified Data.Vector.Generic as G
+import qualified Data.Vector.Unboxed as U
+import qualified Data.Vector as V
-- | Gaussian kernel density estimator for one-dimensional data, using
@@ -46,17 +47,22 @@ import qualified Data.Vector.Unboxed as U
-- mesh interval, use 'kde_'.)
--
-- * Density estimates at each mesh point.
-kde :: Int
+kde :: (G.Vector v CD, G.Vector v Double, G.Vector v Int)
+ => Int
-- ^ The number of mesh points to use in the uniform discretization
-- of the interval @(min,max)@. If this value is not a power of
-- two, then it is rounded up to the next power of two.
- -> U.Vector Double -> (U.Vector Double, U.Vector Double)
+ -> v Double -> (v Double, v Double)
kde n0 xs = kde_ n0 (lo - range / 10) (hi + range / 10) xs
where
(lo,hi) = minMax xs
- range | U.length xs <= 1 = 1 -- Unreasonable guess
+ range | G.length xs <= 1 = 1 -- Unreasonable guess
| lo == hi = 1 -- All elements are equal
| otherwise = hi - lo
+{-# INLINABLE kde #-}
+{-# SPECIALIZE kde :: Int -> U.Vector Double -> (U.Vector Double, U.Vector Double) #-}
+{-# SPECIALIZE kde :: Int -> V.Vector Double -> (V.Vector Double, V.Vector Double) #-}
+
-- | Gaussian kernel density estimator for one-dimensional data, using
-- the method of Botev et al.
@@ -66,7 +72,8 @@ kde n0 xs = kde_ n0 (lo - range / 10) (hi + range / 10) xs
-- * The coordinates of each mesh point.
--
-- * Density estimates at each mesh point.
-kde_ :: Int
+kde_ :: (G.Vector v CD, G.Vector v Double, G.Vector v Int)
+ => Int
-- ^ The number of mesh points to use in the uniform discretization
-- of the interval @(min,max)@. If this value is not a power of
-- two, then it is rounded up to the next power of two.
@@ -74,9 +81,10 @@ kde_ :: Int
-- ^ Lower bound (@min@) of the mesh range.
-> Double
-- ^ Upper bound (@max@) of the mesh range.
- -> U.Vector Double -> (U.Vector Double, U.Vector Double)
+ -> v Double
+ -> (v Double, v Double)
kde_ n0 min max xs
- | U.null xs = error "Statistics.KernelDensity.kde: empty sample"
+ | G.null xs = error "Statistics.KernelDensity.kde: empty sample"
| n0 <= 1 = error "Statistics.KernelDensity.kde: invalid number of points"
| otherwise = (mesh, density)
where
@@ -103,6 +111,10 @@ kde_ n0 min max xs
const = (1 + 0.5 ** (s+0.5)) / 3
k0 = U.product (G.enumFromThenTo 1 3 (2*s-1)) / m_sqrt_2_pi
sqr x = x * x
+{-# INLINABLE kde_ #-}
+{-# SPECIALIZE kde_ :: Int -> Double -> Double -> U.Vector Double -> (U.Vector Double, U.Vector Double) #-}
+{-# SPECIALIZE kde_ :: Int -> Double -> Double -> V.Vector Double -> (V.Vector Double, V.Vector Double) #-}
+
-- $references
--
diff --git a/Statistics/Test/Internal.hs b/Statistics/Test/Internal.hs
index 4383a50..bef70d5 100644
--- a/Statistics/Test/Internal.hs
+++ b/Statistics/Test/Internal.hs
@@ -1,11 +1,15 @@
{-# LANGUAGE FlexibleContexts #-}
module Statistics.Test.Internal (
rank
+ , rankUnsorted
, splitByTags
) where
-import qualified Data.Vector.Generic as G
-
+import Data.Ord
+import Data.Vector.Generic ((!))
+import qualified Data.Vector.Generic as G
+import qualified Data.Vector.Generic.Mutable as M
+import Statistics.Function
-- Private data type for unfolding
@@ -16,7 +20,14 @@ data Rank v a = Rank {
, rankVec :: v a -- Remaining vector
}
--- | Calculate rank of sample. Sample should be already sorted.
+-- | Calculate rank of every element of sample. In case of ties ranks
+-- are averaged. Sample should be already sorted in ascending order.
+--
+-- >>> rank (==) (fromList [10,20,30::Int])
+-- > fromList [1.0,2.0,3.0]
+--
+-- >>> rank (==) (fromList [10,10,10,30::Int])
+-- > fromList [2.0,2.0,2.0,4.0]
rank :: (G.Vector v a, G.Vector v Double)
=> (a -> a -> Bool) -- ^ Equivalence relation
-> v a -- ^ Vector to rank
@@ -38,6 +49,35 @@ rank eq vec = G.unfoldr go (Rank 0 (-1) 1 vec)
go (Rank n val r v) = Just (val, Rank (n-1) val r v)
{-# INLINE rank #-}
+-- | Compute rank of every element of vector. Unlike rank it doesn't
+-- require sample to be sorted.
+rankUnsorted :: ( Ord a
+ , G.Vector v a
+ , G.Vector v Int
+ , G.Vector v Double
+ , G.Vector v (Int, a)
+ )
+ => v a
+ -> v Double
+rankUnsorted xs = G.create $ do
+ -- Put ranks into their original positions
+ -- NOTE: backpermute will do wrong thing
+ vec <- M.new n
+ for 0 n $ \i ->
+ M.unsafeWrite vec (index ! i) (ranks ! i)
+ return vec
+ where
+ n = G.length xs
+ -- Calculate ranks for sorted array
+ ranks = rank (==) sorted
+ -- Sort vector and retain original indices of elements
+ (index, sorted)
+ = G.unzip
+ $ sortBy (comparing snd)
+ $ indexed xs
+{-# INLINE rankUnsorted #-}
+
+
-- | Split tagged vector
splitByTags :: (G.Vector v a, G.Vector v (Bool,a)) => v (Bool,a) -> (v a, v a)
splitByTags vs = (G.map snd a, G.map snd b)
diff --git a/Statistics/Transform.hs b/Statistics/Transform.hs
index bb41ab3..7159bbf 100644
--- a/Statistics/Transform.hs
+++ b/Statistics/Transform.hs
@@ -34,23 +34,30 @@ import Control.Monad.ST (ST)
import Data.Bits (shiftL, shiftR)
import Data.Complex (Complex(..), conjugate, realPart)
import Numeric.SpecFunctions (log2)
-import qualified Data.Vector.Generic as G
+import qualified Data.Vector.Generic as G
import qualified Data.Vector.Generic.Mutable as M
-import qualified Data.Vector.Unboxed as U
-
+import qualified Data.Vector.Unboxed as U
+import qualified Data.Vector as V
type CD = Complex Double
-- | Discrete cosine transform (DCT-II).
-dct :: U.Vector Double -> U.Vector Double
+dct :: (G.Vector v CD, G.Vector v Double, G.Vector v Int) => v Double -> v Double
dct = dctWorker . G.map (:+0)
+{-# INLINABLE dct #-}
+{-# SPECIALIZE dct :: U.Vector Double -> U.Vector Double #-}
+{-# SPECIALIZE dct :: V.Vector Double -> V.Vector Double #-}
-- | Discrete cosine transform (DCT-II). Only real part of vector is
-- transformed, imaginary part is ignored.
-dct_ :: U.Vector CD -> U.Vector Double
+dct_ :: (G.Vector v CD, G.Vector v Double, G.Vector v Int) => v CD -> v Double
dct_ = dctWorker . G.map (\(i :+ _) -> i :+ 0)
+{-# INLINABLE dct_ #-}
+{-# SPECIALIZE dct_ :: U.Vector CD -> U.Vector Double #-}
+{-# SPECIALIZE dct_ :: V.Vector CD -> V.Vector Double #-}
-dctWorker :: U.Vector CD -> U.Vector Double
+dctWorker :: (G.Vector v CD, G.Vector v Double, G.Vector v Int) => v CD -> v Double
+{-# INLINE dctWorker #-}
dctWorker xs
-- length 1 is special cased because shuffle algorithms fail for it.
| G.length xs == 1 = G.map ((2*) . realPart) xs
@@ -70,15 +77,22 @@ dctWorker xs
-- 'dct' only up to scale parameter:
--
-- > (idct . dct) x = (* length x)
-idct :: U.Vector Double -> U.Vector Double
+idct :: (G.Vector v CD, G.Vector v Double) => v Double -> v Double
idct = idctWorker . G.map (:+0)
+{-# INLINABLE idct #-}
+{-# SPECIALIZE idct :: U.Vector Double -> U.Vector Double #-}
+{-# SPECIALIZE idct :: V.Vector Double -> V.Vector Double #-}
-- | Inverse discrete cosine transform (DCT-III). Only real part of vector is
-- transformed, imaginary part is ignored.
-idct_ :: U.Vector CD -> U.Vector Double
+idct_ :: (G.Vector v CD, G.Vector v Double) => v CD -> v Double
idct_ = idctWorker . G.map (\(i :+ _) -> i :+ 0)
+{-# INLINABLE idct_ #-}
+{-# SPECIALIZE idct_ :: U.Vector CD -> U.Vector Double #-}
+{-# SPECIALIZE idct_ :: V.Vector CD -> V.Vector Double #-}
-idctWorker :: U.Vector CD -> U.Vector Double
+idctWorker :: (G.Vector v CD, G.Vector v Double) => v CD -> v Double
+{-# INLINE idctWorker #-}
idctWorker xs
| vectorOK xs = G.generate len interleave
| otherwise = error "Statistics.Transform.dct: bad vector length"
@@ -93,21 +107,29 @@ idctWorker xs
len = G.length xs
+
-- | Inverse fast Fourier transform.
-ifft :: U.Vector CD -> U.Vector CD
+ifft :: G.Vector v CD => v CD -> v CD
ifft xs
| vectorOK xs = G.map ((/fi (G.length xs)) . conjugate) . fft . G.map conjugate $ xs
| otherwise = error "Statistics.Transform.ifft: bad vector length"
+{-# INLINABLE ifft #-}
+{-# SPECIALIZE ifft :: U.Vector CD -> U.Vector CD #-}
+{-# SPECIALIZE ifft :: V.Vector CD -> V.Vector CD #-}
-- | Radix-2 decimation-in-time fast Fourier transform.
-fft :: U.Vector CD -> U.Vector CD
+fft :: G.Vector v CD => v CD -> v CD
fft v | vectorOK v = G.create $ do mv <- G.thaw v
mfft mv
return mv
| otherwise = error "Statistics.Transform.fft: bad vector length"
+{-# INLINABLE fft #-}
+{-# SPECIALIZE fft :: U.Vector CD -> U.Vector CD #-}
+{-# SPECIALIZE fft :: V.Vector CD -> V.Vector CD #-}
-- Vector length must be power of two. It's not checked
mfft :: (M.MVector v CD) => v s CD -> ST s ()
+{-# INLINE mfft #-}
mfft vec = bitReverse 0 0
where
bitReverse i j | i == len-1 = stage 0 1
@@ -138,13 +160,17 @@ mfft vec = bitReverse 0 0
len = M.length vec
m = log2 len
+
+----------------------------------------------------------------
+-- Helpers
+----------------------------------------------------------------
+
fi :: Int -> CD
fi = fromIntegral
halve :: Int -> Int
halve = (`shiftR` 1)
-
-vectorOK :: U.Unbox a => U.Vector a -> Bool
+vectorOK :: G.Vector v a => v a -> Bool
{-# INLINE vectorOK #-}
vectorOK v = (1 `shiftL` log2 n) == n where n = G.length v
diff --git a/benchmark/bench.hs b/benchmark/bench.hs
index d1b24d4..6345980 100644
--- a/benchmark/bench.hs
+++ b/benchmark/bench.hs
@@ -3,6 +3,7 @@ import Criterion.Main
import Data.Complex
import Statistics.Sample
import Statistics.Transform
+import Statistics.Correlation.Pearson
import System.Random.MWC
import qualified Data.Vector.Unboxed as U
@@ -35,6 +36,10 @@ main =
, bench "variance" $ nf (\x -> variance x) sample
, bench "varianceUnbiased" $ nf (\x -> varianceUnbiased x) sample
, bench "varianceWeighted" $ nf (\x -> varianceWeighted x) sampleW
+ -- Correlation
+ , bench "pearson" $ nf (\x -> pearson (U.reverse sample) x) sample
+ , bench "pearson'" $ nf (\x -> pearson' (U.reverse sample) x) sample
+ , bench "pearsonFast" $ nf (\x -> pearsonFast (U.reverse sample) x) sample
-- Other
, bench "stdDev" $ nf (\x -> stdDev x) sample
, bench "skewness" $ nf (\x -> skewness x) sample
diff --git a/examples/kde/KDE.hs b/examples/kde/KDE.hs
index a9640b9..0cc677b 100644
--- a/examples/kde/KDE.hs
+++ b/examples/kde/KDE.hs
@@ -4,11 +4,12 @@ import Control.Applicative ((<$>))
import Statistics.Sample.KernelDensity (kde)
import Text.Hastache (MuType(..), defaultConfig, hastacheFile)
import Text.Hastache.Context (mkStrContext)
-import qualified Data.Attoparsec as B
-import qualified Data.Attoparsec.Char8 as A
+import qualified Data.Attoparsec.ByteString as B
+import qualified Data.Attoparsec.ByteString.Char8 as A
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
import qualified Data.Vector.Unboxed as U
+import qualified Data.Text.Lazy.IO as TL
csv = do
B.takeTill A.isEndOfLine
@@ -20,4 +21,4 @@ main = do
let xs = map (\(a,b) -> [a,b]) . U.toList . uncurry U.zip . kde 64 $ waits
context "data" = MuVariable . show $ xs
s <- hastacheFile defaultConfig "kde.tpl" (mkStrContext context)
- L.writeFile "kde.html" s
+ TL.writeFile "kde.html" s
diff --git a/statistics.cabal b/statistics.cabal
index 6f12ab7..eda3465 100644
--- a/statistics.cabal
+++ b/statistics.cabal
@@ -1,5 +1,5 @@
name: statistics
-version: 0.13.2.3
+version: 0.13.3.0
synopsis: A library of statistical types, data, and functions
description:
This library provides a number of common functions and types useful
@@ -49,6 +49,7 @@ library
exposed-modules:
Statistics.Autocorrelation
Statistics.Constants
+ Statistics.Correlation
Statistics.Correlation.Kendall
Statistics.Distribution
Statistics.Distribution.Beta
@@ -60,6 +61,7 @@ library
Statistics.Distribution.Gamma
Statistics.Distribution.Geometric
Statistics.Distribution.Hypergeometric
+ Statistics.Distribution.Laplace
Statistics.Distribution.Normal
Statistics.Distribution.Poisson
Statistics.Distribution.StudentT
@@ -134,7 +136,7 @@ test-suite tests
Tests.Transform
ghc-options:
- -Wall -threaded -rtsopts
+ -Wall -threaded -rtsopts -fsimpl-tick-factor=500
build-depends:
HUnit,
diff --git a/tests/Tests/Correlation.hs b/tests/Tests/Correlation.hs
index 2f3b3b7..a473170 100644
--- a/tests/Tests/Correlation.hs
+++ b/tests/Tests/Correlation.hs
@@ -3,35 +3,153 @@
module Tests.Correlation
( tests ) where
+import Control.Arrow (Arrow(..))
+import qualified Data.Vector as V
+import Statistics.Matrix hiding (map)
+import Statistics.Correlation
+import Statistics.Correlation.Kendall
+import Test.QuickCheck ((==>),Property,counterexample)
import Test.Framework
import Test.Framework.Providers.QuickCheck2
import Test.Framework.Providers.HUnit
-import Test.HUnit (Assertion, (@=?))
-import qualified Data.Vector as V
-import Statistics.Correlation.Kendall
+import Test.HUnit (Assertion, (@=?), assertBool)
+
+import Tests.ApproxEq
+
+----------------------------------------------------------------
+-- Tests list
+----------------------------------------------------------------
tests :: Test
tests = testGroup "Correlation"
- [ testProperty "Kendall test -- general" testKendall
- , testCase "Kendall test -- special cases" testKendallSpecial
+ [ testProperty "Pearson correlation" testPearson
+ , testProperty "Spearman correlation is scale invariant" testSpearmanScale
+ , testProperty "Spearman correlation, nonlinear" testSpearmanNonlinear
+ , testProperty "Kendall test -- general" testKendall
+ , testCase "Kendall test -- special cases" testKendallSpecial
]
+
+----------------------------------------------------------------
+-- Pearson's correlation
+----------------------------------------------------------------
+
+testPearson :: [(Double,Double)] -> Property
+testPearson sample
+ = (length sample > 1) ==> (exact ~= fast)
+ where
+ (~=) = eql 1e-12
+ exact = exactPearson $ map (realToFrac *** realToFrac) sample
+ fast = pearson $ V.fromList sample
+
+exactPearson :: [(Rational,Rational)] -> Double
+exactPearson sample
+ = realToFrac cov / sqrt (realToFrac (varX * varY))
+ where
+ (xs,ys) = unzip sample
+ n = fromIntegral $ length sample
+ -- Mean
+ muX = sum xs / n
+ muY = sum ys / n
+ -- Mean of squares
+ muX2 = sum (map (\x->x*x) xs) / n
+ muY2 = sum (map (\x->x*x) ys) / n
+ -- Covariance
+ cov = sum (zipWith (*) [x - muX | x<-xs] [y - muY | y<-ys]) / n
+ varX = muX2 - muX*muX
+ varY = muY2 - muY*muY
+
+----------------------------------------------------------------
+-- Spearman's correlation
+----------------------------------------------------------------
+
+-- Test that Spearman correlation is scale invariant
+testSpearmanScale :: [(Double,Double)] -> Double -> Property
+testSpearmanScale xs a
+ = and [ length xs > 1 -- Enough to calculate underflow
+ , a /= 0
+ , not (isNaN c1)
+ , not (isNaN c2)
+ , not (isNaN c3)
+ , not (isNaN c4)
+ ]
+ ==> ( counterexample (show xs2)
+ $ counterexample (show xs3)
+ $ counterexample (show xs4)
+ $ counterexample (show (c1,c2,c3,c4))
+ $ and [ c1 == c4
+ , c1 == signum a * c2
+ , c1 == signum a * c3
+ ]
+ )
+ where
+ xs2 = map ((*a) *** id ) xs
+ xs3 = map (id *** (*a)) xs
+ xs4 = map ((*a) *** (*a)) xs
+ c1 = spearman $ V.fromList xs
+ c2 = spearman $ V.fromList xs2
+ c3 = spearman $ V.fromList xs3
+ c4 = spearman $ V.fromList xs4
+
+-- Test that Spearman correlation allows to transform sample with
+testSpearmanNonlinear :: [(Double,Double)] -> Property
+testSpearmanNonlinear sample0
+ = and [ length sample0 > 1
+ , not (isNaN c1)
+ , not (isNaN c2)
+ , not (isNaN c3)
+ , not (isNaN c4)
+ ]
+ ==> ( counterexample (show sample0)
+ $ counterexample (show sample1)
+ $ counterexample (show sample2)
+ $ counterexample (show sample3)
+ $ counterexample (show sample4)
+ $ counterexample (show (c1,c2,c3,c4))
+ $ and [ c1 == c2
+ , c1 == c3
+ , c1 == c4
+ ]
+ )
+ where
+ -- We need to stretch sample into [-10 .. 10] range to avoid
+ -- problems with under/overflows etc.
+ stretch xs
+ | a == b = xs
+ | otherwise = [ (x - a - 10) * 20 / (a - b) | x <- xs ]
+ where
+ a = minimum xs
+ b = maximum xs
+ sample1 = uncurry zip $ (stretch *** stretch) $ unzip sample0
+ sample2 = map (exp *** id ) sample1
+ sample3 = map (id *** exp) sample1
+ sample4 = map (exp *** exp) sample1
+ c1 = spearman $ V.fromList sample1
+ c2 = spearman $ V.fromList sample2
+ c3 = spearman $ V.fromList sample3
+ c4 = spearman $ V.fromList sample4
+
+
+----------------------------------------------------------------
+-- Kendall's correlation
+----------------------------------------------------------------
+
testKendall :: [(Double, Double)] -> Bool
testKendall xy | isNaN r1 = isNaN r2
| otherwise = r1 == r2
where
- r1 = kendallBruteForce xy
+ r1 = kendallBruteForce xy
r2 = kendall $ V.fromList xy
testKendallSpecial :: Assertion
-testKendallSpecial = ys @=? map (kendall.V.fromList) xs
- where
- (xs, ys) = unzip testData
- testData :: [([(Double, Double)], Double)]
- testData = [ ( [(1,1), (2,2), (3,1), (1,5), (2,2)], -0.375 )
- , ( [(1,3), (1,3), (1,3), (3,2), (3,5)], 0)
+testKendallSpecial = vs @=? map (\(xs, ys) -> kendall $ V.fromList $ zip xs ys) d
+ where
+ (d, vs) = unzip testData
+ testData :: [(([Double], [Double]), Double)]
+ testData = [ (([1, 2, 3, 1, 2], [1, 2, 1, 5, 2]), -0.375)
+ , (([1, 1, 1, 3, 3], [3, 3, 3, 2, 5]), 0)
]
-
+
kendallBruteForce :: [(Double, Double)] -> Double
kendallBruteForce xy = (n_c - n_d) / sqrt ((n_0 - n_1) * (n_0 - n_2))
diff --git a/tests/Tests/Distribution.hs b/tests/Tests/Distribution.hs
index b2ff38d..836513c 100644
--- a/tests/Tests/Distribution.hs
+++ b/tests/Tests/Distribution.hs
@@ -17,6 +17,7 @@ import Statistics.Distribution.FDistribution (FDistribution, fDistribution)
import Statistics.Distribution.Gamma (GammaDistribution, gammaDistr)
import Statistics.Distribution.Geometric
import Statistics.Distribution.Hypergeometric
+import Statistics.Distribution.Laplace (LaplaceDistribution, laplace)
import Statistics.Distribution.Normal (NormalDistribution, normalDistr)
import Statistics.Distribution.Poisson (PoissonDistribution, poisson)
import Statistics.Distribution.StudentT
@@ -42,6 +43,7 @@ tests = testGroup "Tests for all distributions"
, contDistrTests (T :: T ChiSquared )
, contDistrTests (T :: T ExponentialDistribution )
, contDistrTests (T :: T GammaDistribution )
+ , contDistrTests (T :: T LaplaceDistribution )
, contDistrTests (T :: T NormalDistribution )
, contDistrTests (T :: T UniformDistribution )
, contDistrTests (T :: T StudentT )
@@ -252,6 +254,8 @@ instance QC.Arbitrary BinomialDistribution where
arbitrary = binomial <$> QC.choose (1,100) <*> QC.choose (0,1)
instance QC.Arbitrary ExponentialDistribution where
arbitrary = exponential <$> QC.choose (0,100)
+instance QC.Arbitrary LaplaceDistribution where
+ arbitrary = laplace <$> QC.choose (-10,10) <*> QC.choose (0, 2)
instance QC.Arbitrary GammaDistribution where
arbitrary = gammaDistr <$> QC.choose (0.1,10) <*> QC.choose (0.1,10)
instance QC.Arbitrary BetaDistribution where
diff --git a/tests/Tests/Transform.hs b/tests/Tests/Transform.hs
index 70b598a..40afca5 100644
--- a/tests/Tests/Transform.hs
+++ b/tests/Tests/Transform.hs
@@ -60,7 +60,7 @@ tests = testGroup "fft" [
-- vector should be replicated in every real component of the result,
-- and all the imaginary components should be zero.
t_impulse :: Double -> Positive Int -> Bool
-t_impulse k (Positive m) = G.all (c_near i) (fft v)
+t_impulse k (Positive m) = U.all (c_near i) (fft v)
where v = i `G.cons` G.replicate (n-1) 0
i = k :+ 0
n = 1 `shiftL` (m .&. 6)
@@ -69,7 +69,7 @@ t_impulse k (Positive m) = G.all (c_near i) (fft v)
-- otherwise zero vector, the sum-of-squares of each component of the
-- result should equal the square of the impulse.
t_impulse_offset :: Double -> Positive Int -> Positive Int -> Bool
-t_impulse_offset k (Positive x) (Positive m) = G.all ok (fft v)
+t_impulse_offset k (Positive x) (Positive m) = U.all ok (fft v)
where v = G.concat [G.replicate xn 0, G.singleton i, G.replicate (n-xn-1) 0]
ok (re :+ im) = within ulps (re*re + im*im) (k*k)
i = k :+ 0