summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorteh <>2015-03-16 13:13:00 (GMT)
committerhdiff <hdiff@hdiff.luite.com>2015-03-16 13:13:00 (GMT)
commita72ffe74e96a954da4b605c5606abb51589a93d6 (patch)
treedbc6d72696d83befe8795291ac5c095f02f9254f /src
version 0.1.0.0HEAD0.1.0.0master
Diffstat (limited to 'src')
-rw-r--r--src/Numeric/StreamingHistogram.hs100
-rw-r--r--src/Numeric/StreamingHistogram/Internal.hs23
2 files changed, 123 insertions, 0 deletions
diff --git a/src/Numeric/StreamingHistogram.hs b/src/Numeric/StreamingHistogram.hs
new file mode 100644
index 0000000..585f832
--- /dev/null
+++ b/src/Numeric/StreamingHistogram.hs
@@ -0,0 +1,100 @@
+{-# LANGUAGE GADTs #-}
+-----------------------------------------------------------------------------
+-- |
+-- Module : Numeric.StreamingHistogram
+-- Maintainer : Tom Hunger
+-- Stability : experimental
+-- Portability : portable
+--
+-- A very inefficient implementation of lossy online histograms based
+-- on the following paper:
+--
+-- * A Streaming Parallel Decision Tree Algorithm by Ben-Haim and
+-- Tom-Tov:
+-- <http://www.jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf>
+--
+-- __NB__ This library is very experimental and inefficient. Numerical
+-- stability has not been tested (yet).
+
+module Numeric.StreamingHistogram
+ ( empty
+ , update
+ , Hist(..)
+ , merge
+ , sum
+ ) where
+
+import qualified Data.List as L
+import qualified Data.Map as M
+import Numeric.StreamingHistogram.Internal (shrink)
+import Prelude hiding (sum, pi)
+
+import Debug.Trace
+
+data Hist where
+ Hist ::
+ { nBins :: Int
+ , histData :: M.Map Double Int} -> Hist
+ deriving (Show, Eq)
+
+-- | Creates an empty histogram to be updated with 'update'
+empty :: Int -- ^ Maximum number of values to keep
+ -> Hist -- ^ The empty histogram
+empty nBins' = Hist { nBins = nBins', histData = M.empty }
+
+
+-- | Adds a single 'value' to the histogram.
+update :: Double -- ^ Value to add
+ -> Hist -- ^ Lossy histogram to update
+ -> Hist -- ^ Histogram with the value incorporated
+update value h =
+ let updated = M.insertWith (+) value 1 (histData h)
+ data' | M.size updated > nBins h = (M.fromList . shrink . M.assocs) updated
+ | otherwise = updated
+ in
+ h { histData = data' }
+
+-- | Merges two histograms. Returns Nothing if they don't have the same
+-- number of values. TODO: Use more efficient algorithm.
+merge :: Hist -- ^ Histogram a
+ -> Hist -- ^ Histogram b
+ -> Maybe Hist -- ^ Returns Nothing if a and b disagree on how
+ -- many values to keep.
+merge ha hb =
+ let union = M.unionWith (+) (histData ha) (histData hb)
+ data' u
+ | length u > nBins ha = data' (shrink u)
+ | otherwise = u
+ in
+ case (nBins ha) == (nBins hb) of
+ True -> Just (ha { histData = M.fromList (data' (M.assocs union)) })
+ False -> Nothing
+
+-- | Estimates how many points lie in the interval [-infinity, b].
+sum :: Double -- ^ The cutoff value b
+ -> Hist -- ^ A histogram
+ -> Double -- ^ Estimated number of points
+sum b hist =
+ result
+ where
+ -- pad the data on both sides to make sure we always have a (p,
+ -- p+1) pair even when (b < min histData) or (b > max histData).
+ -- TODO (1e300 is probably not a good boundary value..).
+ sorted = M.assocs (histData hist)
+ result = discover 0.0 ((-1e300, 0) : sorted ++ [(1e300, 0)])
+
+ -- Find split point for b and keep track of s so far
+ discover :: Double -> [(Double, Int)] -> Double
+ discover sSoFar ((pi, mi):(pj, mj):xs)
+ | trace (show (sSoFar, b, (pi, mi), (pj, mj))) False = undefined
+ | pi <= b && b < pj = (sTrapezoid pi mi pj mj) + sSoFar + (fromIntegral mi) / 2.0
+ | otherwise = discover (sSoFar + fromIntegral mi) ((pj, mj):xs)
+
+ -- Initial value for S (trapezoid rule)
+ sTrapezoid pi mi pj mj =
+ let
+ mb = (fromIntegral mi) + (fromIntegral (mj - mi)) / (pj - pi) * (b - pi)
+ in
+ ((fromIntegral mi) + mb) / 2.0 * (b - pi) / (pj - pi)
+
+-- TODO implement 'uniform' but I don't need it just now
diff --git a/src/Numeric/StreamingHistogram/Internal.hs b/src/Numeric/StreamingHistogram/Internal.hs
new file mode 100644
index 0000000..e19a40b
--- /dev/null
+++ b/src/Numeric/StreamingHistogram/Internal.hs
@@ -0,0 +1,23 @@
+module Numeric.StreamingHistogram.Internal
+ ( shrink
+ ) where
+
+import qualified Data.List as L
+
+-- Internal: Lossy compression of histogram by finding the pair of
+-- values that would introduce minimum error when merged and then do a
+-- weighted merge.
+shrink :: [(Double, Int)] -> [(Double, Int)]
+shrink m
+ | length m == 0 = m
+ | otherwise = shrunk'
+ where
+ deltas = L.zipWith (\(x, _) (y, _)-> y - x) (init m) (L.tail m)
+ minDelta = L.minimum deltas
+ shrunk' = shrunk (init m) (L.tail m)
+
+ shrunk :: [(Double, Int)] -> [(Double, Int)] -> [(Double, Int)]
+ shrunk [] [] = []
+ shrunk ((lq, lk):ls) ((rq, rk):rs)
+ | rq - lq == minDelta = ((lq * (fromIntegral lk) + rq * (fromIntegral rk)) / (fromIntegral (lk + rk)), lk + rk) : rs
+ | otherwise = (lq, lk) : (shrunk ls rs)