summaryrefslogtreecommitdiff
path: root/src/Numeric/StreamingHistogram.hs
blob: 585f8324ee76f30267233915c67372b26bd6feb8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
{-# LANGUAGE GADTs #-}
-----------------------------------------------------------------------------
-- |
-- Module      :  Numeric.StreamingHistogram
-- Maintainer  :  Tom Hunger
-- Stability   :  experimental
-- Portability :  portable
--
-- A very inefficient implementation of lossy online histograms based
-- on the following paper:
--
-- * A Streaming Parallel Decision Tree Algorithm by Ben-Haim and
--   Tom-Tov:
--   <http://www.jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf>
--
-- __NB__ This library is very experimental and inefficient. Numerical
-- stability has not been tested (yet).

module Numeric.StreamingHistogram
       ( empty
       , update
       , Hist(..)
       , merge
       , sum
       ) where

import qualified Data.List as L
import qualified Data.Map as M
import           Numeric.StreamingHistogram.Internal (shrink)
import           Prelude hiding (sum, pi)

import Debug.Trace

data Hist where
    Hist ::
        { nBins :: Int
        , histData :: M.Map Double Int} -> Hist
    deriving (Show, Eq)

-- | Creates an empty histogram to be updated with 'update'
empty :: Int -- ^ Maximum number of values to keep
      -> Hist -- ^ The empty histogram
empty nBins' = Hist { nBins = nBins', histData = M.empty }


-- | Adds a single 'value' to the histogram.
update :: Double -- ^ Value to add
       -> Hist -- ^ Lossy histogram to update
       -> Hist -- ^ Histogram with the value incorporated
update value h =
    let updated = M.insertWith (+) value 1 (histData h)
        data' | M.size updated > nBins h = (M.fromList . shrink . M.assocs) updated
              | otherwise = updated
    in
       h { histData = data' }

-- | Merges two histograms. Returns Nothing if they don't have the same
-- number of values. TODO: Use more efficient algorithm.
merge :: Hist -- ^ Histogram a
         -> Hist -- ^ Histogram b
         -> Maybe Hist -- ^ Returns Nothing if a and b disagree on how
                       -- many values to keep.
merge ha hb =
    let union = M.unionWith (+) (histData ha) (histData hb)
        data' u
            | length u > nBins ha = data' (shrink u)
            | otherwise = u
    in
         case (nBins ha) == (nBins hb) of
            True -> Just (ha { histData = M.fromList (data' (M.assocs union)) })
            False -> Nothing

-- | Estimates how many points lie in the interval [-infinity, b].
sum :: Double -- ^ The cutoff value b
       -> Hist -- ^ A histogram
       -> Double -- ^ Estimated number of points
sum b hist =
    result
  where
    -- pad the data on both sides to make sure we always have a (p,
    -- p+1) pair even when (b < min histData) or (b > max histData).
    -- TODO (1e300 is probably not a good boundary value..).
    sorted = M.assocs (histData hist)
    result = discover 0.0 ((-1e300, 0) : sorted ++ [(1e300, 0)])

    -- Find split point for b and keep track of s so far
    discover :: Double -> [(Double, Int)] -> Double
    discover sSoFar ((pi, mi):(pj, mj):xs)
      | trace (show (sSoFar, b, (pi, mi), (pj, mj))) False = undefined
      | pi <= b && b < pj = (sTrapezoid pi mi pj mj) + sSoFar + (fromIntegral mi) / 2.0
      | otherwise = discover (sSoFar + fromIntegral mi) ((pj, mj):xs)

    -- Initial value for S (trapezoid rule)
    sTrapezoid pi mi pj mj =
        let
            mb = (fromIntegral mi) + (fromIntegral (mj - mi)) / (pj - pi) * (b - pi)
        in
            ((fromIntegral mi) + mb) / 2.0 * (b - pi) / (pj - pi)

-- TODO implement 'uniform' but I don't need it just now