summaryrefslogtreecommitdiff
path: root/ngrams.hs
blob: 64bee09cc73234241c23c7a9a9a2bf491a48f59b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import System.Console.HCL 
import Data.Char (isAlpha, isSpace)
import Text.Regex (matchRegex, mkRegex)
import Network.Browser (browse, setErrHandler, setOutHandler,
  request, defaultGETRequest)
import Network.HTTP (urlEncode, rspBody)
import Network.URI (parseURI)
import Data.List (intersperse, unfoldr, isSuffixOf, isPrefixOf)
import Numeric (readDec)
import Data.Foldable (foldl')
import Control.Monad (liftM)
import Text.Printf (printf)

main = execReq $
  do
    reqIO $ putStrLn "Calculate n-grams using Google. Quit by hitting enter at any prompt."
    reqForever $
      do
        phrase <- reqLift (words . map clean) $ prompt "Please enter a starting phrase: " reqResp
        alternatives <- reqLift phrasesAndWords $ prompt "Enter alternative words separated by spaces. Phrases can enclosed in quotes (e.g., \"...\"): " reqResp
        count <- reqUntil (return . (> 1)) $ prompt "What kind of n-gram would you like to analyze (> 1)? " reqInt
        ngs <- reqIO $ getProbabilities (ngrams count phrase alternatives)
        reqIO $ mapM_ (putStrLn . formatNGram) ngs
        reqIO $ mapM_ (\ng -> putStrLn ("Probability of phrase '" ++
          mkPhrase ng ++ "': " ++ printf "%.6g" (totalProbability ng * 100) ++ "%")) (recoverPhrases (length alternatives) ngs)
  where
    clean c | isAlpha c = c
            | otherwise = ' '
    -- Recover original phrase from ngram list.
    mkPhrase :: [NGram] -> String
    mkPhrase p = unwords $ map (head . ngPhrase) (init p) ++ (ngPhrase . last) p
    -- Converts a string to a list of words, but keeps words in quotes
    -- together. Nested quotes are not supported.
    phrasesAndWords :: String -> [String]
    phrasesAndWords =
      let startsWith w c = [c] `isPrefixOf` w
          endsWith w c = [c] `isSuffixOf` w
          buildPhrases [] = Nothing
          -- A word starts with a '"', now look for ending '"'
          buildPhrases (w:ws) 
            | w `startsWith` '"' =
                let (p, rest) = break (`endsWith` '"') (w:ws)
                in
                  case () of
                    () | null p -> Just ((init . tail) w, ws) -- single word in quotes
                       | null rest -> Just ((tail . unwords) p, rest) -- No ending quote
                       -- Some amount of words in the phrase and some not
                       | otherwise -> Just ((init . tail) (unwords (p ++ [head rest])), tail rest)
            | otherwise = Just (w, ws)
      in
        -- Use of words makes sure any valid quotes appear at beginning or
        -- end of a word.
        unfoldr buildPhrases . words 

-- ^ Stores an n-gram. That is, a structure that tells us
-- the probability of the last word in ngPhrase occurrring, given
-- the occurences of the previous words in the phrase and the
-- occurrences of the entire phrase.
data NGram = NGram { ngPhrase :: [String] -- The n-gram itself. The last word is 
  , numeratorCnt :: Integer  -- number of hits for phrase w\/o last word.
  , denomCnt :: Integer -- ^ number of hits for phrase w\/o last word.
  , probability :: Double -- ^ Probability that the last word in ngPhrase should be there, based on numeratorCnt and denomCnt.
}
  deriving Show

-- ^ Generalized ngrams. Takes a phrase and a list of possible
-- ending words. Returns all n-grams for them. The behavior
-- is undefined if the length requested is longer than
-- the phrase given.
ngrams :: Int -> [String] -> [String] -> [[String]]
ngrams nsize phrase suffixes  =
    let phraseGrams = gatherNGrams phrase
        altGrams = gatherAlts (drop (length phrase - (nsize - 1)) phrase)
    in
      phraseGrams ++ altGrams
  where
    -- Gathers the strings given into groups of the
    -- the length given. 
    gatherNGrams :: [String] -> [[String]]
    gatherNGrams = unfoldr group
      where
        group p 
          | null $ drop (max 0 (nsize - 1)) p = Nothing
          | otherwise = Just (take nsize p , drop 1 p)
    -- Takes a list of words and a list of alternate
    -- suffixes. Returns a list of lists, where
    -- each inner list is the original word list plus one
    -- of the alternate endings.
    gatherAlts :: [String] -> [[String]]
    gatherAlts prefix = zipWith addSuffix (repeat prefix) suffixes
      where
        addSuffix a b = a ++ [b]

-- ^ Turn an n-gram into a nice string.
formatNGram :: NGram -> String
formatNGram (NGram ph n d p) = unwords ph ++ ": " ++ show n ++
  " / " ++ show d ++ " = " ++ printf "%.6g" (p * 100) ++ "%"

-- ^ Multiplies the probabilities of all n-grams together. Indicates the
-- probability of the entire phrase. 
totalProbability :: [NGram] -> Double
totalProbability ngs = foldl' (*) 1 (map probability ngs)

-- ^ Recovers all possible phrases from the NGrams given.
-- The first argument is the number of alternative endings that
-- were given originally.
recoverPhrases :: Int -> [NGram] -> [[NGram]]
recoverPhrases numAlts ngs =
  let initial = take (ngLength - numAlts) ngs
      rest = drop (ngLength - numAlts) ngs
      ngLength = length ngs
  in
    map (\r -> initial ++ [r]) rest

-- ^ Given a list of possible phrases (broken into n-sized chunks),
-- calculate the ngram associated with each chunk.
getProbabilities :: [[String]] -> IO [NGram]
getProbabilities phrase = mapM getProbability phrase
  where
    getProbability phrase =
      do
        denomCount <- liftM (max 1) $ resultCount (init phrase)
        numCount <- liftM (max 1) $ resultCount phrase
        return $ NGram phrase
          numCount denomCount (fromIntegral numCount / fromIntegral denomCount)
    
-- ^ Determine how many times the phrase given appears on the web, according
-- to Google's estimate.
resultCount :: [String] -> IO Integer
resultCount terms = 
  let makeQuery terms = "http://www.google.com/search?q=" ++
        (urlEncode $ unwords (map ("+" ++) terms))
  in case parseURI (makeQuery terms) of
      Just uri ->
        do
          browse $
            do
              setErrHandler (const (return()))
              setOutHandler (const (return()))
              (_, result) <- request $ defaultGETRequest uri
              case matchRegex (mkRegex "of about <b>([[:digit:],]*)</b>") (rspBody result) of
                   Nothing -> return 0
                   Just strs -> return $ fst $ head $ readDec (filter (/=',') (head strs))
      _ -> error $ "Could not make uri from the terms " ++ show terms