summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJustinBailey <>2007-10-25 23:04:16 (GMT)
committerLuite Stegeman <luite@luite.com>2007-10-25 23:04:16 (GMT)
commitba23b6a2296d4a534324fea84fbd81b426b1145a (patch)
tree0add4924fa80fd594e0ff7cbe798f8399de87be5
version 1.01.0
-rw-r--r--NGrams.cabal20
-rw-r--r--Setup.hs2
-rw-r--r--ngrams.hs144
3 files changed, 166 insertions, 0 deletions
diff --git a/NGrams.cabal b/NGrams.cabal
new file mode 100644
index 0000000..69352bc
--- /dev/null
+++ b/NGrams.cabal
@@ -0,0 +1,20 @@
+Name: NGrams
+Version: 1.0
+License: BSD3
+Author: Justin Bailey
+Homepage:
+Maintainer: jgbailey _ gmail _ com
+Category: Tool
+Build-Depends: base, HTTP, HCL, network, regex-compat
+Synopsis: Simple application for calculating n-grams using Google.
+Description:
+ An n-gram is a natural language processing technique for determining,
+ given a list of words and a phrase, the probability that each of the
+ words given will follow in the phrase. The executable provided asks for
+ a phrase and a list of alternate ending words. It also asks for the type
+ of n-gram to create (e.g., bi-gram, tri-gram). The probability for each
+ possible phrase is then calculated using Google search results.
+
+Executable: ngrams
+Main-Is: ngrams.hs
+Hs-Source-Dirs: .
diff --git a/Setup.hs b/Setup.hs
new file mode 100644
index 0000000..c2d38c4
--- /dev/null
+++ b/Setup.hs
@@ -0,0 +1,2 @@
+import Distribution.Simple
+main = defaultMain \ No newline at end of file
diff --git a/ngrams.hs b/ngrams.hs
new file mode 100644
index 0000000..b9b9351
--- /dev/null
+++ b/ngrams.hs
@@ -0,0 +1,144 @@
+import HCL
+import Data.Char (isAlpha, isSpace)
+import Text.Regex (matchRegex, mkRegex)
+import Network.Browser (browse, setErrHandler, setOutHandler,
+ request, defaultGETRequest)
+import Network.HTTP (urlEncode, rspBody)
+import Network.URI (parseURI)
+import Data.List (intersperse, unfoldr, isSuffixOf, isPrefixOf)
+import Numeric (readDec)
+import Data.Foldable (foldl')
+import Control.Monad (liftM)
+import Text.Printf (printf)
+
+main = execReq $
+ do
+ reqIO $ putStrLn "Calculate n-grams using Google. Quit by hitting enter at any prompt."
+ reqForever $
+ do
+ phrase <- reqLift (words . map clean) $ prompt "Please enter a starting phrase: " reqResp
+ alternatives <- reqLift phrasesAndWords $ prompt "Enter alternative words separated by spaces. Phrases can enclosed in quotes (e.g., \"...\"): " reqResp
+ count <- reqUntil (return . (> 1)) $ prompt "What kind of n-gram would you like to analyze (> 1)? " reqInt
+ ngs <- reqIO $ getProbabilities (ngrams count phrase alternatives)
+ reqIO $ mapM_ (putStrLn . formatNGram) ngs
+ reqIO $ mapM_ (\ng -> putStrLn ("Probability of phrase '" ++
+ mkPhrase ng ++ "': " ++ printf "%.6g" (totalProbability ng * 100) ++ "%")) (recoverPhrases (length alternatives) ngs)
+ where
+ clean c | isAlpha c = c
+ | otherwise = ' '
+ -- Recover original phrase from ngram list.
+ mkPhrase :: [NGram] -> String
+ mkPhrase p = unwords $ map (head . ngPhrase) (init p) ++ (ngPhrase . last) p
+ -- Converts a string to a list of words, but keeps words in quotes
+ -- together. Nested quotes are not supported.
+ phrasesAndWords :: String -> [String]
+ phrasesAndWords =
+ let startsWith w c = [c] `isPrefixOf` w
+ endsWith w c = [c] `isSuffixOf` w
+ buildPhrases [] = Nothing
+ -- A word starts with a '"', now look for ending '"'
+ buildPhrases (w:ws)
+ | w `startsWith` '"' =
+ let (p, rest) = break (`endsWith` '"') (w:ws)
+ in
+ case () of
+ () | null p -> Just ((init . tail) w, ws) -- single word in quotes
+ | null rest -> Just ((tail . unwords) p, rest) -- No ending quote
+ -- Some amount of words in the phrase and some not
+ | otherwise -> Just ((init . tail) (unwords (p ++ [head rest])), tail rest)
+ | otherwise = Just (w, ws)
+ in
+ -- Use of words makes sure any valid quotes appear at beginning or
+ -- end of a word.
+ unfoldr buildPhrases . words
+
+-- ^ Stores an n-gram. That is, a structure that tells us
+-- the probability of the last word in ngPhrase occurrring, given
+-- the occurences of the previous words in the phrase and the
+-- occurrences of the entire phrase.
+data NGram = NGram { ngPhrase :: [String] -- The n-gram itself. The last word is
+ , numeratorCnt :: Integer -- number of hits for phrase w\/o last word.
+ , denomCnt :: Integer -- ^ number of hits for phrase w\/o last word.
+ , probability :: Double -- ^ Probability that the last word in ngPhrase should be there, based on numeratorCnt and denomCnt.
+}
+ deriving Show
+
+-- ^ Generalized ngrams. Takes a phrase and a list of possible
+-- ending words. Returns all n-grams for them. The behavior
+-- is undefined if the length requested is longer than
+-- the phrase given.
+ngrams :: Int -> [String] -> [String] -> [[String]]
+ngrams nsize phrase suffixes =
+ let phraseGrams = gatherNGrams phrase
+ altGrams = gatherAlts (drop (length phrase - (nsize - 1)) phrase)
+ in
+ phraseGrams ++ altGrams
+ where
+ -- Gathers the strings given into groups of the
+ -- the length given.
+ gatherNGrams :: [String] -> [[String]]
+ gatherNGrams = unfoldr group
+ where
+ group p
+ | null $ drop (max 0 (nsize - 1)) p = Nothing
+ | otherwise = Just (take nsize p , drop 1 p)
+ -- Takes a list of words and a list of alternate
+ -- suffixes. Returns a list of lists, where
+ -- each inner list is the original word list plus one
+ -- of the alternate endings.
+ gatherAlts :: [String] -> [[String]]
+ gatherAlts prefix = zipWith addSuffix (repeat prefix) suffixes
+ where
+ addSuffix a b = a ++ [b]
+
+-- ^ Turn an n-gram into a nice string.
+formatNGram :: NGram -> String
+formatNGram (NGram ph n d p) = unwords ph ++ ": " ++ show n ++
+ " / " ++ show d ++ " = " ++ printf "%.6g" (p * 100) ++ "%"
+
+-- ^ Multiplies the probabilities of all n-grams together. Indicates the
+-- probability of the entire phrase.
+totalProbability :: [NGram] -> Double
+totalProbability ngs = foldl' (*) 1 (map probability ngs)
+
+-- ^ Recovers all possible phrases from the NGrams given.
+-- The first argument is the number of alternative endings that
+-- were given originally.
+recoverPhrases :: Int -> [NGram] -> [[NGram]]
+recoverPhrases numAlts ngs =
+ let initial = take (ngLength - numAlts) ngs
+ rest = drop (ngLength - numAlts) ngs
+ ngLength = length ngs
+ in
+ map (\r -> initial ++ [r]) rest
+
+-- ^ Given a list of possible phrases (broken into n-sized chunks),
+-- calculate the ngram associated with each chunk.
+getProbabilities :: [[String]] -> IO [NGram]
+getProbabilities phrase = mapM getProbability phrase
+ where
+ getProbability phrase =
+ do
+ denomCount <- liftM (max 1) $ resultCount (init phrase)
+ numCount <- liftM (max 1) $ resultCount phrase
+ return $ NGram phrase
+ numCount denomCount (fromIntegral numCount / fromIntegral denomCount)
+
+-- ^ Determine how many times the phrase given appears on the web, according
+-- to Google's estimate.
+resultCount :: [String] -> IO Integer
+resultCount terms =
+ let makeQuery terms = "http://www.google.com/search?q=" ++
+ (urlEncode $ unwords (map ("+" ++) terms))
+ in case parseURI (makeQuery terms) of
+ Just uri ->
+ do
+ browse $
+ do
+ setErrHandler (const (return()))
+ setOutHandler (const (return()))
+ (_, result) <- request $ defaultGETRequest uri
+ case matchRegex (mkRegex "of about <b>([[:digit:],]*)</b>") (rspBody result) of
+ Nothing -> return 0
+ Just strs -> return $ fst $ head $ readDec (filter (/=',') (head strs))
+ _ -> error $ "Could not make uri from the terms " ++ show terms