module Tok where import qualified Data.Char as Char import Data.List import Distribution.System type Options = [String] getOpts :: [String] -> (Options,[String]) getOpts = partition ((=='-') . head) count_tokens :: String -> Int count_tokens = length . nub . tokens [] tokens :: Options -> String -> [String] tokens opts = {- format opts . -} ignore opts . arrange opts . analyse opts . utf8 opts utf8 opts = id analyse :: Options -> String -> [String] analyse opts = ana where ana [] = [] ana s = case s of p:cs | isPunct p -> [p] : ana cs c:cs | Char.isSpace c -> ana cs c:cs -> readWord (c:cs) isSpec c = Char.isSpace c || isPunct c uncap w = if elem "-uncap" opts then (map Char.toLower w) else w readWord (x:xs) | elem "-nocap" opts && Char.isUpper x = ana (dropWhile (not . isSpec) xs) | otherwise = let (w,rest) = span (not . isSpec) (x:xs) in uncap w : ana rest isPunct :: Char -> Bool isPunct = flip elem ",.:;?!" isPunctS :: String -> Bool isPunctS [] = False isPunctS (c:_) = isPunct c isNumber :: String -> Bool isNumber [] = False isNumber (c:_) = Char.isDigit c arrange :: Options -> [String] -> [String] arrange opts | elem "-sort" opts = sort | elem "-stat" opts = map printStat . statistics | elem "-freq" opts = map printStat . frequency . statistics | elem "-nub" opts = map fst . statistics | otherwise = id ignore :: Options -> [String] -> [String] ignore opts = filter (\w -> not (or [i w | i <- igns])) where igns = [isPunctS | elem "-nopunct" opts] ++ [isNumber | elem "-nonumber" opts] format :: Options -> [String] -> String format opts | otherwise = unlines statistics :: [String] -> [(String,Int)] statistics = compress . sort where compress ws = case ws of w:ww -> let (w1,w2) = span (==w) ww n = 1 + length w1 in (w,n) : compress w2 _ -> [] frequency :: [(String,Int)] -> [(String,Int)] frequency = sortBy (\ (_,m) (_,n) -> compare n m) printStat :: (String,Int) -> String printStat (s,i) = s ++ " " ++ show i