module Paradigm where import Types import Data.Maybe(isJust) import RegExp import Store import Extract.Lex import Extract.Par import qualified Extract.Abs as Abs import Data.List import Extract.ErrM import Data.SharedString import qualified Data.Set as Set import System.IO import Text.Printf(hPrintf) import Control.Monad(foldM) import Store import Tok import Translate delimit = "\t" ffind :: Options -> Paradigms -> FilePath -> FilePath -> IO () ffind opts ps file parafile = do hPutStr stderr $ (show (length ps)) ++ " " ++ (paradigm_name (length ps)) ++ " read from '" ++ parafile ++ "'.\n" s <- readFile file hPutStr stderr $ prtCorpus s file let xs = get_data s opts bs@(left,right) = boundaries ps -- hPutStr stderr $ "CB: (" ++ (printB left) ++"," ++ (printB right) ++ ")\n" case process_data xs bs of st -> do process st (storeWords st) (Set.empty) (0,0) where prtCorpus s f | is_structured s = "\nReading preprocessed data from '" ++ file ++ "'...\n\n" | otherwise = "\nReading raw text data from '" ++ file ++ "'...\n\n" printB (None) = "empty" printB (Unlimited) = "unbounded" printB (BSize 0) = "empty" printB (BSize x) = show x paradigm_name n | n == 0 = "no rule" | n == 1 = "rule" | n > 1 = "rules" word_list s = Tok.tokens (["-" ++ o | _:o <- opts] ++ ["-nopunct","-nonumber"]) s process st [] output_set p@(hit,miss) = do let f x = ((fromInteger (toInteger x)/(fromInteger (hit+miss))) * 100 :: Float) s1 = f hit hPrintf stderr "\nUnique tokens : %d\n" (hit+miss) hPrintf stderr "Corpus Usage : %.2f%s\n" s1 "%" hPrintf stderr "Words Extracted : %d\n" (Set.size output_set) process st (x:newset) output_set p@(hit,miss) = do (b,output) <- printLex opts (matchParadigms st ps x) case output of (_:_) | elem "-u" opts -> do os <- foldM process_set output_set output process st newset os (update b p) (_:_) -> do sequence_ $ map print_it output process st newset output_set (update b p) _ -> process st newset output_set (update b p) process_set set x | Set.member x set = return set | otherwise = do print_it x case x of ('-':'-':_) -> return set _ -> return $ Set.insert x set update b (i1,i2) | b = (i1+1,i2) | otherwise = (i1,i2+1) print_it x = putStrLn x printLex :: [String] -> (String, [(Name, String, [String], [String])]) -> IO (Bool,[String]) printLex opts (w,ps) = case ps of [] | elem "-nobad" opts -> return (False,[]) --[] [] -> do return $ (False,["-- " ++ w]) _ | elem "-e" opts -> do return (True,[p ++ " \"" ++ concat (intersperse "\", \"" ss) ++ "\"" ++ pr_id opts (Data.List.head ss) ++ "; " ++ printEvidence (nub es) | (p,pid, ss, es) <- ps]) _ -> do return (True,[p ++ " \"" ++ concat (intersperse "\", \"" ss) ++ "\"" ++ pr_id opts (Data.List.head ss) ++ "; " | (p,pid,ss, _) <- ps]) pr_id opts pid | elem "-i" opts = " {id(\"" ++ pid ++ "..POS.1\")}" | otherwise = "" printEvidence :: [String] -> String printEvidence [] = [] printEvidence (x:xs) = "-- " ++ concat (intersperse "," (x:xs)) printArgs :: [String] -> String printArgs = unwords . map (\s -> '"':s++"\"") -- test all paradigms on one word. matchParadigms :: Store -> Paradigms -> String -> (String,[(Name,String, [String],[String])]) matchParadigms st ps w = (w, mParadigms ps) where mParadigms [] = [] mParadigms (p:ps) = case matchWordForm p w of [] -> mParadigms ps xs -> [(name p, ident p, ss, evidence) | env <- xs, Just (evidence,xss) <- [matchBody st (body p) env], env' <- [env++xs | xs <- if null xss then [[]] else xss], checkEnv env', Just ss <- [matchHead st (Types.head p) env']] ++ mParadigms ps -- checkEnv ensures that a variable is only instantiated to one value. -- checkEnv = and . map check . groupBy (\a -> \b -> fst a == fst b) . sort where check [] = True check [_] = True check (x:y:ys) = (snd x == snd y) && check (y:ys) matchHead :: Store -> Head -> ITable -> Maybe [String] matchHead st hs env = case [s | (r,cons) <- hs, s <- [generateString r env], isJust (queryStore st s cons True)] of xs | length xs == length hs -> Just xs _ -> Nothing -- match body of a paradigm rule, with variables instantiated (ITable) matchBody :: Store -> Body -> ITable -> Maybe (MatchedWordForms,[[(Variable,String)]]) matchBody st (Conj l1 l2) env = case (matchBody st l1 env, matchBody st l2 env) of (Just (xs,xss), Just (ys,yss)) -> return (xs ++ ys, combine xss yss) _ -> Nothing matchBody st (Disj l1 l2) env = case (matchBody st l1 env, matchBody st l2 env) of (Just (xs,xss), Just (ys,yss)) -> return (xs ++ ys, combine xss yss) (xs, Nothing) -> xs (Nothing, xs) -> xs matchBody st (Neg l1) env = case (matchBody st l1 env) of Just _ -> Nothing Nothing -> Just ([],[]) matchBody st (Atom ((_,r),cons)) env = case generateString r env of s -> case queryStore st s cons False of Just xss -> return ([s],xss) Nothing -> Nothing combine xss yss = [xs++ys | xs <- xss, ys <- yss] -- Find instantiations for variables matchWordForm :: Paradigm -> String -> [[(Variable, String)]] matchWordForm p s = pmatch (body p) where pmatch l = case l of Conj l1 l2 -> pmatch l1 ++ pmatch l2 Disj l1 l2 -> pmatch l1 ++ pmatch l2 Neg l1 -> pmatch l1 Atom ((reg,_),_) -> case (matchReg reg s) of Just xs -> [xs] Nothing -> [] get_paradigms :: FilePath -> IO (Err Paradigms) get_paradigms f = do s <- readFile f case pGrammar (convert (myLexer s)) of Bad s -> return (Bad s) Ok tree -> return $ collect_paradigms tree where convert = id boundaries :: Paradigms -> (Boundary, Boundary) boundaries ps = bound (map body ps) (head_bound (concat (map Types.head ps)) (None,None)) where head_bound [] res = res head_bound ((_,Nothing):xs) res = head_bound xs res head_bound ((_,Just b):xs) res = head_bound xs (getBoundC b res) bound [] res = res bound (x:xs) res@(Unlimited,Unlimited) = res bound (x:xs) (l,r) = case getBound x (None,None) of (l1,r1) -> bound xs (update l1 l, update r1 r) update Unlimited x = Unlimited update x@(BSize _) None = x update (BSize x) (BSize y) = BSize (max x y) update _ x = x getBound (Atom (_,Nothing)) res = res getBound (Atom (_,Just c)) res = getBoundC c res getBound l res = case l of Conj l1 l2 -> case (getBound l1 res, getBound l2 res) of ((x1,x2),(x3,x4)) -> (update x1 x3, update x2 x4) Disj l1 l2 -> case (getBound l1 res, getBound l2 res) of ((x1,x2),(x3,x4)) -> (update x1 x3, update x2 x4) Neg l1 -> getBound l1 res getBoundC (Atom (Nothing,_,_)) res = res getBoundC (Atom (Just p,_,_)) res = updatePos p res getBoundC l res = case l of Conj l1 l2 -> case (getBoundC l1 res, getBoundC l2 res) of ((x1,x2),(x3,x4)) -> (update x1 x3, update x2 x4) Disj l1 l2 -> case (getBoundC l1 res, getBoundC l2 res) of ((x1,x2),(x3,x4)) -> (update x1 x3, update x2 x4) Neg l1 -> getBoundC l1 res updatePos (Relative p) (p1,p2) = if p < 0 then (update (BSize (abs p)) p1, p2) else (p1, update (BSize p) p2) updatePos (Spanning p _) (p1,p2) = if p < 0 then (Unlimited, p2) else (p1, Unlimited) updatePos _ res = res