Пример #1
0
        // Pick the merge candidate whose core merge word ranks highest by
        // frequency score.
        // - candidates: merge candidates to rank; an empty set yields null.
        // - cSpellApi: supplies the word count map and the max. candidate
        //   number used for the score detail print.
        // - debugFlag: forwarded to the frequency score detail printer.
        // - tarPos (start from 0, not include empty space token) and
        //   nonSpaceTokenList are not read by this frequency based ranking.
        // Returns the MergeObj mapped to the top ranked string, or null.
        private static MergeObj GetTopRankMergeObjByFrequency(HashSet <MergeObj> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List <TokenObj> nonSpaceTokenList)
        {
            WordWcMap freqMap  = cSpellApi.GetWordWcMap();
            int       printMax = cSpellApi.GetCanMaxCandNo();

            // nothing to rank
            if (candidates.Count <= 0)
            {
                return(null);
            }
            // 1. index the candidates by core merge word; a later candidate
            //    with the same word replaces the earlier one
            Dictionary <string, MergeObj> byCoreWord = new Dictionary <string, MergeObj>();
            foreach (MergeObj cand in candidates)
            {
                byCoreWord[cand.GetCoreMergeWord()] = cand;
            }
            HashSet <string> coreWords = new HashSet <string>(byCoreWord.Keys);

            // 2. rank the strings by frequency
            string bestWord = RankByFrequency.GetTopRankStr(coreWords, freqMap);

            // 3. map the winning string back to its MergeObj
            MergeObj best = null;
            if (bestWord != null)
            {
                best = byCoreWord.GetValueOrNull(bestWord);
            }
            // 4. print out frequency score detail
            ScoreDetailByMode.PrintFrequencyScore(coreWords, freqMap, printMax, debugFlag);
            return(best);
        }
Пример #2
0
        // Frequency score that also covers multiwords: the average of the
        // GetScoreDev1 scores of the words returned by TermUtil.ToWordList.
        // Returns 0.0 when the summed score is not positive (which includes
        // the case of an empty word list).
        private static double GetScoreDev2(string inWord, WordWcMap wordWcMap)
        {
            // false: don't use punctuation for determiner
            List <string> words  = TermUtil.ToWordList(inWord, false);
            int           wordNo = words.Count;
            double        sum    = 0.0;

            for (int i = 0; i < wordNo; i++)
            {
                sum += GetScoreDev1(words[i], wordWcMap);
            }
            // only a positive sum is averaged, so an empty list never divides by zero
            return((sum > 0.0) ? (sum / wordNo) : 0.0);
        }
Пример #3
0
        // Score for a single word:
        //   score = log(adjusted WC of the word) / log(maxWc)
        // maxWc is supplied by the caller; the word count comes from GetAdjustedWc.
        // NOTE(review): the original comment promised a 0.0 ~ 1.0 range and
        // equality with GetCorpusFreqScore; both depend on GetAdjustedWc - confirm.
        private static double GetWordScore(string inWord, double maxWc, WordWcMap wordWcMap)
        {
            double logWc    = Math.Log(GetAdjustedWc(inWord, wordWcMap));
            double logMaxWc = Math.Log(maxWc);

            return(logWc / logMaxWc);
        }
Пример #4
0
        // Score for a single word, normalized by the total word count:
        //   score = log(WC(word) / totalWc) / log(maxWc / totalWc)
        // Marked by the original author as not used ("it is no good").
        // NOTE(review): the original comment says this should match
        // GetUnigramFreqScore with a 0.0 ~ 1.0 range - confirm.
        private static double GetWordScore2(string inWord, double maxWc, WordWcMap wordWcMap)
        {
            double wordWc   = 1.0d * GetWc(inWord, wordWcMap);
            double corpusWc = 1.0d * wordWcMap.GetTotalWc();
            double wordLog  = Math.Log(wordWc / corpusWc);
            double maxLog   = Math.Log(maxWc / corpusWc);

            return(wordLog / maxLog);
        }
Пример #5
0
        // Possibility of a word: WC(word) / total word count.
        // caseFlag is forwarded to GetWc.
        private static double GetWordPossOverTotalWc(string inWord, WordWcMap wordWcMap, bool caseFlag)
        {
            double wordWc   = 1.0d * GetWc(inWord, wordWcMap, caseFlag);
            double corpusWc = 1.0d * wordWcMap.GetTotalWc();

            return(wordWc / corpusWc);
        }
Пример #6
0
        // Single word score: WC(word) / max. word count of the corpus.
        // The original comment notes that multiwords score 0.0 here.
        private static double GetScoreDev1(string inWord, WordWcMap wordWcMap)
        {
            long   corpusMaxWc = wordWcMap.GetMaxWc();
            double wordWc      = GetWc(inWord, wordWcMap);

            return(wordWc / (1.0 * corpusMaxWc));
        }
Пример #7
0
        // Relative frequency score: WC(word) / total word count of the corpus.
        private static double GetScoreByPeter(string inWord, WordWcMap wordWcMap)
        {
            long   corpusTotalWc = wordWcMap.GetTotalWc();
            double wordWc        = GetWc(inWord, wordWcMap);

            return(wordWc / (1.0 * corpusTotalWc));
        }
Пример #8
0
        // Rank the correction candidates of inStr with the rank mode configured
        // in cSpellApi and return the top ranked string.
        // - inStr is returned unchanged when candidates is empty or when the
        //   rank mode matches none of the modes handled below.
        // - tarPos: start from 0, not include empty space token.
        // - After ranking, the score detail of the same mode is handed to
        //   ScoreDetailByMode together with debugFlag.
        public static string GetTopRankStr(string inStr, HashSet <string> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List <TokenObj> nonSpaceTokenList)
        {
            // read all settings up front
            int       mode       = cSpellApi.GetRankMode();
            double    fac1       = cSpellApi.GetOrthoScoreEdDistFac();
            double    fac2       = cSpellApi.GetOrthoScorePhoneticFac();
            double    fac3       = cSpellApi.GetOrthoScoreOverlapFac();
            WordWcMap freqMap    = cSpellApi.GetWordWcMap();
            int       printMax   = cSpellApi.GetCanMaxCandNo();
            Word2Vec  w2vIm      = cSpellApi.GetWord2VecIm();
            Word2Vec  w2vOm      = cSpellApi.GetWord2VecOm();
            int       radius     = cSpellApi.GetNw1To1ContextRadius();
            bool      skipWord   = cSpellApi.GetWord2VecSkipWord();
            double    rangeFac   = cSpellApi.GetRankNwS1RankRangeFac();
            double    minOScore  = cSpellApi.GetRankNwS1MinOScore();
            int       targetSize = 1; // only for one-to-one or split, no merge here
            string    best       = inStr;

            // no candidate: keep the input
            if (candidates.Count <= 0)
            {
                return(best);
            }
            // rank, then print the score detail of the selected mode
            switch (mode)
            {
            case CSpellApi.RANK_MODE_ORTHOGRAPHIC:
                best = RankByOrthographic.GetTopRankStr(inStr, candidates, fac1, fac2, fac3);
                ScoreDetailByMode.PrintOrthographicScore(inStr, candidates, printMax, fac1, fac2, fac3, debugFlag);
                break;

            case CSpellApi.RANK_MODE_FREQUENCY:
                best = RankByFrequency.GetTopRankStr(candidates, freqMap);
                ScoreDetailByMode.PrintFrequencyScore(candidates, freqMap, printMax, debugFlag);
                break;

            case CSpellApi.RANK_MODE_CONTEXT:
                best = RankByContext.GetTopRankStr(inStr, candidates, tarPos, targetSize, nonSpaceTokenList, w2vIm, w2vOm, skipWord, radius);
                ScoreDetailByMode.PrintContextScore(candidates, tarPos, targetSize, nonSpaceTokenList, w2vIm, w2vOm, skipWord, radius, printMax, debugFlag);
                break;

            case CSpellApi.RANK_MODE_NOISY_CHANNEL:
                best = RankByNoisyChannel.GetTopRankStr(inStr, candidates, freqMap, fac1, fac2, fac3);
                ScoreDetailByMode.PrintNoisyChannelScore(inStr, candidates, freqMap, printMax, fac1, fac2, fac3, debugFlag);
                break;

            case CSpellApi.RANK_MODE_ENSEMBLE:
                best = RankByEnsemble.GetTopRankStr(inStr, candidates, freqMap, tarPos, targetSize, nonSpaceTokenList, w2vIm, w2vOm, skipWord, radius, rangeFac, fac1, fac2, fac3);
                // ensemble prints the same basic score detail as CSpell
                ScoreDetailByMode.PrintCSpellScore(inStr, candidates, freqMap, printMax, tarPos, targetSize, nonSpaceTokenList, w2vIm, w2vOm, skipWord, radius, fac1, fac2, fac3, debugFlag);
                break;

            case CSpellApi.RANK_MODE_CSPELL:
                best = RankByCSpellNonWord.GetTopRankStr(inStr, candidates, freqMap, tarPos, targetSize, nonSpaceTokenList, w2vIm, w2vOm, skipWord, radius, rangeFac, minOScore, fac1, fac2, fac3);
                ScoreDetailByMode.PrintCSpellScore(inStr, candidates, freqMap, printMax, tarPos, targetSize, nonSpaceTokenList, w2vIm, w2vOm, skipWord, radius, fac1, fac2, fac3, debugFlag);
                break;
            }
            return(best);
        }
Пример #9
0
        // Run Test on a fixed set of (word, candidate) pairs.
        // Not completed with contextScore: w2vOm is accepted but not read here.
        private static void Tests(WordWcMap wordWcMap, Word2Vec w2vOm)
        {
            // { word, candidate }
            string[,] pairs =
            {
                { "spel",   "spell"   },
                { "spel",   "speil"   },
                { "spelld", "spell"   },
                { "spelld", "spelled" }
            };

            for (int i = 0; i < pairs.GetLength(0); i++)
            {
                Test(pairs[i, 0], pairs[i, 1], wordWcMap);
            }
        }
Пример #10
0
        // Build the NoisyChannelScore of candStr for wordStr with the fixed
        // weights 1.00, 0.70 and 0.80, and print its string form to stdout.
        private static void Test(string wordStr, string candStr, WordWcMap wordWcMap)
        {
            const double w1 = 1.00;
            const double w2 = 0.70;
            const double w3 = 0.80;

            NoisyChannelScore result = new NoisyChannelScore(wordStr, candStr, wordWcMap, w1, w2, w3);
            string            line   = result.ToString();

            Console.WriteLine(line);
        }
Пример #11
0
 // Build the noisy channel score of candStr as a correction of wordStr:
 //   score = orthographic score * frequency score
 // (the constructor is public, although the old comment called it private)
 public NoisyChannelScore(string wordStr, string candStr, WordWcMap wordWcMap, double wf1, double wf2, double wf3)
 {
     this.wordStr_ = wordStr;
     this.candStr_ = candStr;

     // component scores
     this.oScore_ = new OrthographicScore(this.wordStr_, this.candStr_, wf1, wf2, wf3);
     this.fScore_ = new FrequencyScore(this.candStr_, wordWcMap);

     // combined score
     var orthoValue = this.oScore_.GetScore();
     var freqValue  = this.fScore_.GetScore();
     this.score_ = orthoValue * freqValue;
 }
Пример #12
0
        // org code from baseline, TBM
        // Corpus frequency score: log(freq) / log(max. word count).
        // - freq is the word count of inWord in the corpus map.
        // - When inWord is not in the map and is a multiword (blank separated)
        //   or a possessive ("XXX's" is treated as "XXX" + "'s"), freq falls
        //   back to the min. word count of its non-empty parts.
        // - Returns 0.0 when no positive freq can be found.
        // Known limitation (from the original): every XXX's gets the same
        // score when WC(XXX) is bigger than WC('s), because the min. is used.
        public static double GetCorpusFreqScore(string inWord, WordWcMap wordWcMap)
        {
            // get the wc; TryGetValue leaves freq at 0 when the word is missing
            Dictionary <string, int> wWcMap = wordWcMap.GetWordWcMap();
            int freq;
            wWcMap.TryGetValue(inWord, out freq);

            // check if inWord is a multiword.
            // The separator is the blank character: string.Split takes its
            // separator literally, so the regex-looking "[ ]" used before
            // (apparently carried over from a Java regex split) never matched a
            // blank and multiwords were never detected. TrimEnd keeps trailing
            // blanks from producing empty trailing parts.
            IList <string> spls    = inWord.TrimEnd(' ').Split(' ').ToList();
            bool           isSplit = spls.Count >= 2;

            // check possessive
            if (!isSplit && inWord.EndsWith("'s", StringComparison.Ordinal))
            {
                spls = new List <string>();
                spls.Add(inWord.Substring(0, inWord.Length - 2));
                spls.Add("'s");
                isSplit = true;
            }
            // use the min. wc of split word in the multiword's case
            if (freq == 0 && isSplit)
            {
                int min = int.MaxValue;
                foreach (string spl in spls)
                {
                    if (String.IsNullOrEmpty(spl))
                    {
                        continue;
                    }
                    int splFreq;
                    wWcMap.TryGetValue(spl, out splFreq);
                    // use the min. freq of each word as the freq of the multiwords
                    if (splFreq >= 0 && splFreq < min)
                    {
                        min = splFreq;
                    }
                }
                // no usable part: keep 0 rather than leaking int.MaxValue into the score
                freq = (min == int.MaxValue ? 0 : min);
            }
            // log() is only defined for positive counts
            if (freq <= 0)
            {
                return(0.0);
            }
            long   maxWc = wordWcMap.GetMaxWc();
            double score = (Math.Log(freq) / Math.Log(maxWc));

            return(score);
        }
Пример #13
0
 // Build the CSpell score of candStr as a correction of wordStr. It holds
 // four component scores: orthographic, frequency, noisy channel and context.
 // (the constructor is public, although the old comment called it private)
 public CSpellScore(string wordStr, string candStr, WordWcMap wordWcMap, DoubleVec contextVec, Word2Vec word2Vec, double wf1, double wf2, double wf3)
 {
     this.wordStr_ = wordStr;
     this.candStr_ = candStr;

     // component scores, each computed from the stored strings
     this.oScore_ = new OrthographicScore(this.wordStr_, this.candStr_, wf1, wf2, wf3);
     this.fScore_ = new FrequencyScore(this.candStr_, wordWcMap);
     this.nScore_ = new NoisyChannelScore(this.wordStr_, this.candStr_, wordWcMap, wf1, wf2, wf3);
     this.cScore_ = new ContextScore(this.candStr_, contextVec, word2Vec);
 }
Пример #14
0
        // Return the best ranked str from candidates using frequency score:
        // the word of the first entry of GetCandidateScoreList, or "" when
        // that list is empty.
        // NOTE(review): relies on GetCandidateScoreList being ordered best
        // first - confirm against its implementation.
        public static string GetTopRankStr(HashSet <string> candidates, WordWcMap wordWcMap)
        {
            List <FrequencyScore> ranked = GetCandidateScoreList(candidates, wordWcMap);

            return((ranked.Count > 0) ? ranked[0].GetWord() : "");
        }
Пример #15
0
        // Return the candidate strings in the order of GetCandidateScoreList
        // (sorted by score, higher first, per the original comment).
        public static List <string> GetCandidateStrList(HashSet <string> candidates, WordWcMap wordWcMap)
        {
            List <FrequencyScore> ranked = GetCandidateScoreList(candidates, wordWcMap);

            // project each score object to its word, keeping the order
            return(ranked.ConvertAll <string>(fs => fs.GetWord()));
        }
Пример #16
0
        // Frequency score: 1.0 + ln(WC) for a word with a non-zero word
        // count, and 0.5 for a word that is not in the corpus (WC is 0).
        // NOTE(review): the original comment cites "1990, 1191 paper from
        // Church" although the method is named after Crowell - confirm the source.
        private static double GetScoreByCrowell(string inWord, WordWcMap wordWcMap)
        {
            double wordWc = GetWc(inWord, wordWcMap);

            if (wordWc == 0.0)
            {
                // not in the corpus
                return(0.5);
            }
            return(1.0 + Math.Log(wordWc));
        }
Пример #17
0
 // Test driver: constructs a WordWcMap from the baseline word frequency
 // file with the verbose flag set.
 // Any command line argument prints the usage line and exits with code 0.
 public static void MainTest(string[] args)
 {
     if (args.Length != 0)
     {
         Console.WriteLine("Usage: java WordWcMap");
         Environment.Exit(0);
     }
     // test: the instance is not read afterwards; constructing it is the test
     const string inFile      = "../data/Freq/baselineWordFreq.data";
     const bool   verboseFlag = true;
     WordWcMap    wordWcMap   = new WordWcMap(inFile, verboseFlag);
 }
        // private methods
        // Interactive test loop for the noisy channel ranking.
        // Reads lines from standard input until end of input; for each line it
        // prints the number of 1-to-1 non-word candidates and the top ranked
        // string. When detailFlag is set, it also prints up to limitNo
        // candidate scores ordered by NoisyChannelScoreComparator.
        // Returns 0 on success, -1 when an exception was caught (its message
        // is written to stderr).
        private static int RunTest(bool detailFlag, long limitNo)
        {
            // init dic
            CSpellApi cSpellApi = new CSpellApi("../data/Config/cSpell.properties");
            WordWcMap freqMap   = cSpellApi.GetWordWcMap();
            double    fac1      = cSpellApi.GetOrthoScoreEdDistFac();
            double    fac2      = cSpellApi.GetOrthoScorePhoneticFac();
            double    fac3      = cSpellApi.GetOrthoScoreOverlapFac();

            cSpellApi.SetRankMode(CSpellApi.RANK_MODE_NOISY_CHANNEL);
            // provide cmdLine interface
            int status = 0;
            NoisyChannelScoreComparator <NoisyChannelScore> comparator = new NoisyChannelScoreComparator <NoisyChannelScore>();

            try {
                StreamReader stdInput = new StreamReader(Console.OpenStandardInput());
                try {
                    Console.WriteLine("- Please input a text (type \"Ctl-d\" to quit) > ");
                    // ReadLine returns null at end of input, which ends the loop
                    for (string line = stdInput.ReadLine(); line != null; line = stdInput.ReadLine())
                    {
                        // get all possible candidates
                        HashSet <string> candSet = NonWord1To1Candidates.GetCandidates(line, cSpellApi);
                        Console.WriteLine("-- canSet.size(): " + candSet.Count);
                        // get final suggestion
                        string best = GetTopRankStr(line, candSet, freqMap, fac1, fac2, fac3);
                        Console.WriteLine("- top tank str: " + best);
                        // print details only on request
                        if (!detailFlag)
                        {
                            continue;
                        }
                        HashSet <NoisyChannelScore> scoreSet = GetCandidateScoreSet(line, candSet, freqMap, fac1, fac2, fac3);
                        Console.WriteLine("------ Suggestion List ------");
                        var detailLines = scoreSet.OrderBy(x => x, comparator).Take((int)limitNo).Select(obj => obj.ToString()).ToList();
                        foreach (var detailLine in detailLines)
                        {
                            Console.WriteLine(detailLine);
                        }
                    }
                } catch (Exception loopError) {
                    // failure while reading or ranking
                    Console.Error.WriteLine(loopError.Message);
                    status = -1;
                }
            } catch (Exception openError) {
                // failure while opening standard input
                Console.Error.WriteLine(openError.Message);
                status = -1;
            }
            return(status);
        }
Пример #19
0
 // Debug helper: print the frequency score detail of the candidates.
 // Does nothing unless debugFlag is set. Otherwise the frequency scores of
 // candSet are ordered by FrequencyScoreComparator, the first maxCandNo are
 // converted to strings and each is passed to DebugPrint.PrintFScore.
 public static void PrintFrequencyScore(HashSet <string> candSet, WordWcMap wordWcMap, int maxCandNo, bool debugFlag)
 {
     if (!debugFlag)
     {
         return;
     }
     FrequencyScoreComparator <FrequencyScore> comparator = new FrequencyScoreComparator <FrequencyScore>();
     HashSet <FrequencyScore> scores = RankByFrequency.GetCandidateScoreSet(candSet, wordWcMap);
     // materialize all lines before printing
     List <string> lines = scores.OrderBy(fs => fs, comparator)
                           .Take(maxCandNo)
                           .Select(fs => fs.ToString())
                           .ToList();
     foreach (string line in lines)
     {
         DebugPrint.PrintFScore(line, debugFlag);
     }
 }
Пример #20
0
        // Test driver: constructs a WordWcMap from the word count file with
        // the verbose flag set and runs Tests on it.
        // Any command line argument prints the usage line and exits with code 0.
        public static void MainTest(string[] args)
        {
            if (args.Length != 0)
            {
                Console.WriteLine("Usage: java WordCountScore");
                Environment.Exit(0);
            }
            // test
            const string inFile      = "../data/Frequency/wcWord.data";
            const bool   verboseFlag = true;

            Tests(new WordWcMap(inFile, verboseFlag));
        }
Пример #21
0
 // Debug helper: print the CSpell score detail of the candidates (NW 1To1).
 // The list is sorted by CSpellScoreNw1To1Comparator only; per the original
 // note it does not show how cSpell really finds the top rank, which also
 // uses context and frequency.
 // Does nothing unless debugFlag is set; at most maxCandNo entries are
 // printed through DebugPrint.PrintScore.
 public static void PrintCSpellScore(string inStr, HashSet <string> candSet, WordWcMap wordWcMap, int maxCandNo, int tarPos, int tarSize, List <TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, double wf1, double wf2, double wf3, bool debugFlag)
 {
     if (!debugFlag)
     {
         return;
     }
     CSpellScoreNw1To1Comparator <CSpellScore> comparator = new CSpellScoreNw1To1Comparator <CSpellScore>();
     HashSet <CSpellScore> scores = RankByCSpellNonWord.GetCandidateScoreSet(inStr, candSet, wordWcMap, tarPos, tarSize, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, wf1, wf2, wf3, debugFlag);
     // materialize all lines before printing
     List <string> lines = scores.OrderBy(cs => cs, comparator)
                           .Take(maxCandNo)
                           .Select(cs => cs.ToString())
                           .ToList();
     foreach (string line in lines)
     {
         DebugPrint.PrintScore(line, debugFlag);
     }
 }
Пример #22
0
        // Print a "Word|Score" header and run Test on a fixed list of sample
        // strings, in the order listed.
        private static void Tests(WordWcMap wordWcMap)
        {
            string[] testStrs =
            {
                // single words
                "the",
                "if",
                "you",
                "doctor",
                "Doctor",            // case variant
                "doctor[123]",
                "'s",
                "container",
                "diagnose",
                "deionized",
                "&eacute;vy",
                "xxxx",
                // possessive
                "doctor's",
                "heart's",
                // multiwords
                "if you",
                "the doctor",
                "Not exist",
                // one run-together string, its pieces, and several splits of it
                "brokenribscantsleepatnight",
                "broken",
                "rib",
                "ribs",
                "cant",
                "cants",
                "scant",
                "scants",
                "sleep",
                "leep",
                "lee",
                "pat",
                "at",
                "night",
                "broken ribs cants leep at night",
                "broken ribs cant sleep at night",
                "broken rib scants leep at night",
                "broken rib scants lee pat night",
                "broken rib scant sleep at night"
            };
            const string ruler = "=================================================";

            Console.WriteLine(ruler);
            Console.WriteLine("Word|Score");
            Console.WriteLine(ruler);
            for (int i = 0; i < testStrs.Length; i++)
            {
                Test(testStrs[i], wordWcMap);
            }
        }
Пример #23
0
        // Test driver: constructs a WordWcMap from the word count file and a
        // Word2Vec from the context data file (both with the verbose flag
        // set), then runs Tests on them.
        // Any command line argument prints the usage line and exits with code 0.
        public static void MainTest(string[] args)
        {
            if (args.Length != 0)
            {
                Console.WriteLine("Usage: java CSpellScore");
                Environment.Exit(0);
            }
            // test
            const bool verboseFlag = true;
            WordWcMap  wordWcMap   = new WordWcMap("../data/Frequency/wcWord.data", verboseFlag);
            Word2Vec   w2vOm       = new Word2Vec("../data/Context/syn1n.data", verboseFlag);

            Tests(wordWcMap, w2vOm);
        }
Пример #24
0
        // org code from baseline, TBM, From Ensemble
        // Unigram frequency score:
        //   score = log(freq / totalWc) / log(maxWc / totalWc)
        // - freq is the word count of inWord in the corpus map.
        // - When that count is 0 and inWord is a multiword or a possessive
        //   ("XXX's" is treated as "XXX" + "'s"), freq is the min. word count
        //   of its non-empty parts.
        // - Returns 0.0 when freq is 0, to avoid infinity.
        // NOTE(review): Split("[ ]", true) is a project helper that is not
        // visible here; presumably a regex split on blanks - confirm.
        public static double GetUnigramFreqScore(string inWord, WordWcMap wordWcMap)
        {
            Dictionary <string, int> counts = wordWcMap.GetWordWcMap();
            int            wc    = (counts.ContainsKey(inWord) ? counts.GetValueOrNull(inWord) : 0);
            IList <string> parts = inWord.Split("[ ]", true).ToList();
            bool           multi = parts.Count >= 2;

            // a possessive single word is handled like a two part multiword
            if (!multi && inWord.EndsWith("'s", StringComparison.Ordinal))
            {
                parts = new List <string> { inWord.Substring(0, inWord.Length - 2), "'s" };
                multi = true;
            }
            // use the min. wc of split word in the multiword's case
            if (wc == 0 && multi)
            {
                int lowest = int.MaxValue;
                foreach (string part in parts)
                {
                    if (String.IsNullOrEmpty(part))
                    {
                        continue;
                    }
                    int partWc = (counts.ContainsKey(part) ? counts.GetValueOrNull(part) : 0);
                    if (partWc >= 0 && partWc < lowest)
                    {
                        lowest = partWc;
                    }
                }
                wc = lowest;
            }
            if (wc == 0)
            {
                return(0.0);                // to avoid infinity
            }
            long   maxWc       = wordWcMap.GetMaxWc();
            long   totalWc     = wordWcMap.GetTotalWc();
            double numerator   = Math.Log(1.0 * wc / totalWc);
            double denominator = Math.Log(1.0 * maxWc / totalWc);

            return(numerator / denominator);
        }
Пример #25
0
        // Return the candidate with the highest frequency score.
        // A candidate only wins with a score strictly above the best so far,
        // starting from 0.0: "" is returned when no candidate scores above
        // 0.0, and on a tie the candidate enumerated first is kept.
        public static string GetTopRankStrByScore(HashSet <string> candidates, WordWcMap wordWcMap)
        {
            string best      = "";
            double bestScore = 0.0;

            foreach (string cand in candidates)
            {
                double candScore = new FrequencyScore(cand, wordWcMap).GetScore();
                if (candScore > bestScore)
                {
                    bestScore = candScore;
                    best      = cand;
                }
            }
            return(best);
        }
Пример #26
0
        // Look up the word count of inWord in the corpus word count map.
        // - caseFlag: true looks the word up as given; false lowercases it
        //   first (the keys of the map are lowercased in the Beta version).
        // Returns the word count, or 0 when the word is not in the map.
        private static int GetWc(string inWord, WordWcMap wordWcMap, bool caseFlag)
        {
            // ignore case unless caseFlag is set
            string key = (caseFlag ? inWord : inWord.ToLower());
            Dictionary <string, int> wWcMap = wordWcMap.GetWordWcMap();
            int wc;

            // An int value can never be null, so a null test on the looked-up
            // value cannot detect a missing key. TryGetValue reports the miss
            // explicitly and needs a single lookup.
            return(wWcMap.TryGetValue(key, out wc) ? wc : 0);
        }
Пример #27
0
        // public method
        // Frequency score of inWord. The active implementation is
        // GetScoreDev2 (default - handles multiwords).
        // Alternatives named in the original commented-out code:
        // GetScoreByChurch, GetScoreByCrowell, GetScoreByPeter, GetWc,
        // GetUnigramFreqScore (Halil), GetScoreDev1, GetCorpusFreqScore
        // (baseline original code), and the not used GetAdjustScoreMin /
        // GetAdjustScoreAvg.
        public static double GetScore(string inWord, WordWcMap wordWcMap)
        {
            return(GetScoreDev2(inWord, wordWcMap));
        }
Пример #28
0
        // Get score for single word and multiwords (for split cases):
        // 1). multiword: score = avg. of the GetWordScore of all words
        // 2). single word: score = log(adjust WC) / log(adjust Max. WC)
        // Returns 0.0 when the word list is empty.
        public static double GetAdjustScoreAvg(string inWord, WordWcMap wordWcMap)
        {
            // false: don't use punctuation for determiner
            List <string> words    = TermUtil.ToWordList(inWord, false);
            long          wordNo   = words.Count;
            double        adjMaxWc = GetAdjustedWc(wordWcMap.GetMaxWc());

            // no word: nothing to average
            if (wordNo <= 0)
            {
                return(0.0);
            }
            double sum = 0.0;
            foreach (string word in words)
            {
                sum += GetWordScore(word, adjMaxWc, wordWcMap);
            }
            return(sum / wordNo);
        }
Пример #29
0
        // Get score for single word and multiwords using the minimum:
        // score = the smallest GetWordScore among the words of inWord.
        // Returns 0.0 when no word score below int.MaxValue was found
        // (which includes an empty word list).
        public static double GetAdjustScoreMin(string inWord, WordWcMap wordWcMap)
        {
            // false: don't use punctuation for determiner
            List <string> words    = TermUtil.ToWordList(inWord, false);
            double        adjMaxWc = GetAdjustedWc(wordWcMap.GetMaxWc());
            // int.MaxValue doubles as the "no score seen yet" marker
            double lowest = int.MaxValue;

            foreach (string word in words)
            {
                double wordScore = GetWordScore(word, adjMaxWc, wordWcMap);
                if (wordScore < lowest)
                {
                    lowest = wordScore;
                }
            }
            return((lowest < int.MaxValue) ? lowest : 0.0);
        }
Пример #30
0
        // Print a "Word|Score|Adjust|Wc|Wc/max" header and run Test on a
        // fixed list of sample strings, in the order listed.
        private static void Tests(WordWcMap wordWcMap)
        {
            string[] testStrs =
            {
                // single words
                "the",
                "&eacute;vy",
                "xxxx",
                "spondylitis",
                "spondyl",
                "its",
                "if",
                "you",
                "doctor",
                "Doctor",            // case variant
                "doctor[123]",
                "'s",
                "container",
                "diagnose",
                "deionized",
                "diabetic",
                "diabetics",
                // possessive
                "doctor's",
                "heart's",
                // multiwords
                "if you",
                "the doctor",
                "Not exist",
                // one run-together string, its pieces, and several splits of it
                "brokenribscantsleepatnight",
                "broken",
                "rib",
                "ribs",
                "cant",
                "cants",
                "scant",
                "scants",
                "sleep",
                "leep",
                "lee",
                "pat",
                "at",
                "night",
                "broken ribs cants leep at night",
                "broken ribs cant sleep at night",
                "broken rib scants leep at night",
                "broken rib scants lee pat night",
                "broken rib scant sleep at night",
                // further samples
                "friend share",
                "assistance",
                "baraclude and",
                "xifaxan as",
                "pamphlets",
                "damage",
                "withdrawal",
                "tachycardia",
                "always",
                "itching",
                "philtrum",
                "achalasia",
                "swollen",
                "of course",
                "antenatal",
                "microsomia",
                "migraine",
                "hemorrhage"
            };
            const string ruler = "=================================================";

            Console.WriteLine(ruler);
            Console.WriteLine("Word|Score|Adjust|Wc|Wc/max");
            Console.WriteLine(ruler);
            for (int i = 0; i < testStrs.Length; i++)
            {
                Test(testStrs[i], wordWcMap);
            }
        }