// Build a ContextScore for every candidate string and return them as a list
// ordered by ContextScoreComparator (intended order: higher score first).
// All arguments are forwarded unchanged to GetCandidateScoreSet.
public static List<ContextScore> GetCandidateScoreList(HashSet<string> candidates, int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
{
    // one score object per candidate
    HashSet<ContextScore> scoreSet = GetCandidateScoreSet(
        candidates, tarPos, tarSize, nonSpaceTokenList,
        word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);

    // copy the set into a list, then sort the list in place
    List<ContextScore> rankedScores = new List<ContextScore>(scoreSet);
    rankedScores.Sort(new ContextScoreComparator<ContextScore>());

    return rankedScores;
}
// TBD, this file should be deleted by moving each method to
// the associated ranking class
// public method
//
// Debug helper: when debugFlag is set, score all candidates via
// RankByContext.GetCandidateScoreSet, order them with ContextScoreComparator
// and print the string form of at most maxCandNo of them through
// DebugPrint.PrintCScore. Does nothing when debugFlag is false.
public static void PrintContextScore(HashSet<string> candSet, int tarPos, int tarSize, List<TokenObj> inTextList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, int maxCandNo, bool debugFlag)
{
    // nothing to print unless debugging
    if (!debugFlag)
    {
        return;
    }

    ContextScoreComparator<ContextScore> comparator = new ContextScoreComparator<ContextScore>();
    HashSet<ContextScore> scoreSet = RankByContext.GetCandidateScoreSet(
        candSet, tarPos, tarSize, inTextList,
        word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);

    // render the first maxCandNo scores (in comparator order) before printing any
    List<string> renderedScores = scoreSet
        .OrderBy(score => score, comparator)
        .Take(maxCandNo)
        .Select(score => score.ToString())
        .ToList();

    foreach (string rendered in renderedScores)
    {
        DebugPrint.PrintCScore(rendered, debugFlag);
    }
}
// Build a ContextScore for every merge candidate and return them as a list
// ordered by ContextScoreComparator (intended order: higher score first).
// Each score in the sorted list is also passed to DebugPrint.PrintCScore
// together with debugFlag.
public static List<ContextScore> GetCandidateScoreList(HashSet<MergeObj> candidates, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
{
    // one score object per candidate
    HashSet<ContextScore> scoreSet = GetCandidateScoreSet(
        candidates, nonSpaceTokenList,
        word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);

    // copy the set into a list, then sort the list in place
    List<ContextScore> rankedScores = new List<ContextScore>(scoreSet);
    rankedScores.Sort(new ContextScoreComparator<ContextScore>());

    // print detail, in ranked order
    foreach (ContextScore ranked in rankedScores)
    {
        DebugPrint.PrintCScore(ranked.ToString(), debugFlag);
    }

    return rankedScores;
}
// Compare two CSpellScore objects by a cascade of criteria: O, N, F, C.
// The first criterion whose scores differ decides the result, delegated to
// that score type's own comparator:
//   1. orthographic score
//   2. noisy channel score
//   3. frequency score
//   4. context score
//   5. candidate string, compared as cand2.CompareTo(cand1)
private int compareByCombo(CSpellScore o1, CSpellScore o2)
{
    // fetch every component score up front
    OrthographicScore ortho1 = o1.GetOScore();
    OrthographicScore ortho2 = o2.GetOScore();
    NoisyChannelScore noisy1 = o1.GetNScore();
    NoisyChannelScore noisy2 = o2.GetNScore();
    FrequencyScore freq1 = o1.GetFScore();
    FrequencyScore freq2 = o2.GetFScore();
    ContextScore context1 = o1.GetCScore();
    ContextScore context2 = o2.GetCScore();

    // 1. orthographic score
    if (ortho1.GetScore() != ortho2.GetScore())
    {
        return new OrthographicScoreComparator<OrthographicScore>().Compare(ortho1, ortho2);
    }

    // 2. noisy channel score
    if (noisy1.GetScore() != noisy2.GetScore())
    {
        return new NoisyChannelScoreComparator<NoisyChannelScore>().Compare(noisy1, noisy2);
    }

    // 3. pure frequency score
    if (freq1.GetScore() != freq2.GetScore())
    {
        return new FrequencyScoreComparator<FrequencyScore>().Compare(freq1, freq2);
    }

    // 4. context score
    if (context1.GetScore() != context2.GetScore())
    {
        return new ContextScoreComparator<ContextScore>().Compare(context1, context2);
    }

    // 5. all scores tie: fall back to the candidate strings (second vs. first)
    string firstCand = o1.GetCandStr();
    string secondCand = o2.GetCandStr();
    return secondCand.CompareTo(firstCand);
}
// Compare two CSpellScore objects by context score only.
// If the context scores differ, ContextScoreComparator decides; otherwise
// the candidate strings break the tie, compared as cand2.CompareTo(cand1).
private int compareByContext(CSpellScore o1, CSpellScore o2)
{
    ContextScore context1 = o1.GetCScore();
    ContextScore context2 = o2.GetCScore();

    // 1. context score
    if (context1.GetScore() != context2.GetScore())
    {
        return new ContextScoreComparator<ContextScore>().Compare(context1, context2);
    }

    // 2. tie: fall back to the candidate strings (second vs. first)
    string firstCand = o1.GetCandStr();
    string secondCand = o2.GetCandStr();
    return secondCand.CompareTo(firstCand);
}
// Return the best ranked string among the candidates using word2Vec context
// scores, or the original inStr when there are no candidates or when
// IsTopCandValid rejects the top candidate.
// nonSpaceTokenList is the token list used for the context (name suggests
// space tokens have been removed - confirm with callers).
// NOTE(review): shortSplitWordLength and maxShortSplitWordNo are accepted
// but not used in this method body.
public static string GetTopRankStr(string inStr, HashSet<string> candidates, int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, int shortSplitWordLength, int maxShortSplitWordNo, double rwSplitFactor, int maxCandNo, bool debugFlag)
{
    // no candidates: keep the original string
    if (candidates.Count == 0)
    {
        return inStr;
    }

    // 1. sorted score list for all candidates
    // This ranking could be improved if an n-gram (frequency) model is used
    List<ContextScore> rankedScores = RankByContext.GetCandidateScoreList(
        candidates, tarPos, tarSize, nonSpaceTokenList,
        word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);

    // 1.1 the top ranked candidate is the first element
    ContextScore bestScore = rankedScores[0];

    // 2. validate the top rank
    // 2.1 word vector for the context around the target
    DoubleVec contextVec = Word2VecContext.GetContextVec(
        tarPos, tarSize, nonSpaceTokenList, word2VecIm,
        contextRadius, word2VecSkipWord, debugFlag);

    // 2.2 context score of the original string (before split)
    ContextScore originalScore = new ContextScore(inStr, contextVec, word2VecOm);

    // 2.3 use the top candidate only if it passes validation against the
    // original string; otherwise no correction is made
    string topRankStr = inStr;
    if (IsTopCandValid(inStr, originalScore, bestScore, rwSplitFactor, debugFlag))
    {
        topRankStr = bestScore.GetTerm();
    }

    // debug print
    if (debugFlag)
    {
        // focus token (original)
        DebugPrint.PrintCScore(originalScore.ToString(), debugFlag);

        // at most maxCandNo candidates, in comparator order
        ContextScoreComparator<ContextScore> comparator = new ContextScoreComparator<ContextScore>();
        List<string> renderedScores = rankedScores
            .OrderBy(score => score, comparator)
            .Take(maxCandNo)
            .Select(score => score.ToString())
            .ToList();
        foreach (string rendered in renderedScores)
        {
            DebugPrint.PrintCScore(rendered, debugFlag);
        }
    }

    return topRankStr;
}
// private methods
// this test is not verified
//
// Interactive command-line driver: reads lines from standard input until
// end of input, and for each line ranks the spelling candidates of the
// target word by context and prints the top ranked string.
//   detailFlag    - passed on as the debug flag; also prints the suggestion list
//   tarPos        - index of the target's first token, counted over non-space tokens
//   tarSize       - number of non-space tokens that make up the target
//   contextRadius - context radius forwarded to the ranking calls
//   limitNo       - max number of suggestions printed when detailFlag is set
// Returns 0 on success, -1 if any exception was caught (its message is
// written to standard error).
private static int RunTest(bool detailFlag, int tarPos, int tarSize, int contextRadius, long limitNo)
{
    // init dic
    string configFile = "../data/Config/cSpell.properties";
    CSpellApi cSpellApi = new CSpellApi(configFile);
    cSpellApi.SetRankMode(CSpellApi.RANK_MODE_CONTEXT);
    Word2Vec word2VecIm = cSpellApi.GetWord2VecIm();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    bool word2VecSkipWord = cSpellApi.GetWord2VecSkipWord();
    ContextScoreComparator<ContextScore> csc = new ContextScoreComparator<ContextScore>();
    string prompt = "- Please input a text, only a spell error allowed (type \"Ctl-d\" to quit) > ";

    // provide cmdLine interface
    int returnValue = 0;
    try
    {
        // dispose the reader when done instead of leaking it
        using (StreamReader stdInput = new StreamReader(Console.OpenStandardInput()))
        {
            Console.WriteLine(prompt);
            string inText;
            while ((inText = stdInput.ReadLine()) != null)
            {
                // ---------------------------------
                // Get spell correction on the input
                // ---------------------------------
                // convert input text to TokenObj
                TextObj textObj = new TextObj(inText);
                List<TokenObj> inTextList = textObj.GetTokenList();

                // *2 because tokenList includes space tokens
                string tarWord = inTextList[tarPos * 2].GetTokenStr();
                for (int i = 1; i < tarSize; i++)
                {
                    // i-th following non-space token (was always tarPos + 1,
                    // which repeated the same token for tarSize > 2)
                    int ii = (tarPos + i) * 2;
                    tarWord += " " + inTextList[ii].GetTokenStr();
                }
                Console.WriteLine("- input text: [" + inText + "]");
                Console.WriteLine("- target: [" + tarPos + "|" + tarSize + "|" + tarWord + "]");
                Console.WriteLine("- context radius: " + contextRadius);

                // get all possible candidates
                HashSet<string> candSet = NonWord1To1Candidates.GetCandidates(tarWord, cSpellApi);
                candSet.Add(tarWord); // add the original word
                Console.WriteLine("-- canSet.size(): " + candSet.Count);

                // get final suggestion
                // remove space token from the list
                List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);
                string topRankStr = GetTopRankStr(tarWord, candSet, tarPos, tarSize, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, detailFlag);
                Console.WriteLine("- top rank str: " + topRankStr);

                // print details
                if (detailFlag)
                {
                    // tarPos indexes non-space tokens, so score against the
                    // same non-space list that was used for the ranking above
                    HashSet<ContextScore> candScoreSet = GetCandidateScoreSet(candSet, tarPos, tarSize, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, detailFlag);
                    Console.WriteLine("------ Suggestion List ------");
                    var list = candScoreSet.OrderBy(x => x, csc).Take((int)limitNo).Select(obj => obj.ToString());
                    foreach (var item in list)
                    {
                        Console.WriteLine(item);
                    }
                }

                // print the prompt
                Console.WriteLine(prompt);
            }
        }
    }
    catch (Exception e)
    {
        // any failure (reader setup or per-line processing) ends the session
        Console.Error.WriteLine(e.Message);
        returnValue = -1;
    }
    return returnValue;
}