// Picks the merge candidate whose core merge word is ranked first by
// RankByFrequency.GetTopRankStr, or null when there are no candidates.
// tarPos: starts from 0 and does not count space tokens (not read here).
// nonSpaceTokenList: not read by this frequency-based ranking.
private static MergeObj GetTopRankMergeObjByFrequency(HashSet<MergeObj> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    int maxCandNo = cSpellApi.GetCanMaxCandNo();

    // nothing to rank
    if (candidates.Count <= 0)
    {
        return null;
    }

    // index the candidates by their core merge word; when two candidates
    // share the same word, the one enumerated last is kept
    Dictionary<string, MergeObj> lookup = new Dictionary<string, MergeObj>();
    foreach (MergeObj candidate in candidates)
    {
        lookup[candidate.GetCoreMergeWord()] = candidate;
    }
    HashSet<string> words = new HashSet<string>(lookup.Keys);

    // rank the words, then map the winning word back to its MergeObj
    MergeObj winner = null;
    string topWord = RankByFrequency.GetTopRankStr(words, wordWcMap);
    if (topWord != null)
    {
        winner = lookup.GetValueOrNull(topWord);
    }

    // frequency score details (printing is controlled by debugFlag)
    ScoreDetailByMode.PrintFrequencyScore(words, wordWcMap, maxCandNo, debugFlag);
    return winner;
}
// public method
/// <summary>
/// Tries to find a real-word merge correction for the token at
/// <paramref name="tarPos"/>:
/// <ul>
/// <li>detect: the token must have an empty process history and be flagged
///     by RealWordMergeDetector.IsDetect
/// <li>candidates: collected by RealWordMergeCandidates.GetCandidates
/// <li>ranking: delegated to RankRealWordMergeByMode.GetTopRankMergeObj
/// </ul>
/// The raw token string is used for detection (no core-term stripping).
/// </summary>
/// <param name="tarPos"> index of the target token in nonSpaceTokenList </param>
/// <param name="nonSpaceTokenList"> token list without space tokens </param>
/// <param name="cSpellApi"> CSpell Api object; its detect counter is updated
/// when the token is detected </param>
/// <param name="debugFlag"> boolean flag for debug print </param>
/// <returns> the top ranked MergeObj, or null when the token is not detected
/// or the ranking returns null. </returns>
public static MergeObj GetCorrectTerm(int tarPos, List<TokenObj> nonSpaceTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    TokenObj tarTokenObj = nonSpaceTokenList[tarPos];
    string tarWord = tarTokenObj.GetTokenStr();

    // only tokens that have not been processed before are considered;
    // the detector is consulted only when the history is empty
    bool untouched = (tarTokenObj.GetProcHist().Count == 0);
    if (!untouched || !RealWordMergeDetector.IsDetect(tarWord, cSpellApi, debugFlag))
    {
        // null means: no merge
        return null;
    }

    cSpellApi.UpdateDetectNo();
    // TBD, should take care of possessive xxx's here

    // candidates from merge, then rank them to get the corrected term
    HashSet<MergeObj> mergeSet = RealWordMergeCandidates.GetCandidates(tarPos, nonSpaceTokenList, cSpellApi);
    return RankRealWordMergeByMode.GetTopRankMergeObj(mergeSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
}
// cSpell ranking: context score first, frequency score as the fallback.
// Returns null only when both rankings return null.
private static MergeObj GetTopRankMergeObjByCSpell(HashSet<MergeObj> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // context is tried first (for higher accuracy)
    MergeObj byContext = GetTopRankMergeObjByContext(candidates, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
    if (byContext != null)
    {
        return byContext;
    }

    // frequency is used only when context gave nothing (for more recall)
    return GetTopRankMergeObjByFrequency(candidates, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
}
/// <summary>
/// Returns the best ranked merge candidate using the word2Vec context score,
/// provided it passes validation against the original (un-merged) words.
/// </summary>
/// <param name="candidates"> merge candidates to rank </param>
/// <param name="nonSpaceTokenList"> token list without space tokens </param>
/// <param name="word2VecIm"> word2Vec object used to build the context vector </param>
/// <param name="word2VecOm"> word2Vec object used to score a term against the context </param>
/// <param name="word2VecSkipWord"> passed through to the context-vector builder </param>
/// <param name="contextRadius"> passed through to the context-vector builder </param>
/// <param name="rwMergeFactor"> passed through to IsTopCandValid </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> the top ranked MergeObj, or null when there is no candidate,
/// no score, no MergeObj matching the top term, or the top candidate is
/// rejected by IsTopCandValid. </returns>
public static MergeObj GetTopRankMergeObj(HashSet<MergeObj> candidates, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, double rwMergeFactor, bool debugFlag)
{
    if (candidates.Count == 0)
    {
        return null;
    }

    // 1. score list for the candidates; element 0 is treated as the best
    //    (the list is expected to come back sorted from GetCandidateScoreList)
    List<ContextScore> candScoreList = GetCandidateScoreList(candidates, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);
    if (candScoreList.Count == 0)
    {
        return null;
    }

    // 2. top ranked score
    ContextScore topContextScore = candScoreList[0];
    if (topContextScore == null)
    {
        return null;
    }

    // 3. map the top term back to its MergeObj
    //    key: coreMergeWord, value: MergeObj (last one wins on duplicates)
    Dictionary<string, MergeObj> candStrMergeObjMap = new Dictionary<string, MergeObj>();
    foreach (MergeObj mergeObj in candidates)
    {
        candStrMergeObjMap[mergeObj.GetCoreMergeWord()] = mergeObj;
    }
    MergeObj topRankMergeObj = candStrMergeObjMap.GetValueOrNull(topContextScore.GetTerm());
    if (topRankMergeObj == null)
    {
        // no candidate matches the top term: nothing to correct
        // (previously this case was dereferenced and threw)
        return null;
    }

    // 4. compare the top merge with the original words before the merge
    // 4.1 context vector around the original words
    int tarPos = topRankMergeObj.GetStartPos();
    // tarSize is the total token No of the orgMergeWords
    int tarSize = topRankMergeObj.GetEndPos() - tarPos + 1;
    DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

    // 4.2 context score of the original words before the merge
    string orgMergeWord = topRankMergeObj.GetOrgMergeWord();
    ContextScore orgContextScore = new ContextScore(orgMergeWord, contextVec, word2VecOm);

    // 4.3 keep the merge only if it validates against the original
    if (!IsTopCandValid(orgContextScore, topContextScore, rwMergeFactor, debugFlag))
    {
        return null;
    }
    return topRankMergeObj;
}
// Ranks the merge candidates by context score.
// Reads the word2Vec objects, the non-word merge context radius and the
// skip-word flag from cSpellApi, then delegates to
// RankNonWordMergeByContext.GetTopRankMergeObj and returns its result.
// tarPos is part of the signature but is not used by this ranking.
private static MergeObj GetTopRankMergeObjByContext(HashSet<MergeObj> candidates, CSpellApi cSpellApi, int tarPos, List<TokenObj> nonSpaceTokenList, bool debugFlag)
{
    Word2Vec word2VecIm = cSpellApi.GetWord2VecIm();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    int contextRadius = cSpellApi.GetNwMergeContextRadius();
    bool word2VecSkipWord = cSpellApi.GetWord2VecSkipWord();

    return RankNonWordMergeByContext.GetTopRankMergeObj(candidates, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);
}
// public method
/// <summary>
/// Real-word merge step: walks the token list, collects a merge correction
/// for each legit token that has one, then applies all collected merges to
/// the whole list in a single pass.
/// </summary>
/// <param name="inTokenList"> token list, including space tokens; its index
/// positions are updated before processing </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> the token list returned by MergeCorrector.CorrectTokenListByMerge </returns>
public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    DebugPrint.PrintProcess("5. RealWord-Merge", debugFlag);
    DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

    // pre-process: refresh positions, then build the list without space tokens
    TextObj.UpdateIndexPos(inTokenList);
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);

    // detection and correction, token by token
    int maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();
    List<MergeObj> mergeObjList = new List<MergeObj>();
    int index = 0;
    while (index < inTokenList.Count)
    {
        TokenObj curTokenObj = inTokenList[index];
        // default: advance by one (non-legit token, or no merge correction)
        int nextIndex = index + 1;

        // SCR-3, use legit token
        if (curTokenObj.IsLegitToken(maxLegitTokenLength))
        {
            // correct term is the highest ranked candidate
            MergeObj mergeObj = RealWordMergeCorrector.GetCorrectTerm(curTokenObj.GetPos(), nonSpaceTokenList, cSpellApi, debugFlag);
            if (mergeObj != null)
            {
                mergeObjList.Add(mergeObj);
                // resume right after the merge's end token so that
                // merges never overlap
                nextIndex = mergeObj.GetEndIndex() + 1;
            }
        }
        index = nextIndex;
    }

    // the merges are applied after the loop because a merge may involve
    // tokens before the current one; operation info is updated as well
    return MergeCorrector.CorrectTokenListByMerge(inTokenList, mergeObjList, TokenObj.HIST_RW_M, debugFlag, cSpellApi);
}
/// <summary>
/// Returns the best ranked merge candidate using the word2Vec context score.
/// </summary>
/// <param name="candidates"> merge candidates to rank </param>
/// <param name="nonSpaceTokenList"> token list without space tokens </param>
/// <param name="word2VecIm"> passed through to GetCandidateScoreList </param>
/// <param name="word2VecOm"> passed through to GetCandidateScoreList </param>
/// <param name="word2VecSkipWord"> passed through to GetCandidateScoreList </param>
/// <param name="contextRadius"> passed through to GetCandidateScoreList </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> the MergeObj of the top ranked term; null when there is no
/// candidate, no score, or (with several scores) the top score is exactly
/// 0.0. </returns>
public static MergeObj GetTopRankMergeObj(HashSet<MergeObj> candidates, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
{
    if (candidates.Count == 0)
    {
        return null;
    }

    // 1. score list for the candidates; element 0 is treated as the best
    //    (the list is expected to come back sorted from GetCandidateScoreList)
    List<ContextScore> candScoreList = GetCandidateScoreList(candidates, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);

    // 2. pick the top term
    string topRankStr = null;
    if (candScoreList.Count == 1)
    {
        // only 1 candidate: use it regardless of its score
        topRankStr = candScoreList[0].GetTerm();
    }
    else if (candScoreList.Count > 1)
    {
        // multiple candidates: the top score may be + or -, but not 0.0.
        // A 0.0 top score means the top candidate is not in w2v, so it is
        // unknown whether it is better than a negative score: no update.
        // The exact comparison with 0.0d is intentional.
        if (candScoreList[0].GetScore() != 0.0d)
        {
            topRankStr = candScoreList[0].GetTerm();
        }
    }
    if (topRankStr == null)
    {
        return null;
    }

    // 3. map the top term back to its MergeObj
    //    key: coreMergeWord, value: MergeObj (last one wins on duplicates)
    Dictionary<string, MergeObj> candStrMergeObjMap = new Dictionary<string, MergeObj>();
    foreach (MergeObj mergeObj in candidates)
    {
        candStrMergeObjMap[mergeObj.GetCoreMergeWord()] = mergeObj;
    }
    return candStrMergeObjMap.GetValueOrNull(topRankStr);
}
// Cleans up a list of merges by comparing each merge with the one that
// follows it:
// 1. contain: the next merge has the same start and a later end
//    -> the current (shorter) merge is dropped
// 2. overlap: the next merge starts strictly inside the current one
//    -> the current merge is kept and the next one is flagged to be dropped
// 3. otherwise: the current merge is dropped if it was flagged (the flag is
//    then cleared), else it is kept
// The last merge is kept unless it was flagged.
// NOTE: the flag is only honored in case 3 and for the last merge; in
// cases 1 and 2 it is neither read nor cleared.
// This is a quick fix for window = 2. The permanent fix should be a
// real-time update on each merge.
private static List<MergeObj> CleanUpMergeObjList(List<MergeObj> mergeObjList)
{
    List<MergeObj> kept = new List<MergeObj>();
    int lastIndex = mergeObjList.Count - 1;
    bool dropPending = false;

    // every merge that has a successor
    for (int i = 0; i < lastIndex; i++)
    {
        MergeObj current = mergeObjList[i];
        MergeObj following = mergeObjList[i + 1];
        int curStart = current.GetStartPos();
        int nextStart = following.GetStartPos();
        int curEnd = current.GetEndPos();
        int nextEnd = following.GetEndPos();

        // case 1: following contains current
        bool containedByNext = (curStart == nextStart) && (curEnd < nextEnd);
        if (containedByNext)
        {
            continue;
        }

        // case 2: following overlaps current
        bool nextStartsInside = (nextStart > curStart) && (nextStart < curEnd);
        if (nextStartsInside)
        {
            kept.Add(current);
            dropPending = true;
        }
        // case 3
        else if (dropPending)
        {
            dropPending = false;
        }
        else
        {
            kept.Add(current);
        }
    }

    // the last merge has no successor to compare with
    if (lastIndex >= 0 && !dropPending)
    {
        kept.Add(mergeObjList[lastIndex]);
    }
    return kept;
}
// public method
/// <summary>
/// Returns the top ranked merge candidate for the current ranking mode.
/// The mode in use is the combined one (GetTopRankMergeObjByCSpell);
/// frequency-only and context-only rankings are the alternatives noted in
/// the original code.
/// </summary>
/// <param name="candidates"> merge candidates to rank </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="tarPos"> position of the target token </param>
/// <param name="nonSpaceTokenList"> token list without space tokens </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> whatever GetTopRankMergeObjByCSpell returns </returns>
public static MergeObj GetTopRankMergeObj(HashSet<MergeObj> candidates, CSpellApi cSpellApi, int tarPos, List<TokenObj> nonSpaceTokenList, bool debugFlag)
{
    // use combination
    return GetTopRankMergeObjByCSpell(candidates, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);
}
// public method
/// <summary>
/// Returns the top ranked merge candidate for the current ranking mode.
/// The mode in use is the context (Word2Vec) score, the only one tested so
/// far; frequency-only and combined (cSpell) rankings are the alternatives
/// noted in the original code.
/// </summary>
/// <param name="candidates"> merge candidates to rank </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <param name="tarPos"> position of the target token </param>
/// <param name="nonSpaceTokenList"> token list without space tokens </param>
/// <returns> whatever GetTopRankMergeObjByContext returns </returns>
public static MergeObj GetTopRankMergeObj(HashSet<MergeObj> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // use context score for merge
    return GetTopRankMergeObjByContext(candidates, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
}
// Manual test driver: runs NonWordMergeCorrector.GetCorrectTerm on the first
// token of a fixed sample text and prints the input and the result.
private static void TestGetCorrectTerm(CSpellApi cSpellApi)
{
    // init
    string inText = "Dur ing my absent.";
    bool debugFlag = false;
    List<TokenObj> inTokenList = TextObj.TextToTokenList(inText);

    // 1. convert to the non-empty token list
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);

    // result
    int tarPos = 0;
    MergeObj mergeObj = NonWordMergeCorrector.GetCorrectTerm(tarPos, nonSpaceTokenList, cSpellApi, debugFlag);

    // GetCorrectTerm returns null when no merge is suggested, so the result
    // must not be dereferenced blindly
    string mergeStr = (mergeObj == null) ? "null" : mergeObj.ToString();

    // print out
    Console.WriteLine("--------- GetCorrectTerm( ) -----------");
    Console.WriteLine("In: [" + inText + "]");
    Console.WriteLine("In nonSpaceTokenList: [" + nonSpaceTokenList.Count + "]");
    Console.WriteLine("Out MergeObj: [" + mergeStr + "]");
}
// public method
/// <summary>
/// Tries to find a non-word merge correction for the token at
/// <paramref name="tarPos"/>:
/// <ul>
/// <li>build the core term: strip ending punctuation/space, lowercase
/// <li>detect: NonWordMergeDetector.IsDetect on the token and its core term
/// <li>candidates: collected by NonWordMergeCandidates.GetCandidates
/// <li>ranking: delegated to RankNonWordMergeByMode.GetTopRankMergeObj
/// </ul>
/// </summary>
/// <param name="tarPos"> position of target token in nonSpaceTokenList </param>
/// <param name="nonSpaceTokenList"> token list without space token(s) </param>
/// <param name="cSpellApi"> CSpell Api object; its detect counter is updated
/// when the token is detected </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> the top ranked MergeObj, or null when the token is not detected
/// or the ranking returns null. </returns>
public static MergeObj GetCorrectTerm(int tarPos, List<TokenObj> nonSpaceTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    // get tarWord from tarTokenObj
    TokenObj tarTokenObj = nonSpaceTokenList[tarPos];
    string tarWord = tarTokenObj.GetTokenStr();

    // 1. only remove ending punctuation for coreTerm.
    // Invariant lowercasing: the core term is used for lookup, so it must
    // not depend on the current culture (e.g. Turkish 'I' -> dotless 'i').
    string coreStr = TermUtil.StripEndPuncSpace(tarWord).ToLowerInvariant();

    // 2. non-word detection on tarWord and its core term
    if (!NonWordMergeDetector.IsDetect(tarWord, coreStr, cSpellApi, debugFlag))
    {
        // null means: no merge
        return null;
    }
    cSpellApi.UpdateDetectNo();

    // 3. get candidates from merge
    HashSet<MergeObj> mergeSet = NonWordMergeCandidates.GetCandidates(tarPos, nonSpaceTokenList, cSpellApi);

    // 4. ranking: the top ranked candidate is the corrected term
    return RankNonWordMergeByMode.GetTopRankMergeObj(mergeSet, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);
}
// Returns the candidate with the highest context score, considering only
// scores strictly greater than 0.0; null when no candidate qualifies.
// On equal scores the candidate enumerated first is kept.
// (Per the original note, this method is replaced by GetTopRankStr, which
// sorts with a comparator.)
public static MergeObj GetTopRankMergeObjByScore(HashSet<MergeObj> candidates, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
{
    MergeObj best = null;
    double bestScore = 0.0d;

    foreach (MergeObj candidate in candidates)
    {
        // context vector around the token span covered by this merge
        int spanStart = candidate.GetStartPos();
        int spanSize = candidate.GetEndPos() - candidate.GetStartPos() + 1;
        DoubleVec contextVec = Word2VecContext.GetContextVec(spanStart, spanSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

        // context score of the merged word, computed with word2VecOm
        string mergedWord = candidate.GetCoreMergeWord();
        double candidateScore = new ContextScore(mergedWord, contextVec, word2VecOm).GetScore();

        // a candidate wins only with a strictly better score
        if (candidateScore > bestScore)
        {
            bestScore = candidateScore;
            best = candidate;
        }
    }
    return best;
}
// private methods
// Interactive test driver (this test is not verified).
// Reads lines from standard input until end of input; for each line it
// prints the target word(s), the number of merge candidates and the top
// ranked merge, plus the scored suggestion list when detailFlag is set.
// detailFlag: print the suggestion list; also passed on as the debug flag
// tarPos: position of the first target token, space tokens not counted
// tarSize: number of consecutive target tokens
// contextRadius: passed through to the ranking
// limitNo: max number of suggestions printed
// Returns 0 on normal end of input, -1 when an exception was caught.
private static int RunTest(bool detailFlag, int tarPos, int tarSize, int contextRadius, long limitNo)
{
    // init dic
    string configFile = "../data/Config/cSpell.properties";
    CSpellApi cSpellApi = new CSpellApi(configFile);
    cSpellApi.SetRankMode(CSpellApi.RANK_MODE_CONTEXT);
    Word2Vec word2VecIm = cSpellApi.GetWord2VecIm();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    bool word2VecSkipWord = cSpellApi.GetWord2VecSkipWord();
    ContextScoreComparator<ContextScore> csc = new ContextScoreComparator<ContextScore>();

    // provide cmdLine interface
    int returnValue = 0;
    try
    {
        StreamReader stdInput = new StreamReader(Console.OpenStandardInput());
        try
        {
            string inText = null;
            Console.WriteLine("- Please input a text, only a spell error allowed (type \"Ctl-d\" to quit) > ");
            while ((inText = stdInput.ReadLine()) != null)
            {
                // ---------------------------------
                // Get spell correction on the input
                // ---------------------------------
                // convert input text to TokenObj
                TextObj textObj = new TextObj(inText);
                List<TokenObj> inTextList = textObj.GetTokenList();
                // remove space token from the list
                List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);

                // *2 because the token list includes the space tokens
                // (assumes words and single space tokens alternate)
                string tarWord = inTextList[tarPos * 2].GetTokenStr();
                for (int i = 1; i < tarSize; i++)
                {
                    // i-th token after the target start; the offset must
                    // advance with i, otherwise the same token is appended
                    // on every iteration
                    int ii = (tarPos + i) * 2;
                    tarWord += " " + inTextList[ii].GetTokenStr();
                }
                Console.WriteLine("- input text: [" + inText + "]");
                Console.WriteLine("- target: [" + tarPos + "|" + tarSize + "|" + tarWord + "]");
                Console.WriteLine("- context radius: " + contextRadius);

                // get all possible candidates
                HashSet<MergeObj> candSet = NonWordMergeCandidates.GetCandidates(tarPos, nonSpaceTokenList, cSpellApi);
                Console.WriteLine("-- canSet.size(): " + candSet.Count);

                // get final suggestion; it is null when nothing is
                // suggested, which must not abort the interactive loop
                MergeObj topRankMergeObj = GetTopRankMergeObj(candSet, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, detailFlag);
                string topRankStr = (topRankMergeObj == null) ? "null" : topRankMergeObj.ToString();
                Console.WriteLine("- top rank merge Obj: " + topRankStr);

                // print details
                if (detailFlag)
                {
                    HashSet<ContextScore> candScoreSet = GetCandidateScoreSet(candSet, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, detailFlag);
                    Console.WriteLine("------ Suggestion List ------");
                    var list = candScoreSet.OrderBy(x => x, csc).Take((int)limitNo).Select(obj => obj.ToString()).ToList();
                    foreach (var item in list)
                    {
                        Console.WriteLine(item);
                    }
                }

                // print the prompt
                Console.WriteLine("- Please input a text, only a spell error allowed (type \"Ctl-d\" to quit) > ");
            }
        }
        catch (Exception e2)
        {
            Console.Error.WriteLine(e2.Message);
            returnValue = -1;
        }
    }
    catch (Exception e)
    {
        Console.Error.WriteLine(e.Message);
        returnValue = -1;
    }
    return returnValue;
}