/// <summary>
/// Scores a term that may be a multiword. The term is split into words,
/// every word is scored with GetScoreDev1, and the mean of those scores
/// is returned.
/// </summary>
/// <param name="inWord"> term to score (single word or multiword) </param>
/// <param name="wordWcMap"> word-count map handed to GetScoreDev1 </param>
/// <returns> the mean per-word score, or 0.0 when the summed score is not
/// greater than zero. </returns>
private static double GetScoreDev2(string inWord, WordWcMap wordWcMap)
{
    // split with normalization off (original note: don't use punctuation
    // for determiner)
    const bool normalize = false;
    List<string> words = TermUtil.ToWordList(inWord, normalize);

    // accumulate the per-word scores
    double sum = 0.0;
    for (int i = 0; i < words.Count; i++)
    {
        sum += GetScoreDev1(words[i], wordWcMap);
    }

    // only a positive sum is averaged; anything else scores 0.0
    return (sum > 0.0) ? (sum / words.Count) : 0.0;
}
/// <summary>
/// Verifies a split candidate: every word of the term must pass
/// IsValidSplitWord for the split to be accepted.
/// </summary>
/// <remarks>
/// Digit and unit checks were deliberately removed from this test. Per the
/// original notes they are handled elsewhere (ND), and some units are also
/// abbreviations (e.g. "ng"), which caused false positives such as
/// "seing" => "se i ng".
/// </remarks>
/// <param name="inTerm"> the split term to verify (a coreTerm) </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <returns> true if no word of the term fails IsValidSplitWord;
/// false as soon as one does. </returns>
public static bool IsValidSplitWords(string inTerm, CSpellApi cSpellApi)
{
    foreach (string candidate in TermUtil.ToWordList(inTerm))
    {
        // one invalid split word rejects the whole split
        if (!IsValidSplitWord(candidate, cSpellApi))
        {
            return false;
        }
    }
    return true;
}
/// <summary>
/// Heuristic rule for real-word split candidates: counts the short words
/// in the candidate and rejects it when there are too many of them.
/// </summary>
/// <remarks>
/// A word is "short" when its length is at most the configured short split
/// word length. The candidate passes only while the number of short words
/// stays below the configured maximum. Examples from the original notes:
/// <list type="bullet">
/// <item>filter out: some -> so me</item>
/// <item>filter out: another -> a not her (3 short words)</item>
/// <item>filter out: anyone -> any one, soon -> so on, anyway -> any way (2 short words)</item>
/// <item>keep: away -> a way (1 short word)</item>
/// </list>
/// </remarks>
/// <param name="inTerm"> the split candidate </param>
/// <param name="cSpellApi"> CSpell Api object supplying both thresholds </param>
/// <returns> true when the count of short words is below the maximum. </returns>
private static bool CheckShortSplitWords(string inTerm, CSpellApi cSpellApi)
{
    // thresholds from the configuration
    int shortLength = cSpellApi.GetCanRwShortSplitWordLength();
    int maxShortCount = cSpellApi.GetCanRwMaxShortSplitWordNo();

    // count the words that are short enough to be suspicious
    List<string> words = TermUtil.ToWordList(inTerm);
    int shortCount = 0;
    for (int i = 0; i < words.Count; i++)
    {
        if (words[i].Length <= shortLength)
        {
            shortCount++;
        }
    }

    // reaching the maximum rejects the candidate
    return shortCount < maxShortCount;
}
/// <summary>
/// Adds a corrected token to the output list, choosing one of three
/// operations from the token string:
/// <list type="number">
/// <item>merge (delete): a null or empty string adds nothing;</item>
/// <item>1-to-1: a single word is passed to Add1To1Correction;</item>
/// <item>split: a multiword is passed to AddSplitCorrection.</item>
/// </list>
/// </summary>
/// <param name="inList"> the token list receiving the result </param>
/// <param name="inToken"> the corrected token to add </param>
public static void AddSplit1To1Correction(List<TokenObj> inList, TokenObj inToken)
{
    string text = inToken.GetTokenStr();

    // 1. an empty token contributes nothing to the list
    if (string.IsNullOrEmpty(text))
    {
        return;
    }

    if (TermUtil.IsMultiword(text))
    {
        // 3. the string is a multiword: split into several tokens
        AddSplitCorrection(inList, inToken);
    }
    else
    {
        // 2. single word: 1-to-1 correction
        Add1To1Correction(inList, inToken);
    }
}
/// <summary>
/// Compares two NoisyChannelScore objects. Criteria are applied in order
/// and the first one that differs decides:
/// <list type="number">
/// <item>word count of the candidate (fewer words ranks first, i.e.
/// "whatever" is preferred over "what ever");</item>
/// <item>noisy channel score (higher ranks first);</item>
/// <item>orthographic score, via OrthographicScoreComparator;</item>
/// <item>frequency score, via FrequencyScoreComparator;</item>
/// <item>candidate strings, compared as cand2.CompareTo(cand1).</item>
/// </list>
/// </summary>
/// <param name="o1"> first object to be compared </param>
/// <param name="o2"> second object to be compared </param>
/// <returns> a negative integer, 0, or positive integer to represent the
/// object o1 is less, equals, or greater than object o2. </returns>
public virtual int Compare(NoisyChannelScore o1, NoisyChannelScore o2)
{
    // 1. fewer words wins
    string candA = o1.GetCandStr();
    string candB = o2.GetCandStr();
    int wordCountA = TermUtil.GetWordNo(candA);
    int wordCountB = TermUtil.GetWordNo(candB);
    if (wordCountA != wordCountB)
    {
        return wordCountA - wordCountB;
    }

    // 2. noisy channel score, from high to low
    // SCR-2: fixed values (+1/-1) are used rather than a difference so a
    // small gap can never collapse to 0
    double channelA = o1.GetScore();
    double channelB = o2.GetScore();
    if (channelB > channelA)
    {
        return 1;
    }
    if (channelB < channelA)
    {
        return -1;
    }

    // 3. orthographic score
    OrthographicScore orthoA = o1.GetOScore();
    OrthographicScore orthoB = o2.GetOScore();
    if (orthoA.GetScore() != orthoB.GetScore())
    {
        OrthographicScoreComparator<OrthographicScore> orthoComparator
            = new OrthographicScoreComparator<OrthographicScore>();
        return orthoComparator.Compare(orthoA, orthoB);
    }

    // 4. frequency score
    FrequencyScore freqA = o1.GetFScore();
    FrequencyScore freqB = o2.GetFScore();
    if (freqA.GetScore() != freqB.GetScore())
    {
        FrequencyScoreComparator<FrequencyScore> freqComparator
            = new FrequencyScoreComparator<FrequencyScore>();
        return freqComparator.Compare(freqA, freqB);
    }

    // 5. last resort: alphabetic order of the candidates
    return candB.CompareTo(candA);
}
/// <summary>
/// Builds the word vector of a term: the term is split into words and the
/// list is handed to GetAvgWordVecForList, so a multiword gets the
/// averaged vector of its words.
/// </summary>
/// <param name="inTerm"> the term (single word or multiword) </param>
/// <param name="w2vOm"> word2Vec output matrix used for the lookup </param>
/// <returns> the vector produced by GetAvgWordVecForList. </returns>
private static DoubleVec GetWordVecForTerm(string inTerm, Word2Vec w2vOm)
{
    // TBD: take care of possessive
    List<string> words = TermUtil.ToWordList(inTerm);
    return GetAvgWordVecForList(words, w2vOm);
}
// private method
/// <summary>
/// Test driver for merge and split: prints the context scores of the merged
/// word, of the split words as one term, and the average score of the
/// split words taken one by one. Each score is shown for two contexts:
/// a window of the given radius, and the whole input text.
/// </summary>
/// <param name="inText"> the input text </param>
/// <param name="tarPos"> position of the target in the non-space token list </param>
/// <param name="tarSize"> number of tokens covered by the target </param>
/// <param name="radius"> context window radius </param>
/// <param name="mergedWord"> the merged form to score </param>
/// <param name="splitWords"> the split form to score </param>
/// <param name="w2vIm"> word2Vec input matrix (used for the context vectors) </param>
/// <param name="w2vOm"> word2Vec output matrix (used for the scores) </param>
private static void Test(string inText, int tarPos, int tarSize, int radius, string mergedWord, string splitWords, Word2Vec w2vIm, Word2Vec w2vOm)
{
    // 0. tokenize the text and drop the space tokens
    TextObj parsedText = new TextObj(inText);
    List<TokenObj> allTokens = parsedText.GetTokenList();
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(allTokens);
    Console.WriteLine("==========================================");
    Console.WriteLine("-- inTextList: [" + inText + "]");

    const bool word2VecSkipWord = true;
    const bool debugFlag = false;

    // 1.a context limited to the window radius
    DoubleVec windowContext = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, w2vIm, radius, word2VecSkipWord, debugFlag);
    // 1.b context built from the whole text
    DoubleVec fullContext = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, w2vIm, word2VecSkipWord, debugFlag);

    // 1.c merged word
    ContextScore mergedWindow = new ContextScore(mergedWord, windowContext, w2vOm);
    ContextScore mergedFull = new ContextScore(mergedWord, fullContext, w2vOm);
    string mergedFullText = string.Format("{0,1:F8}", mergedFull.GetScore());
    Console.WriteLine(mergedWindow.ToString() + "|" + mergedFullText);

    // 2. split words scored as one term
    ContextScore splitWindow = new ContextScore(splitWords, windowContext, w2vOm);
    ContextScore splitFull = new ContextScore(splitWords, fullContext, w2vOm);
    string splitFullText = string.Format("{0,1:F8}", splitFull.GetScore());
    Console.WriteLine(splitWindow.ToString() + "|" + splitFullText);

    // 3. average score over the single split words; every word gets its
    //    own context, positioned at tarPos + its offset
    List<string> singleWords = TermUtil.ToWordList(splitWords);
    double windowTotal = 0.0d; // radius
    double fullTotal = 0.0d;   // all inText
    int offset;
    for (offset = 0; offset < singleWords.Count; offset++)
    {
        string single = singleWords[offset];

        // window radius
        DoubleVec singleWindowContext = Word2VecContext.GetContextVec(tarPos + offset, 1, nonSpaceTokenList, w2vIm, radius, word2VecSkipWord, debugFlag);
        ContextScore singleWindow = new ContextScore(single, singleWindowContext, w2vOm);
        windowTotal += singleWindow.GetScore();

        // all text
        DoubleVec singleFullContext = Word2VecContext.GetContextVec(tarPos + offset, 1, nonSpaceTokenList, w2vIm, word2VecSkipWord, debugFlag);
        ContextScore singleFull = new ContextScore(single, singleFullContext, w2vOm);
        fullTotal += singleFull.GetScore();
    }

    // offset now equals the number of split words
    double windowAvg = windowTotal / offset; // window
    double fullAvg = fullTotal / offset;     // all text
    Console.WriteLine("Avg. Single Word|" + string.Format("{0,1:F8}", windowAvg) + "|" + string.Format("{0,1:F8}", fullAvg));
}
/// <summary>
/// Builds a split candidate by replacing every occurrence of the given
/// punctuation character (e.g. a hyphen) with a space, then passing the
/// result through TermUtil.Trim.
/// </summary>
/// <param name="inWord"> the word to split </param>
/// <param name="puncChar"> the punctuation character to turn into a space </param>
/// <returns> the trimmed string with the punctuation replaced by spaces. </returns>
protected internal static string GetSplitByPunc(string inWord, char puncChar)
{
    string spaced = inWord.Replace(puncChar, ' ');
    return TermUtil.Trim(spaced);
}
/// <summary>
/// Heuristic rule for real-word one-to-one correction: every word of the
/// candidate must have a word vector.
/// </summary>
/// <param name="inTerm"> the candidate term </param>
/// <param name="word2VecOm"> word2Vec object queried with HasWordVec </param>
/// <returns> true when HasWordVec succeeds for every word of the term;
/// false at the first word without a vector. </returns>
private static bool Check1To1Words(string inTerm, Word2Vec word2VecOm)
{
    foreach (string candidateWord in TermUtil.ToWordList(inTerm))
    {
        if (!word2VecOm.HasWordVec(candidateWord))
        {
            return false;
        }
    }
    return true;
}
/// <summary>
/// Core split-by-space step: produces every split of the word obtained by
/// inserting exactly one space between two of its characters.
/// </summary>
/// <remarks>
/// The word is lowercased first. A space is never inserted at either end.
/// Results are collected in a set, so duplicates are dropped.
/// </remarks>
/// <param name="inWord"> the word to split </param>
/// <returns> the set of lowercased split strings. </returns>
protected internal static HashSet<string> GetSplitSetBy1Space(string inWord)
{
    string lowered = inWord.ToLower();
    HashSet<string> results = new HashSet<string>();

    // cut points run strictly inside the word: 1 .. Length - 1
    int cut = 1;
    while (cut < lowered.Length)
    {
        string head = lowered.Substring(0, cut);
        string tail = lowered.Substring(cut);

        // StringTrim is applied because the insertion may land next to an
        // existing space (original note: remove multiple spaces, turning
        // "a  b" into "a b")
        results.Add(TermUtil.StringTrim(head + GlobalVars.SPACE_STR + tail));
        cut++;
    }
    return results;
}
/// <summary>
/// Adds a merge candidate to the merge set when its core string qualifies.
/// The core string is the merged word with ending punctuation/space
/// stripped (TermUtil.StripEndPuncSpace). It qualifies when it is a word
/// of the suggestion dictionary and is not a word of the Aa dictionary.
/// </summary>
/// <remarks>
/// Original rationale: merges are assumed not to produce Aa entries because
/// those are short enough already.
/// </remarks>
/// <param name="tarWord"> the target word </param>
/// <param name="orgMergeWord"> the original (unmerged) text </param>
/// <param name="mergeWord"> the merged word </param>
/// <param name="mergeNo"> number of merges </param>
/// <param name="startIndex"> start index of the merge </param>
/// <param name="tarIndex"> index of the target </param>
/// <param name="endIndex"> end index of the merge </param>
/// <param name="startPos"> start position of the merge </param>
/// <param name="tarPos"> position of the target </param>
/// <param name="endPos"> end position of the merge </param>
/// <param name="mergeSet"> the set receiving the new MergeObj </param>
/// <param name="suggestDic"> dictionary the core string must belong to </param>
/// <param name="aADic"> dictionary the core string must not belong to </param>
private static void AddMergeObj(string tarWord, string orgMergeWord, string mergeWord, int mergeNo, int startIndex, int tarIndex, int endIndex, int startPos, int tarPos, int endPos, HashSet<MergeObj> mergeSet, RootDictionary suggestDic, RootDictionary aADic)
{
    // 1. convert the merged word to its coreTerm;
    //    only the ending punctuation is taken care of
    string coreStr = TermUtil.StripEndPuncSpace(mergeWord);

    // 2. keep the candidate only if it is a suggestion word and not an Aa
    if (!suggestDic.IsDicWord(coreStr) || aADic.IsDicWord(coreStr))
    {
        return;
    }

    MergeObj mergeObj = new MergeObj(tarWord, orgMergeWord, mergeWord, coreStr, mergeNo, startIndex, tarIndex, endIndex, startPos, tarPos, endPos);
    mergeSet.Add(mergeObj);
}
/// <summary>
/// Average context score of a term: the GetCwobScore of every word that
/// has a word vector is summed, and the sum is divided by the total number
/// of words (words without a vector still count in the divisor).
/// </summary>
/// <remarks>
/// Marked for deletion in the original notes because it gives the same
/// result as GetScore().
/// </remarks>
/// <param name="inTerm"> the term (single word or multiword) </param>
/// <param name="contextVec"> the context vector </param>
/// <param name="w2vOm"> word2Vec output matrix used to look up word vectors </param>
/// <returns> the average score, or 0.0 when the term yields no words. </returns>
public static double GetScore2(string inTerm, DoubleVec contextVec, Word2Vec w2vOm)
{
    List<string> inWordList = TermUtil.ToWordList(inTerm);
    double total = 0.0d;
    int count = 0;
    foreach (string word in inWordList)
    {
        DoubleVec wordVec = w2vOm.GetWordVec(word);
        if (wordVec != null)
        {
            total += GetCwobScore(wordVec, contextVec);
        }
        // every word counts, with or without a vector
        count++;
    }

    // add the scores first, then average; an empty word list would divide
    // 0.0 by 0 and produce NaN, so it scores 0.0 instead
    return (count > 0) ? (total / count) : 0.0d;
}
/// <summary>
/// Gets the score of a single word or a multiword (split case) as the
/// average of GetWordScore over all of its words.
/// </summary>
/// <remarks>
/// Per the original notes, the score of a single word is
/// log(adjusted WC) / log(adjusted max. WC); that computation lives in
/// GetWordScore, which receives the adjusted maximum word count.
/// </remarks>
/// <param name="inWord"> the term to score </param>
/// <param name="wordWcMap"> word-count map </param>
/// <returns> the average word score, or 0.0 when the term has no words. </returns>
public static double GetAdjustScoreAvg(string inWord, WordWcMap wordWcMap)
{
    // split with normalization off (original note: don't use punctuation
    // for determiner)
    const bool normalize = false;
    List<string> words = TermUtil.ToWordList(inWord, normalize);
    double adjustedMaxWc = GetAdjustedWc(wordWcMap.GetMaxWc());

    // sum the per-word scores
    double sum = 0.0;
    for (int i = 0; i < words.Count; i++)
    {
        sum += GetWordScore(words[i], adjustedMaxWc, wordWcMap);
    }

    // average; no words means no score
    int wordCount = words.Count;
    return (wordCount > 0) ? (sum / wordCount) : 0.0;
}
/// <summary>
/// Checks all split words of a term: each one must pass IsValidSplitWord.
/// </summary>
/// <remarks>
/// Per the original notes a split word can be a digit (pure number), a
/// unit, or a word of the split word dictionary (English + ProperNoun,
/// not Aa); which of these are accepted is decided by IsValidSplitWord.
/// </remarks>
/// <param name="inTerm"> the split term </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <returns> false as soon as one split word is invalid; true otherwise. </returns>
private static bool CheckSplitWords(string inTerm, CSpellApi cSpellApi)
{
    List<string> parts = TermUtil.ToWordList(inTerm);
    for (int i = 0; i < parts.Count; i++)
    {
        if (!IsValidSplitWord(parts[i], cSpellApi))
        {
            return false;
        }
    }
    return true;
}
// public method
/// <summary>
/// The core method to correct a word by merging, in these steps:
/// <list type="number">
/// <item>strip the ending punctuation of the target token and lowercase it;</item>
/// <item>detect whether it is a non-word (NonWordMergeDetector);</item>
/// <item>get the merge candidates (NonWordMergeCandidates);</item>
/// <item>rank the candidates and take the top one (RankNonWordMergeByMode).</item>
/// </list>
/// The detection counter of cSpellApi is updated when the target is detected.
/// </summary>
/// <param name="tarPos"> position of target token </param>
/// <param name="nonSpaceTokenList"> token list without space token(s) </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> the top ranked MergeObj returned by the ranker when the target
/// is detected; null when the target is not detected. </returns>
public static MergeObj GetCorrectTerm(int tarPos, List<TokenObj> nonSpaceTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    // target word and its coreTerm (only ending punctuation removed)
    string tarWord = nonSpaceTokenList[tarPos].GetTokenStr();
    string coreStr = TermUtil.StripEndPuncSpace(tarWord).ToLower();

    // nothing to merge when the target is not detected as a non-word
    bool detected = NonWordMergeDetector.IsDetect(tarWord, coreStr, cSpellApi, debugFlag);
    if (!detected)
    {
        return null;
    }
    cSpellApi.UpdateDetectNo();

    // candidates from merge
    HashSet<MergeObj> candidates = NonWordMergeCandidates.GetCandidates(tarPos, nonSpaceTokenList, cSpellApi);

    // ranking: frequency or context only, no orthographic score
    return RankNonWordMergeByMode.GetTopRankMergeObj(candidates, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);
}
/// <summary>
/// Gets the score of a single word or a multiword (split case) as the
/// minimum of GetWordScore over all of its words.
/// </summary>
/// <param name="inWord"> the term to score </param>
/// <param name="wordWcMap"> word-count map </param>
/// <returns> the smallest word score, or 0.0 when no word produced a score
/// below the int.MaxValue sentinel (e.g. the term has no words). </returns>
public static double GetAdjustScoreMin(string inWord, WordWcMap wordWcMap)
{
    // check multiword case for split
    bool normFlag = false; // don't use punctuation for determiner
    List<string> wordList = TermUtil.ToWordList(inWord, normFlag);
    double maxWc = GetAdjustedWc(wordWcMap.GetMaxWc());

    // use the minimum score of the words; int.MaxValue marks "no score yet"
    double minScore = int.MaxValue;
    foreach (string word in wordList)
    {
        double curScore = GetWordScore(word, maxWc, wordWcMap);
        if (curScore < minScore)
        {
            minScore = curScore;
        }
    }

    // the sentinel surviving the loop means nothing was scored
    return (minScore < int.MaxValue) ? minScore : 0.0;
}
/// <summary>
/// Compares two FrequencyScore objects. Criteria are applied in order and
/// the first one that differs decides:
/// <list type="number">
/// <item>word count (fewer words ranks first, i.e. "whatever" is
/// preferred over "what ever");</item>
/// <item>frequency score (higher ranks first);</item>
/// <item>the words themselves, compared as word2.CompareTo(word1).</item>
/// </list>
/// </summary>
/// <param name="o1"> first object to be compared </param>
/// <param name="o2"> second object to be compared </param>
/// <returns> a negative integer, 0, or positive integer to represent the
/// object o1 is less, equals, or greater than object o2. </returns>
public virtual int Compare(FrequencyScore o1, FrequencyScore o2)
{
    // 1. fewer words wins
    string wordA = o1.GetWord();
    string wordB = o2.GetWord();
    int wordCountA = TermUtil.GetWordNo(wordA);
    int wordCountB = TermUtil.GetWordNo(wordB);
    if (wordCountA != wordCountB)
    {
        return wordCountA - wordCountB;
    }

    // 2. same word count: score, from high to low
    // SCR-2: fixed values (+1/-1) are used rather than a difference so a
    // small gap can never collapse to 0
    double scoreA = o1.GetScore();
    double scoreB = o2.GetScore();
    if (scoreB > scoreA)
    {
        return 1;
    }
    if (scoreB < scoreA)
    {
        return -1;
    }

    // 3. alphabetic order of word
    return wordB.CompareTo(wordA);
}
/// <summary>
/// This method uses context scores to find the correct term for a
/// non-word, in these steps:
/// <list type="number">
/// <item>convert the word to its coreTerm;</item>
/// <item>detect whether the coreTerm is a non-word (NonWordDetector);</item>
/// <item>collect 1-to-1 candidates and, depending on the function mode,
/// split candidates;</item>
/// <item>rank the candidates with context (RankNonWordByMode);</item>
/// <item>put the top ranked string back into the coreTerm and the token;</item>
/// <item>update the counters and the token history when the word changed.</item>
/// </list>
/// </summary>
/// <param name="inTokenObj"> the input tokenObj (single word) </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <param name="tarPos"> position for target token </param>
/// <param name="nonSpaceTokenList"> token list without space token(s) </param>
/// <returns> a copy of the input token; its string is replaced by the
/// correction when the word is detected and the top ranked candidate
/// differs from the input word. </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // init
    int funcMode = cSpellApi.GetFuncMode();
    // get inWord from inTokenObj and init outTokenObj
    string inWord = inTokenObj.GetTokenStr();
    TokenObj outTokenObj = new TokenObj(inTokenObj);

    // 1. convert a word to coreTerm (no leading/ending space, punc, digit)
    int ctType = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
    CoreTermObj coreTermObj = new CoreTermObj(inWord, ctType);
    string coreStr = coreTermObj.GetCoreTerm();

    // 2. non-word detection: nothing to correct when not detected
    // TBD .. need to separate 1-to-1 and split
    if (!NonWordDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
    {
        return outTokenObj;
    }
    cSpellApi.UpdateDetectNo();

    // TBD, should take care of possessive xxx's here
    // 3.1 get 1-to-1 candidates set from correction, no split
    HashSet<string> candSet = NonWord1To1Candidates.GetCandidates(coreStr, cSpellApi);
    if (funcMode != CSpellApi.FUNC_MODE_NW_1)
    {
        // 3.2 get candidates from split
        int maxSplitNo = cSpellApi.GetCanNwMaxSplitNo();
        HashSet<string> splitSet = NonWordSplitCandidates.GetCandidates(coreStr, cSpellApi, maxSplitNo);
        if (funcMode == CSpellApi.FUNC_MODE_NW_S)
        {
            // 3.4 split-only mode: the split candidates replace the set
            candSet = new HashSet<string>(splitSet);
        }
        else
        {
            // 3.4 otherwise add the split candidates to the 1-to-1 set;
            // UnionWith is the HashSet<T> counterpart of Java's addAll
            candSet.UnionWith(splitSet);
        }
    }

    // 4. Ranking: get the top ranked candidate, using context
    string topRankStr = RankNonWordByMode.GetTopRankStr(coreStr, candSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

    // 5. update coreTerm and convert back to the full word
    coreTermObj.SetCoreTerm(topRankStr);
    string outWord = coreTermObj.ToString();

    // 6. update info only if the word actually changed
    if (!inWord.Equals(outWord))
    {
        outTokenObj.SetTokenStr(outWord);
        bool isSplit = TermUtil.IsMultiword(outWord);
        // both kinds of correction count as one correction
        cSpellApi.UpdateCorrectNo();
        if (isSplit)
        {
            // split
            outTokenObj.AddProcToHist(TokenObj.HIST_NW_S);
            DebugPrint.PrintCorrect("NW", "NonWordCorrector-Split", inWord, outWord, debugFlag);
        }
        else
        {
            // 1To1 correct
            outTokenObj.AddProcToHist(TokenObj.HIST_NW_1);
            DebugPrint.PrintCorrect("NW", "NonWordCorrector-1To1", inWord, outWord, debugFlag);
        }
    }
    return outTokenObj;
}