/// <summary>
/// Runs one hard-coded correction scenario (sentence from 13864.txt, target
/// index 6 in the non-space token list) through GetCorrectTerm and prints
/// the input token and the returned token to the console.
/// </summary>
/// <param name="cSpellApi">API object handed through to GetCorrectTerm.</param>
private static void TestSplit(CSpellApi cSpellApi) {
    // Test case from 13864.txt.
    // Alternative case from 10349.txt:
    //   "sounding in my ear every time for along time." with target index 7.
    const string sentence = "I donate my self to be apart of this study.";
    const int targetIndex = 6;
    const bool verbose = false;

    // Tokenize the sentence, then drop the space tokens so that targetIndex
    // addresses the non-space tokens only.
    TextObj parsedText = new TextObj(sentence);
    List<TokenObj> allTokens = parsedText.GetTokenList();
    List<TokenObj> wordTokens = TextObj.GetNonSpaceTokenObjList(allTokens);
    TokenObj targetToken = wordTokens[targetIndex];

    // NOTE(review): the banner says "One-To-One" although this is TestSplit;
    // it is kept as-is because it is program output.
    Console.WriteLine("====== Real-Word One-To-One Correction Test =====");
    Console.WriteLine($"-- inTextList: [{sentence}]");
    Console.WriteLine($"-- tarPos: [{targetIndex}]");
    Console.WriteLine($"-- inTokenObj: [{targetToken.ToString()}]");

    // get the correct term
    TokenObj correctedToken = GetCorrectTerm(targetToken, cSpellApi, verbose, targetIndex, wordTokens);

    // print out
    Console.WriteLine("--------- GetCorrectTermStr( ) -----------");
    Console.WriteLine($"-- outTokenObj: [{correctedToken.ToString()}]");
}
/// <summary>
/// Scores the candidates "onset" and "on set" against the context of a fixed
/// sentence and prints both scores to the console.
/// </summary>
/// <param name="w2vIm">Word2Vec model passed to Word2VecContext.GetContextVec.</param>
/// <param name="w2vOm">Word2Vec model passed to the ContextScore constructor.</param>
private static void TestOnSet(Word2Vec w2vIm, Word2Vec w2vOm) {
    const string sentence = "He was diagnosed early on set dementia 3 years ago.";

    // remove space tokens from the token list
    TextObj parsedText = new TextObj(sentence);
    List<TokenObj> allTokens = parsedText.GetTokenList();
    List<TokenObj> wordTokens = TextObj.GetNonSpaceTokenObjList(allTokens);

    Console.WriteLine("==========================================");
    Console.WriteLine($"-- inTextList: [{sentence}]");

    const int targetIndex = 4;
    const int targetSize = 2;   // "on set" has 2 tokens
    const int windowRadius = 2;
    const bool skipWord = true;
    const bool verbose = false;

    // context vector built with the window radius
    DoubleVec contextVec = Word2VecContext.GetContextVec(targetIndex, targetSize, wordTokens,
        w2vIm, windowRadius, skipWord, verbose);

    // Score every candidate first, then print, so all scoring is done before
    // any result line is written.
    string[] candidates = { "onset", "on set" };
    List<ContextScore> scores = new List<ContextScore>();
    foreach (string candidate in candidates) {
        scores.Add(new ContextScore(candidate, contextVec, w2vOm));
    }
    for (int i = 0; i < candidates.Length; i++) {
        Console.WriteLine($"- [{candidates[i]}]: {scores[i].ToString()}");
    }
}
/// <summary>
/// Runs one hard-coded correction scenario (target index 2 in the non-space
/// token list of a fixed sentence) through GetCorrectTerm and prints the
/// input token and the returned token to the console.
/// </summary>
/// <param name="cSpellApi">API object handed through to GetCorrectTerm.</param>
private static void Test1To1(CSpellApi cSpellApi) {
    // Test case from 51.txt. Other variants that were tried:
    //   "You'd thing that this is good."
    //   "The doctor thing that this is good."
    const string sentence = "you would thing that is good.";
    const int targetIndex = 2;
    const bool verbose = false;

    // Tokenize the sentence, then drop the space tokens so that targetIndex
    // addresses the non-space tokens only.
    TextObj parsedText = new TextObj(sentence);
    List<TokenObj> allTokens = parsedText.GetTokenList();
    List<TokenObj> wordTokens = TextObj.GetNonSpaceTokenObjList(allTokens);
    TokenObj targetToken = wordTokens[targetIndex];

    Console.WriteLine("====== Real-Word One-To-One Correction Test =====");
    Console.WriteLine($"-- inTextList: [{sentence}]");
    Console.WriteLine($"-- tarPos: [{targetIndex}]");
    Console.WriteLine($"-- inTokenObj: [{targetToken.ToString()}]");

    // get the correct term
    TokenObj correctedToken = GetCorrectTerm(targetToken, cSpellApi, verbose, targetIndex, wordTokens);

    // print out
    Console.WriteLine("--------- GetCorrectTermStr( ) -----------");
    Console.WriteLine($"-- outTokenObj: [{correctedToken.ToString()}]");
}
// private method
/// <summary>
/// Test merge and split: prints context scores for a merged word and for its
/// split form, each computed against a radius-limited context and against the
/// context built without a radius argument, followed by the average of the
/// per-word scores of the split form.
/// </summary>
/// <param name="inText">Sentence to tokenize.</param>
/// <param name="tarPos">Target position in the non-space token list.</param>
/// <param name="tarSize">Number of tokens covered by the target.</param>
/// <param name="radius">Window radius passed to GetContextVec.</param>
/// <param name="mergedWord">Merged candidate to score.</param>
/// <param name="splitWords">Split candidate to score.</param>
/// <param name="w2vIm">Word2Vec model used to build context vectors.</param>
/// <param name="w2vOm">Word2Vec model used by ContextScore.</param>
private static void Test(string inText, int tarPos, int tarSize, int radius, string mergedWord,
                         string splitWords, Word2Vec w2vIm, Word2Vec w2vOm) {
    const bool skipWord = true;
    const bool verbose = false;

    // 0. process the inText and remove space tokens from the list
    TextObj parsedText = new TextObj(inText);
    List<TokenObj> allTokens = parsedText.GetTokenList();
    List<TokenObj> wordTokens = TextObj.GetNonSpaceTokenObjList(allTokens);
    Console.WriteLine("==========================================");
    Console.WriteLine($"-- inTextList: [{inText}]");

    // 1.a context with window radius
    DoubleVec windowContext = Word2VecContext.GetContextVec(tarPos, tarSize, wordTokens,
        w2vIm, radius, skipWord, verbose);
    // 1.b context with all inText
    DoubleVec fullContext = Word2VecContext.GetContextVec(tarPos, tarSize, wordTokens,
        w2vIm, skipWord, verbose);

    // 1.c merged word scores
    ContextScore mergedWindow = new ContextScore(mergedWord, windowContext, w2vOm);
    ContextScore mergedFull = new ContextScore(mergedWord, fullContext, w2vOm);
    Console.WriteLine($"{mergedWindow.ToString()}|{mergedFull.GetScore(),1:F8}");

    // 2. split words scores
    ContextScore splitWindow = new ContextScore(splitWords, windowContext, w2vOm);
    ContextScore splitFull = new ContextScore(splitWords, fullContext, w2vOm);
    Console.WriteLine($"{splitWindow.ToString()}|{splitFull.GetScore(),1:F8}");

    // 3. Use avg. score on single words.
    // Each single word gets its own context, taken at its own position.
    List<string> singleWords = TermUtil.ToWordList(splitWords);
    double windowTotal = 0.0d;   // radius
    double fullTotal = 0.0d;     // all inText
    int wordCount = singleWords.Count;
    for (int offset = 0; offset < wordCount; offset++) {
        string singleWord = singleWords[offset];

        // window radius
        DoubleVec wordWindowContext = Word2VecContext.GetContextVec(tarPos + offset, 1, wordTokens,
            w2vIm, radius, skipWord, verbose);
        windowTotal += new ContextScore(singleWord, wordWindowContext, w2vOm).GetScore();

        // all text
        DoubleVec wordFullContext = Word2VecContext.GetContextVec(tarPos + offset, 1, wordTokens,
            w2vIm, skipWord, verbose);
        fullTotal += new ContextScore(singleWord, wordFullContext, w2vOm).GetScore();
    }
    double windowAvg = windowTotal / wordCount;   // window
    double fullAvg = fullTotal / wordCount;       // all text
    Console.WriteLine($"Avg. Single Word|{windowAvg,1:F8}|{fullAvg,1:F8}");
}
/// <summary>
/// Repeatedly processes a text: applies the single-argument Process to
/// <paramref name="inWord"/> once, then re-tokenizes the result, processes
/// every token and reassembles the text, for as long as the text keeps
/// changing and the pass budget is not used up.
/// </summary>
/// <param name="inWord">Text to process.</param>
/// <param name="maxProcess">Maximum number of re-tokenize-and-process passes after the first call.</param>
/// <returns>The text produced by the last pass.</returns>
public static string Process(string inWord, int maxProcess) {
    string previous = inWord;
    string current = Process(inWord);

    for (int passesLeft = maxProcess; passesLeft > 0 && !current.Equals(previous); passesLeft--) {
        previous = current;

        // Convert to a TextObj so each token can be processed on its own.
        TextObj parsedText = new TextObj(previous);
        List<TokenObj> sourceTokens = parsedText.GetTokenList();
        List<TokenObj> processedTokens = new List<TokenObj>();
        foreach (TokenObj sourceToken in sourceTokens) {
            string processedStr = Process(sourceToken.GetTokenStr());
            processedTokens.Add(new TokenObj(processedStr));
        }

        current = TextObj.TokenListToText(processedTokens);
    }

    return current;
}
// private methods
// this test is not verified
/// <summary>
/// Interactive console loop: reads lines from standard input, builds the
/// target word from <paramref name="tarPos"/> and <paramref name="tarSize"/>,
/// collects candidates for it, prints the top ranked candidate and, when
/// <paramref name="detailFlag"/> is set, the ordered candidate score list.
/// </summary>
/// <param name="detailFlag">When true, prints the suggestion list; also passed to the ranking calls.</param>
/// <param name="tarPos">Target position counted in non-space tokens.</param>
/// <param name="tarSize">Number of non-space tokens that make up the target.</param>
/// <param name="contextRadius">Context radius passed to the ranking calls.</param>
/// <param name="limitNo">Maximum number of suggestions to print in detail mode.</param>
/// <returns>0 when the input ends normally; -1 when an exception was caught.</returns>
private static int RunTest(bool detailFlag, int tarPos, int tarSize, int contextRadius, long limitNo) {
    const string prompt = "- Please input a text, only a spell error allowed (type \"Ctl-d\" to quit) > ";

    // init dic
    string configFile = "../data/Config/cSpell.properties";
    CSpellApi cSpellApi = new CSpellApi(configFile);
    cSpellApi.SetRankMode(CSpellApi.RANK_MODE_CONTEXT);
    Word2Vec word2VecIm = cSpellApi.GetWord2VecIm();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    bool word2VecSkipWord = cSpellApi.GetWord2VecSkipWord();
    ContextScoreComparator<ContextScore> csc = new ContextScoreComparator<ContextScore>();

    // Clamp before narrowing so a very large limit cannot wrap around.
    int suggestionLimit = (int)Math.Min(limitNo, (long)int.MaxValue);

    // provide cmdLine interface
    int returnValue = 0;
    try {
        // Dispose the reader when the loop ends or an exception escapes.
        using (StreamReader stdInput = new StreamReader(Console.OpenStandardInput())) {
            Console.WriteLine(prompt);
            string inText;
            while ((inText = stdInput.ReadLine()) != null) {
                // ---------------------------------
                // Get spell correction on the input
                // ---------------------------------
                // convert input text to TokenObj
                TextObj textObj = new TextObj(inText);
                List<TokenObj> inTextList = textObj.GetTokenList();

                // *2 because the token list includes the space tokens.
                // Each further token of the target sits at (tarPos + i) * 2;
                // the index must advance with i, otherwise the same token
                // would be appended for every i.
                string tarWord = inTextList[tarPos * 2].GetTokenStr();
                for (int i = 1; i < tarSize; i++) {
                    tarWord += " " + inTextList[(tarPos + i) * 2].GetTokenStr();
                }
                Console.WriteLine("- input text: [" + inText + "]");
                Console.WriteLine("- target: [" + tarPos + "|" + tarSize + "|" + tarWord + "]");
                Console.WriteLine("- context radius: " + contextRadius);

                // get all possible candidates
                HashSet<string> candSet = NonWord1To1Candidates.GetCandidates(tarWord, cSpellApi);
                candSet.Add(tarWord); // add the original word
                Console.WriteLine("-- canSet.size(): " + candSet.Count);

                // get final suggestion
                // remove space token from the list
                List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);
                string topRankStr = GetTopRankStr(tarWord, candSet, tarPos, tarSize, nonSpaceTokenList,
                    word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, detailFlag);
                Console.WriteLine("- top rank str: " + topRankStr);

                // print details
                if (detailFlag) {
                    // tarPos counts non-space tokens, so score against the same
                    // non-space list that GetTopRankStr received.
                    // NOTE(review): assumes GetCandidateScoreSet expects the
                    // non-space token list - confirm against its definition.
                    HashSet<ContextScore> candScoreSet = GetCandidateScoreSet(candSet, tarPos, tarSize,
                        nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, detailFlag);
                    Console.WriteLine("------ Suggestion List ------");
                    foreach (ContextScore candScore in candScoreSet.OrderBy(x => x, csc).Take(suggestionLimit)) {
                        Console.WriteLine(candScore.ToString());
                    }
                }

                // print the prompt
                Console.WriteLine(prompt);
            }
        }
    } catch (Exception e) {
        // Any failure ends the session; report it and signal the error.
        Console.Error.WriteLine(e.Message);
        returnValue = -1;
    }
    return returnValue;
}